/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2006 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used by the socket
 * layer to attempt to free a socket when a reference is removed.  It is a
 * socket layer private interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref(), and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_IA32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>

extern struct sysentvec ia32_freebsd_sysvec;
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */

int maxsockets;

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

static int somaxconn = SOMAXCONN;
static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
/* XXX: we don't have SYSCTL_USHORT */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
    "queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets) {
			maxsockets = newmaxsockets;
			if (maxsockets > ((maxfiles / 4) * 3)) {
				maxfiles = (maxsockets * 5) / 4;
				maxfilesperproc = (maxfiles * 9) / 10;
			}
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}

SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");

/*
 * Initialize maxsockets.
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(void)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_init_socket(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	TAILQ_INIT(&so->so_aiojobq);
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
	mtx_unlock(&so_global_mtx);
	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc();
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	so->so_count = 1;
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		KASSERT(so->so_count == 1, ("socreate: so_count %d",
		    so->so_count));
		so->so_count = 0;
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}
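
/*
 * Illustrative sketch (not compiled): how a kernel consumer might use the
 * public socreate()/soclose() life cycle described above.  The function
 * itself is hypothetical; only the socket calls are from this file.
 */
#if 0
static int
example_socket_consumer(struct thread *td)
{
	struct socket *so;
	int error;

	/* socreate() returns the socket with a single reference held. */
	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
	    td->td_ucred, td);
	if (error)
		return (error);
	/* ... sobind(), soconnect(), sosend(), soreceive() ... */
	/* soclose() releases the reference and tears down the socket. */
	return (soclose(so));
}
#endif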

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	register struct socket *so;
	int over;

	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over)
#else
	if (over)
#endif
		return (NULL);
	so = soalloc();
	if (so == NULL)
		return (NULL);
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	SOCK_LOCK(head);
	mac_create_socket_from_socket(head, so);
	SOCK_UNLOCK(head);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;
			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}
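
/*
 * Illustrative sketch (not compiled): the shape of a protocol accepting a
 * new connection via sonewconn().  TCP, for example, creates the socket
 * with connstatus 0 from its syncache and calls soisconnected() once the
 * handshake completes; the surrounding code here is hypothetical.
 */
#if 0
	struct socket *head;	/* the listening socket */
	struct socket *so;

	so = sonewconn(head, 0);
	if (so == NULL)
		goto drop;	/* listen queue limit reached */
	/* ... attach protocol state to so ... */
	soisconnected(so);	/* moves so to the completed queue */
#endif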

int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Callbacks are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
}

int
solisten_proto_check(so)
	struct socket *so;
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(so, backlog)
	struct socket *so;
	int backlog;
{

	SOCK_LOCK_ASSERT(so);

	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	so->so_options |= SO_ACCEPTCONN;
}
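
/*
 * Illustrative sketch (not compiled): the shape of a protocol's pru_listen
 * implementation using the callbacks above.  The protocol lock macros are
 * hypothetical stand-ins; the solisten_proto*() calls are from this file.
 */
#if 0
static int
example_pru_listen(struct socket *so, int backlog, struct thread *td)
{
	int error;

	EXAMPLE_PROTO_LOCK();		/* protocol lock first ... */
	SOCK_LOCK(so);			/* ... then the socket lock */
	error = solisten_proto_check(so);
	if (error == 0)
		solisten_proto(so, backlog);
	SOCK_UNLOCK(so);
	EXAMPLE_PROTO_UNLOCK();
	return (error);
}
#endif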

/*
 * Attempt to free a socket.  This should really be sotryfree().
 *
 * sofree() will succeed if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, where a process has
 *   been notified that it is present.  If it were removed, the user process
 *   may block in accept() despite select() saying the socket was ready.
 *
 * Otherwise, it will quietly abort so that a future call to sofree(), when
 * conditions are right, can succeed.
 */
void
sofree(so)
	struct socket *so;
{
	struct protosw *pr = so->so_proto;
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	if (so->so_options & SO_ACCEPTCONN) {
		KASSERT((TAILQ_EMPTY(&so->so_comp)),
		    ("sofree: so_comp populated"));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
		    ("sofree: so_incomp populated"));
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct call to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach
	 * code.
	 */
	KASSERT((so->so_snd.sb_flags & SB_LOCK) == 0, ("sofree: snd sblock"));
	KASSERT((so->so_rcv.sb_flags & SB_LOCK) == 0, ("sofree: rcv sblock"));
	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 */
void
soabort(so)
	struct socket *so;
{

	/*
	 * As much as possible, assert that no references to this socket are
	 * held.  This is not quite the same as asserting that the current
	 * thread is responsible for arranging for no references, but is as
	 * close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));

	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
		(*so->so_proto->pr_usrreqs->pru_abort)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sofree(so);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	}

	return (error);
}

int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}
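
/*
 * Illustrative sketch (not compiled): a kernel consumer driving a blocking
 * connect with soconnect().  The wait loop mirrors what consumers such as
 * the NFS client do; the surrounding function is hypothetical.
 */
#if 0
static int
example_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	error = soconnect(so, nam, td);
	if (error)
		return (error);
	/* Wait for the protocol to complete or fail the connection. */
	SOCK_LOCK(so);
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0)
		(void) msleep(&so->so_timeo, SOCK_MTX(so), PSOCK,
		    "excon", 0);
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
	return (error);
}
#endif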

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

/*
 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 * sosend_dgram() and sosend_generic() use m_uiotombuf().
 *
 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 * all of the data referenced by the uio.  If desired, it uses zero-copy.
 * *space will be updated to reflect data copied in.
 *
 * NB: If atomic I/O is requested, the caller must already have checked that
 * space can hold resid bytes.
 *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *retmp.  The contents of both *uio and *space may be
 * modified even in the case of an error.
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;
	long len, resid;
	int error;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
#ifdef ZERO_COPY_SOCKETS
		cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
		if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
			if (top == NULL) {
				m = m_gethdr(M_WAITOK, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_get(M_WAITOK, MT_DATA);
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				m_clget(m, M_WAITOK);
				len = min(min(MCLBYTES, resid), *space);
			}
#else /* ZERO_COPY_SOCKETS */
			if (top == NULL) {
				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
			len = min(min(MCLBYTES, resid), *space);
#endif /* ZERO_COPY_SOCKETS */
		} else {
			if (top == NULL) {
				m = m_gethdr(M_TRYWAIT, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_TRYWAIT, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
#ifdef ZERO_COPY_SOCKETS
		if (cow_send)
			error = 0;
		else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	*retmp = top;
	return (error);
}
#endif /*ZERO_COPY_SOCKETS*/

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)

int
sosend_dgram(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	long space, resid;
	int clen = 0, error, dontroute;
#ifdef ZERO_COPY_SOCKETS
	int atomic = sosendallatonce(so) || top;
#endif

	KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sodgram_send: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
#ifdef ZERO_COPY_SOCKETS
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
#else
		/*
		 * Copy the data from userland into an mbuf chain.  If no
		 * data is to be copied in, a single empty mbuf is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
#endif
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag, and
	 * there is nothing left to send, then use PRU_SEND_EOF instead of
	 * PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
	    /* If there is more to send set PRUS_MORETOCOME */
	    (resid > 0 && space > 0) ?
		PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
#define	snderr(errno)	{ error = (errno); goto release; }
int
sosend_generic(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	long space, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a
			 * connection-based socket if it supports implied
			 * connect.  Return ENOTCONN if not connected and no
			 * address is supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
#ifdef ZERO_COPY_SOCKETS
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
#else
				/*
				 * Copy the data from userland into an mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					SOCKBUF_LOCK(&so->so_snd);
					error = EFAULT;	/* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
#endif
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
#undef snderr

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{

	/* XXXRW: Temporary debugging. */
	KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
	    ("sosend: protocol calls sosend"));

	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td));
}
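
/*
 * Illustrative sketch (not compiled): sending a kernel buffer through
 * sosend() by wrapping it in a uio.  The helper and its arguments are
 * hypothetical; the uio setup follows the standard kernel idiom.
 */
#if 0
static int
example_send(struct socket *so, void *data, size_t len, struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = data;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;	/* data is in kernel space */
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
#endif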

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}


/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend.  In particular, each record
 * (mbufs linked through m_next) must begin with an address if the protocol
 * so specifies, followed by an optional mbuf or mbufs containing ancillary
 * data, and then zero or more mbufs of data.  In order to allow parallelism
 * between network receive and copying to user space, as well as avoid
 * sleeping with a mutex held, we release the socket buffer mutex during the
 * user space copy.  Although the sockbuf is locked, new data may still be
 * appended, and thus we must maintain consistency of the sockbuf during that
 * time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 */
int
soreceive_generic(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning a
	 * short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		if (m != NULL)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = so->so_rcv.sb_mb;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  Sockbuf must be
		 * consistent here (points to current mbuf, it points to next
		 * record) when we drop priority; we must note any additions
		 * to the sockbuf when we block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				int disposable;

				if ((m->m_flags & M_EXT)
				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error) {
				/*
				 * The MT_SONAME mbuf has already been removed
				 * from the record, so it is necessary to
				 * remove the data mbufs, if any, to preserve
				 * the invariant in the case of PR_ADDR that
				 * requires MT_SONAME mbufs at the head of
				 * each record.
				 */
				if (m && pr->pr_flags & PR_ATOMIC &&
				    ((flags & MSG_PEEK) == 0))
					(void)sbdroprecord_locked(&so->so_rcv);
				goto release;
			}
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				sockbuf_pushsync(&so->so_rcv, nextrecord);
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT)
						copy_flag = M_DONTWAIT;
					else
						copy_flag = M_TRYWAIT;
					if (copy_flag == M_TRYWAIT)
						SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, copy_flag);
					if (copy_flag == M_TRYWAIT)
						SOCKBUF_LOCK(&so->so_rcv);
					if (*mp == NULL) {
						/*
						 * m_copym() couldn't
						 * allocate an mbuf.  Adjust
						 * uio_resid back (it was
						 * adjusted down by len
						 * bytes, which we didn't end
						 * up "copying" over).
						 */
						uio->uio_resid += len;
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
		 * must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return with a
		 * short count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
1815 			 */
1816 			if (pr->pr_flags & PR_WANTRCVD) {
1817 				SOCKBUF_UNLOCK(&so->so_rcv);
1818 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1819 				SOCKBUF_LOCK(&so->so_rcv);
1820 			}
1821 			SBLASTRECORDCHK(&so->so_rcv);
1822 			SBLASTMBUFCHK(&so->so_rcv);
1823 			error = sbwait(&so->so_rcv);
1824 			if (error)
1825 				goto release;
1826 			m = so->so_rcv.sb_mb;
1827 			if (m != NULL)
1828 				nextrecord = m->m_nextpkt;
1829 		}
1830 	}
1831 
1832 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1833 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1834 		flags |= MSG_TRUNC;
1835 		if ((flags & MSG_PEEK) == 0)
1836 			(void) sbdroprecord_locked(&so->so_rcv);
1837 	}
1838 	if ((flags & MSG_PEEK) == 0) {
1839 		if (m == NULL) {
1840 			/*
1841 			 * First part is an inline SB_EMPTY_FIXUP().  Second
1842 			 * part makes sure sb_lastrecord is up-to-date if
1843 			 * there is still data in the socket buffer.
1844 			 */
1845 			so->so_rcv.sb_mb = nextrecord;
1846 			if (so->so_rcv.sb_mb == NULL) {
1847 				so->so_rcv.sb_mbtail = NULL;
1848 				so->so_rcv.sb_lastrecord = NULL;
1849 			} else if (nextrecord->m_nextpkt == NULL)
1850 				so->so_rcv.sb_lastrecord = nextrecord;
1851 		}
1852 		SBLASTRECORDCHK(&so->so_rcv);
1853 		SBLASTMBUFCHK(&so->so_rcv);
1854 		/*
1855 		 * If soreceive() is being done from the socket callback, we
1856 		 * don't need to generate an ACK to the peer to update the
1857 		 * window, since the ACK will be generated on return to TCP.
1858 		 */
1859 		if (!(flags & MSG_SOCALLBCK) &&
1860 		    (pr->pr_flags & PR_WANTRCVD)) {
1861 			SOCKBUF_UNLOCK(&so->so_rcv);
1862 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
1863 			SOCKBUF_LOCK(&so->so_rcv);
1864 		}
1865 	}
1866 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1867 	if (orig_resid == uio->uio_resid && orig_resid &&
1868 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1869 		sbunlock(&so->so_rcv);
1870 		goto restart;
1871 	}
1872 
1873 	if (flagsp != NULL)
1874 		*flagsp |= flags;
1875 release:
1876 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1877 	sbunlock(&so->so_rcv);
1878 out:
1879 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1880 	SOCKBUF_UNLOCK(&so->so_rcv);
1881 	return (error);
1882 }
1883 
1884 int
1885 soreceive(so, psa, uio, mp0, controlp, flagsp)
1886 	struct socket *so;
1887 	struct sockaddr **psa;
1888 	struct uio *uio;
1889 	struct mbuf **mp0;
1890 	struct mbuf **controlp;
1891 	int *flagsp;
1892 {
1893 
1894 	/* XXXRW: Temporary debugging. */
1895 	KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
1896 	    ("soreceive: protocol calls soreceive"));
1897 
1898 	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
1899 	    controlp, flagsp));
1900 }
1901 
1902 int
1903 soshutdown(so, how)
1904 	struct socket *so;
1905 	int how;
1906 {
1907 	struct protosw *pr = so->so_proto;
1908 
1909 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1910 		return (EINVAL);
1911 
1912 	if (how != SHUT_WR)
1913 		sorflush(so);
1914 	if (how != SHUT_RD)
1915 		return ((*pr->pr_usrreqs->pru_shutdown)(so));
1916 	return (0);
1917 }
1918 
1919 void
1920 sorflush(so)
1921 	struct socket *so;
1922 {
1923 	struct sockbuf *sb = &so->so_rcv;
1924 	struct protosw *pr = so->so_proto;
1925 	struct sockbuf asb;
1926 
1927 	/*
1928 	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
1929 	 * the socket buffer, then zeroed the original to clear the buffer
1930 	 * fields.  However, with mutexes in the socket buffer, this causes
1931 	 * problems.  We only clear the zeroable bits of the original;
1932 	 * however, we have to initialize and destroy the mutex in the copy
1933 	 * so that dom_dispose() and sbrelease() can lock it as needed.
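	 *
	 * The net effect is that pending receive data is detached from the
	 * socket while the lock is held and disposed of afterward, so that
	 * dom_dispose() can release any rights (e.g. file descriptors in
	 * flight on a unix domain socket) without the sockbuf lock held.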
1934 	 */
1935 	SOCKBUF_LOCK(sb);
1936 	sb->sb_flags |= SB_NOINTR;
1937 	(void) sblock(sb, M_WAITOK);
1938 	/*
1939 	 * socantrcvmore_locked() drops the socket buffer mutex so that it
1940 	 * can safely perform wakeups.  Re-acquire the mutex before
1941 	 * continuing.
1942 	 */
1943 	socantrcvmore_locked(so);
1944 	SOCKBUF_LOCK(sb);
1945 	sbunlock(sb);
1946 	/*
1947 	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
1948 	 * and mutex data unchanged.
1949 	 */
1950 	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
1951 	bcopy(&sb->sb_startzero, &asb.sb_startzero,
1952 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1953 	bzero(&sb->sb_startzero,
1954 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1955 	SOCKBUF_UNLOCK(sb);
1956 
1957 	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
1958 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
1959 		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
1960 	sbrelease(&asb, so);
1961 	SOCKBUF_LOCK_DESTROY(&asb);
1962 }
1963 
1964 /*
1965  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
1966  * additional variant to handle the case where the option value needs to be
1967  * some kind of integer, but not a specific size.  In addition to their use
1968  * here, these functions are also called by the protocol-level pr_ctloutput()
1969  * routines.
1970  */
1971 int
1972 sooptcopyin(sopt, buf, len, minlen)
1973 	struct sockopt *sopt;
1974 	void *buf;
1975 	size_t len;
1976 	size_t minlen;
1977 {
1978 	size_t valsize;
1979 
1980 	/*
1981 	 * If the user gives us more than we wanted, we ignore it, but if we
1982 	 * don't get the minimum length the caller wants, we return EINVAL.
1983 	 * On success, sopt->sopt_valsize is set to however much we actually
1984 	 * retrieved.
1985 	 */
1986 	if ((valsize = sopt->sopt_valsize) < minlen)
1987 		return (EINVAL);
1988 	if (valsize > len)
1989 		sopt->sopt_valsize = valsize = len;
1990 
1991 	if (sopt->sopt_td != NULL)
1992 		return (copyin(sopt->sopt_val, buf, valsize));
1993 
1994 	bcopy(sopt->sopt_val, buf, valsize);
1995 	return (0);
1996 }
1997 
1998 /*
1999  * Kernel version of setsockopt(2).
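 *
 * An illustrative in-kernel call (a sketch, not part of the original file;
 * 'so' is assumed to be a valid, attached socket):
 *
 *	int space = 65536;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &space,
 *	    sizeof(space));
 *
 * Because so_setsockopt() sets sopt_td to NULL, sooptcopyin() uses bcopy()
 * rather than copyin(), as appropriate for a kernel caller.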
2000 * 2001 * XXX: optlen is size_t, not socklen_t 2002 */ 2003 int 2004 so_setsockopt(struct socket *so, int level, int optname, void *optval, 2005 size_t optlen) 2006 { 2007 struct sockopt sopt; 2008 2009 sopt.sopt_level = level; 2010 sopt.sopt_name = optname; 2011 sopt.sopt_dir = SOPT_SET; 2012 sopt.sopt_val = optval; 2013 sopt.sopt_valsize = optlen; 2014 sopt.sopt_td = NULL; 2015 return (sosetopt(so, &sopt)); 2016 } 2017 2018 int 2019 sosetopt(so, sopt) 2020 struct socket *so; 2021 struct sockopt *sopt; 2022 { 2023 int error, optval; 2024 struct linger l; 2025 struct timeval tv; 2026 u_long val; 2027 #ifdef MAC 2028 struct mac extmac; 2029 #endif 2030 2031 error = 0; 2032 if (sopt->sopt_level != SOL_SOCKET) { 2033 if (so->so_proto && so->so_proto->pr_ctloutput) 2034 return ((*so->so_proto->pr_ctloutput) 2035 (so, sopt)); 2036 error = ENOPROTOOPT; 2037 } else { 2038 switch (sopt->sopt_name) { 2039 #ifdef INET 2040 case SO_ACCEPTFILTER: 2041 error = do_setopt_accept_filter(so, sopt); 2042 if (error) 2043 goto bad; 2044 break; 2045 #endif 2046 case SO_LINGER: 2047 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 2048 if (error) 2049 goto bad; 2050 2051 SOCK_LOCK(so); 2052 so->so_linger = l.l_linger; 2053 if (l.l_onoff) 2054 so->so_options |= SO_LINGER; 2055 else 2056 so->so_options &= ~SO_LINGER; 2057 SOCK_UNLOCK(so); 2058 break; 2059 2060 case SO_DEBUG: 2061 case SO_KEEPALIVE: 2062 case SO_DONTROUTE: 2063 case SO_USELOOPBACK: 2064 case SO_BROADCAST: 2065 case SO_REUSEADDR: 2066 case SO_REUSEPORT: 2067 case SO_OOBINLINE: 2068 case SO_TIMESTAMP: 2069 case SO_BINTIME: 2070 case SO_NOSIGPIPE: 2071 error = sooptcopyin(sopt, &optval, sizeof optval, 2072 sizeof optval); 2073 if (error) 2074 goto bad; 2075 SOCK_LOCK(so); 2076 if (optval) 2077 so->so_options |= sopt->sopt_name; 2078 else 2079 so->so_options &= ~sopt->sopt_name; 2080 SOCK_UNLOCK(so); 2081 break; 2082 2083 case SO_SNDBUF: 2084 case SO_RCVBUF: 2085 case SO_SNDLOWAT: 2086 case SO_RCVLOWAT: 2087 error = sooptcopyin(sopt, &optval, sizeof optval, 2088 sizeof optval); 2089 if (error) 2090 goto bad; 2091 2092 /* 2093 * Values < 1 make no sense for any of these options, 2094 * so disallow them. 2095 */ 2096 if (optval < 1) { 2097 error = EINVAL; 2098 goto bad; 2099 } 2100 2101 switch (sopt->sopt_name) { 2102 case SO_SNDBUF: 2103 case SO_RCVBUF: 2104 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 2105 &so->so_snd : &so->so_rcv, (u_long)optval, 2106 so, curthread) == 0) { 2107 error = ENOBUFS; 2108 goto bad; 2109 } 2110 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : 2111 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE; 2112 break; 2113 2114 /* 2115 * Make sure the low-water is never greater than the 2116 * high-water. 2117 */ 2118 case SO_SNDLOWAT: 2119 SOCKBUF_LOCK(&so->so_snd); 2120 so->so_snd.sb_lowat = 2121 (optval > so->so_snd.sb_hiwat) ? 2122 so->so_snd.sb_hiwat : optval; 2123 SOCKBUF_UNLOCK(&so->so_snd); 2124 break; 2125 case SO_RCVLOWAT: 2126 SOCKBUF_LOCK(&so->so_rcv); 2127 so->so_rcv.sb_lowat = 2128 (optval > so->so_rcv.sb_hiwat) ? 
2129 				    so->so_rcv.sb_hiwat : optval;
2130 				SOCKBUF_UNLOCK(&so->so_rcv);
2131 				break;
2132 			}
2133 			break;
2134 
2135 		case SO_SNDTIMEO:
2136 		case SO_RCVTIMEO:
2137 #ifdef COMPAT_IA32
2138 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2139 				struct timeval32 tv32;
2140 
2141 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
2142 				    sizeof tv32);
2143 				CP(tv32, tv, tv_sec);
2144 				CP(tv32, tv, tv_usec);
2145 			} else
2146 #endif
2147 				error = sooptcopyin(sopt, &tv, sizeof tv,
2148 				    sizeof tv);
2149 			if (error)
2150 				goto bad;
2151 
2152 			/* assert(hz > 0); */
2153 			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2154 			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2155 				error = EDOM;
2156 				goto bad;
2157 			}
2158 			/* assert(tick > 0); */
2159 			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
2160 			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2161 			if (val > INT_MAX) {
2162 				error = EDOM;
2163 				goto bad;
2164 			}
2165 			if (val == 0 && tv.tv_usec != 0)
2166 				val = 1;
2167 
2168 			switch (sopt->sopt_name) {
2169 			case SO_SNDTIMEO:
2170 				so->so_snd.sb_timeo = val;
2171 				break;
2172 			case SO_RCVTIMEO:
2173 				so->so_rcv.sb_timeo = val;
2174 				break;
2175 			}
2176 			break;
2177 
2178 		case SO_LABEL:
2179 #ifdef MAC
2180 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
2181 			    sizeof extmac);
2182 			if (error)
2183 				goto bad;
2184 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2185 			    so, &extmac);
2186 #else
2187 			error = EOPNOTSUPP;
2188 #endif
2189 			break;
2190 
2191 		default:
2192 			error = ENOPROTOOPT;
2193 			break;
2194 		}
2195 		if (error == 0 && so->so_proto != NULL &&
2196 		    so->so_proto->pr_ctloutput != NULL) {
2197 			(void) ((*so->so_proto->pr_ctloutput)
2198 			    (so, sopt));
2199 		}
2200 	}
2201 bad:
2202 	return (error);
2203 }
2204 
2205 /*
2206  * Helper routine for getsockopt.
2207  */
2208 int
2209 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2210 {
2211 	int error;
2212 	size_t valsize;
2213 
2214 	error = 0;
2215 
2216 	/*
2217 	 * Documented get behavior is that we always return a value, possibly
2218 	 * truncated to fit in the user's buffer.  Traditional behavior is
2219 	 * that we always tell the user precisely how much we copied, rather
2220 	 * than something useful like the total amount we had available for
2221 	 * her.  Note that this interface is not idempotent; the entire
2222 	 * answer must be generated ahead of time.
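	 *
	 * A typical pr_ctloutput() consumer on the getsockopt side does
	 * something like this (a sketch; 'optval' stands in for whatever
	 * integer the protocol wants to report):
	 *
	 *	int optval;
	 *
	 *	optval = <value to report>;
	 *	error = sooptcopyout(sopt, &optval, sizeof(optval));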
2223 */ 2224 valsize = min(len, sopt->sopt_valsize); 2225 sopt->sopt_valsize = valsize; 2226 if (sopt->sopt_val != NULL) { 2227 if (sopt->sopt_td != NULL) 2228 error = copyout(buf, sopt->sopt_val, valsize); 2229 else 2230 bcopy(buf, sopt->sopt_val, valsize); 2231 } 2232 return (error); 2233 } 2234 2235 int 2236 sogetopt(so, sopt) 2237 struct socket *so; 2238 struct sockopt *sopt; 2239 { 2240 int error, optval; 2241 struct linger l; 2242 struct timeval tv; 2243 #ifdef MAC 2244 struct mac extmac; 2245 #endif 2246 2247 error = 0; 2248 if (sopt->sopt_level != SOL_SOCKET) { 2249 if (so->so_proto && so->so_proto->pr_ctloutput) { 2250 return ((*so->so_proto->pr_ctloutput) 2251 (so, sopt)); 2252 } else 2253 return (ENOPROTOOPT); 2254 } else { 2255 switch (sopt->sopt_name) { 2256 #ifdef INET 2257 case SO_ACCEPTFILTER: 2258 error = do_getopt_accept_filter(so, sopt); 2259 break; 2260 #endif 2261 case SO_LINGER: 2262 SOCK_LOCK(so); 2263 l.l_onoff = so->so_options & SO_LINGER; 2264 l.l_linger = so->so_linger; 2265 SOCK_UNLOCK(so); 2266 error = sooptcopyout(sopt, &l, sizeof l); 2267 break; 2268 2269 case SO_USELOOPBACK: 2270 case SO_DONTROUTE: 2271 case SO_DEBUG: 2272 case SO_KEEPALIVE: 2273 case SO_REUSEADDR: 2274 case SO_REUSEPORT: 2275 case SO_BROADCAST: 2276 case SO_OOBINLINE: 2277 case SO_ACCEPTCONN: 2278 case SO_TIMESTAMP: 2279 case SO_BINTIME: 2280 case SO_NOSIGPIPE: 2281 optval = so->so_options & sopt->sopt_name; 2282 integer: 2283 error = sooptcopyout(sopt, &optval, sizeof optval); 2284 break; 2285 2286 case SO_TYPE: 2287 optval = so->so_type; 2288 goto integer; 2289 2290 case SO_ERROR: 2291 SOCK_LOCK(so); 2292 optval = so->so_error; 2293 so->so_error = 0; 2294 SOCK_UNLOCK(so); 2295 goto integer; 2296 2297 case SO_SNDBUF: 2298 optval = so->so_snd.sb_hiwat; 2299 goto integer; 2300 2301 case SO_RCVBUF: 2302 optval = so->so_rcv.sb_hiwat; 2303 goto integer; 2304 2305 case SO_SNDLOWAT: 2306 optval = so->so_snd.sb_lowat; 2307 goto integer; 2308 2309 case SO_RCVLOWAT: 2310 optval = so->so_rcv.sb_lowat; 2311 goto integer; 2312 2313 case SO_SNDTIMEO: 2314 case SO_RCVTIMEO: 2315 optval = (sopt->sopt_name == SO_SNDTIMEO ? 
2316 			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2317 
2318 			tv.tv_sec = optval / hz;
2319 			tv.tv_usec = (optval % hz) * tick;
2320 #ifdef COMPAT_IA32
2321 			if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) {
2322 				struct timeval32 tv32;
2323 
2324 				CP(tv, tv32, tv_sec);
2325 				CP(tv, tv32, tv_usec);
2326 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
2327 			} else
2328 #endif
2329 				error = sooptcopyout(sopt, &tv, sizeof tv);
2330 			break;
2331 
2332 		case SO_LABEL:
2333 #ifdef MAC
2334 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2335 			    sizeof(extmac));
2336 			if (error)
2337 				return (error);
2338 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
2339 			    so, &extmac);
2340 			if (error)
2341 				return (error);
2342 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2343 #else
2344 			error = EOPNOTSUPP;
2345 #endif
2346 			break;
2347 
2348 		case SO_PEERLABEL:
2349 #ifdef MAC
2350 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
2351 			    sizeof(extmac));
2352 			if (error)
2353 				return (error);
2354 			error = mac_getsockopt_peerlabel(
2355 			    sopt->sopt_td->td_ucred, so, &extmac);
2356 			if (error)
2357 				return (error);
2358 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
2359 #else
2360 			error = EOPNOTSUPP;
2361 #endif
2362 			break;
2363 
2364 		case SO_LISTENQLIMIT:
2365 			optval = so->so_qlimit;
2366 			goto integer;
2367 
2368 		case SO_LISTENQLEN:
2369 			optval = so->so_qlen;
2370 			goto integer;
2371 
2372 		case SO_LISTENINCQLEN:
2373 			optval = so->so_incqlen;
2374 			goto integer;
2375 
2376 		default:
2377 			error = ENOPROTOOPT;
2378 			break;
2379 		}
2380 		return (error);
2381 	}
2382 }
2383 
2384 /* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
2385 int
2386 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
2387 {
2388 	struct mbuf *m, *m_prev;
2389 	int sopt_size = sopt->sopt_valsize;
2390 
2391 	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2392 	if (m == NULL)
2393 		return (ENOBUFS);
2394 	if (sopt_size > MLEN) {
2395 		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
2396 		if ((m->m_flags & M_EXT) == 0) {
2397 			m_free(m);
2398 			return (ENOBUFS);
2399 		}
2400 		m->m_len = min(MCLBYTES, sopt_size);
2401 	} else {
2402 		m->m_len = min(MLEN, sopt_size);
2403 	}
2404 	sopt_size -= m->m_len;
2405 	*mp = m;
2406 	m_prev = m;
2407 
2408 	while (sopt_size) {
2409 		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
2410 		if (m == NULL) {
2411 			m_freem(*mp);
2412 			return (ENOBUFS);
2413 		}
2414 		if (sopt_size > MLEN) {
2415 			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
2416 			    M_DONTWAIT);
2417 			if ((m->m_flags & M_EXT) == 0) {
2418 				m_freem(m);
2419 				m_freem(*mp);
2420 				return (ENOBUFS);
2421 			}
2422 			m->m_len = min(MCLBYTES, sopt_size);
2423 		} else {
2424 			m->m_len = min(MLEN, sopt_size);
2425 		}
2426 		sopt_size -= m->m_len;
2427 		m_prev->m_next = m;
2428 		m_prev = m;
2429 	}
2430 	return (0);
2431 }
2432 
2433 /* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines.
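 * The usual pairing in such compatibility code is (a sketch, with error
 * handling elided):
 *
 *	struct mbuf *m;
 *
 *	error = soopt_getm(sopt, &m);		(allocate the chain)
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	(fill it from sopt_val)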
 */
2434 int
2435 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2436 {
2437 	struct mbuf *m0 = m;
2438 
2439 	if (sopt->sopt_val == NULL)
2440 		return (0);
2441 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2442 		if (sopt->sopt_td != NULL) {
2443 			int error;
2444 
2445 			error = copyin(sopt->sopt_val, mtod(m, char *),
2446 			    m->m_len);
2447 			if (error != 0) {
2448 				m_freem(m0);
2449 				return (error);
2450 			}
2451 		} else
2452 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2453 		sopt->sopt_valsize -= m->m_len;
2454 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2455 		m = m->m_next;
2456 	}
2457 	if (m != NULL) /* should have been allocated with enough space at ip6_sooptmcopyin() */
2458 		panic("ip6_sooptmcopyin");
2459 	return (0);
2460 }
2461 
2462 /* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2463 int
2464 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2465 {
2466 	struct mbuf *m0 = m;
2467 	size_t valsize = 0;
2468 
2469 	if (sopt->sopt_val == NULL)
2470 		return (0);
2471 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2472 		if (sopt->sopt_td != NULL) {
2473 			int error;
2474 
2475 			error = copyout(mtod(m, char *), sopt->sopt_val,
2476 			    m->m_len);
2477 			if (error != 0) {
2478 				m_freem(m0);
2479 				return (error);
2480 			}
2481 		} else
2482 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2483 		sopt->sopt_valsize -= m->m_len;
2484 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2485 		valsize += m->m_len;
2486 		m = m->m_next;
2487 	}
2488 	if (m != NULL) {
2489 		/* the user should have supplied a large enough buffer */
2490 		m_freem(m0);
2491 		return (EINVAL);
2492 	}
2493 	sopt->sopt_valsize = valsize;
2494 	return (0);
2495 }
2496 
2497 /*
2498 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2499 * out-of-band data, which will then notify socket consumers.
2500 */
2501 void
2502 sohasoutofband(so)
2503 	struct socket *so;
2504 {
2505 	if (so->so_sigio != NULL)
2506 		pgsigio(&so->so_sigio, SIGURG, 0);
2507 	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
2508 }
2509 
2510 int
2511 sopoll(struct socket *so, int events, struct ucred *active_cred,
2512 	struct thread *td)
2513 {
2514 
2515 	/* XXXRW: Temporary debugging.
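	 * The KASSERT below catches a protocol whose pr_usrreqs table
	 * points pru_sopoll back at sopoll() itself, which would recurse
	 * indefinitely; soreceive() above performs the same sanity check.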
*/ 2516 KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll, 2517 ("sopoll: protocol calls sopoll")); 2518 2519 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, 2520 td)); 2521 } 2522 2523 int 2524 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 2525 struct thread *td) 2526 { 2527 int revents = 0; 2528 2529 SOCKBUF_LOCK(&so->so_snd); 2530 SOCKBUF_LOCK(&so->so_rcv); 2531 if (events & (POLLIN | POLLRDNORM)) 2532 if (soreadable(so)) 2533 revents |= events & (POLLIN | POLLRDNORM); 2534 2535 if (events & POLLINIGNEOF) 2536 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || 2537 !TAILQ_EMPTY(&so->so_comp) || so->so_error) 2538 revents |= POLLINIGNEOF; 2539 2540 if (events & (POLLOUT | POLLWRNORM)) 2541 if (sowriteable(so)) 2542 revents |= events & (POLLOUT | POLLWRNORM); 2543 2544 if (events & (POLLPRI | POLLRDBAND)) 2545 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) 2546 revents |= events & (POLLPRI | POLLRDBAND); 2547 2548 if (revents == 0) { 2549 if (events & 2550 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | 2551 POLLRDBAND)) { 2552 selrecord(td, &so->so_rcv.sb_sel); 2553 so->so_rcv.sb_flags |= SB_SEL; 2554 } 2555 2556 if (events & (POLLOUT | POLLWRNORM)) { 2557 selrecord(td, &so->so_snd.sb_sel); 2558 so->so_snd.sb_flags |= SB_SEL; 2559 } 2560 } 2561 2562 SOCKBUF_UNLOCK(&so->so_rcv); 2563 SOCKBUF_UNLOCK(&so->so_snd); 2564 return (revents); 2565 } 2566 2567 int 2568 soo_kqfilter(struct file *fp, struct knote *kn) 2569 { 2570 struct socket *so = kn->kn_fp->f_data; 2571 struct sockbuf *sb; 2572 2573 switch (kn->kn_filter) { 2574 case EVFILT_READ: 2575 if (so->so_options & SO_ACCEPTCONN) 2576 kn->kn_fop = &solisten_filtops; 2577 else 2578 kn->kn_fop = &soread_filtops; 2579 sb = &so->so_rcv; 2580 break; 2581 case EVFILT_WRITE: 2582 kn->kn_fop = &sowrite_filtops; 2583 sb = &so->so_snd; 2584 break; 2585 default: 2586 return (EINVAL); 2587 } 2588 2589 SOCKBUF_LOCK(sb); 2590 knlist_add(&sb->sb_sel.si_note, kn, 1); 2591 sb->sb_flags |= SB_KNOTE; 2592 SOCKBUF_UNLOCK(sb); 2593 return (0); 2594 } 2595 2596 static void 2597 filt_sordetach(struct knote *kn) 2598 { 2599 struct socket *so = kn->kn_fp->f_data; 2600 2601 SOCKBUF_LOCK(&so->so_rcv); 2602 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); 2603 if (knlist_empty(&so->so_rcv.sb_sel.si_note)) 2604 so->so_rcv.sb_flags &= ~SB_KNOTE; 2605 SOCKBUF_UNLOCK(&so->so_rcv); 2606 } 2607 2608 /*ARGSUSED*/ 2609 static int 2610 filt_soread(struct knote *kn, long hint) 2611 { 2612 struct socket *so; 2613 2614 so = kn->kn_fp->f_data; 2615 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2616 2617 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 2618 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2619 kn->kn_flags |= EV_EOF; 2620 kn->kn_fflags = so->so_error; 2621 return (1); 2622 } else if (so->so_error) /* temporary udp error */ 2623 return (1); 2624 else if (kn->kn_sfflags & NOTE_LOWAT) 2625 return (kn->kn_data >= kn->kn_sdata); 2626 else 2627 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); 2628 } 2629 2630 static void 2631 filt_sowdetach(struct knote *kn) 2632 { 2633 struct socket *so = kn->kn_fp->f_data; 2634 2635 SOCKBUF_LOCK(&so->so_snd); 2636 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); 2637 if (knlist_empty(&so->so_snd.sb_sel.si_note)) 2638 so->so_snd.sb_flags &= ~SB_KNOTE; 2639 SOCKBUF_UNLOCK(&so->so_snd); 2640 } 2641 2642 /*ARGSUSED*/ 2643 static int 2644 filt_sowrite(struct knote *kn, long hint) 2645 { 2646 struct socket *so; 2647 2648 so = kn->kn_fp->f_data; 2649 SOCKBUF_LOCK_ASSERT(&so->so_snd); 2650 
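	/* The event data is the writable space left in the send buffer. */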
	kn->kn_data = sbspace(&so->so_snd);
2651 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2652 		kn->kn_flags |= EV_EOF;
2653 		kn->kn_fflags = so->so_error;
2654 		return (1);
2655 	} else if (so->so_error)	/* temporary udp error */
2656 		return (1);
2657 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2658 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
2659 		return (0);
2660 	else if (kn->kn_sfflags & NOTE_LOWAT)
2661 		return (kn->kn_data >= kn->kn_sdata);
2662 	else
2663 		return (kn->kn_data >= so->so_snd.sb_lowat);
2664 }
2665 
2666 /*ARGSUSED*/
2667 static int
2668 filt_solisten(struct knote *kn, long hint)
2669 {
2670 	struct socket *so = kn->kn_fp->f_data;
2671 
2672 	kn->kn_data = so->so_qlen;
2673 	return (!TAILQ_EMPTY(&so->so_comp));
2674 }
2675 
2676 int
2677 socheckuid(struct socket *so, uid_t uid)
2678 {
2679 
2680 	if (so == NULL)
2681 		return (EPERM);
2682 	if (so->so_cred->cr_uid != uid)
2683 		return (EPERM);
2684 	return (0);
2685 }
2686 
2687 static int
2688 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
2689 {
2690 	int error;
2691 	int val;
2692 
2693 	val = somaxconn;
2694 	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
2695 	if (error || !req->newptr)
2696 		return (error);
2697 
2698 	if (val < 1 || val > USHRT_MAX)
2699 		return (EINVAL);
2700 
2701 	somaxconn = val;
2702 	return (0);
2703 }
2704
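/*
 * Illustrative use of the handler above from userland (a sketch; this
 * assumes the usual kern.ipc.somaxconn OID registration, which lives
 * elsewhere in this file):
 *
 *	$ sysctl kern.ipc.somaxconn=1024
 *
 * Values outside the range [1, USHRT_MAX] are rejected with EINVAL.
 */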