/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2006 Robert N. M. Watson
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used by the socket
 * layer to attempt to free a socket when a reference is removed.  This is a
 * socket layer private interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref(), and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * XXXRW: The behavior of sockets after soclose() but before the last
 * sorele() is poorly defined.  We can probably entirely eliminate them with
 * a little work, since consumers are managing references anyway.
 */
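/*
 * Illustrative sketch (not part of the implementation): a kernel consumer
 * exercising the public life cycle interfaces above might look like the
 * following, where "td" is the current thread.  Error handling beyond the
 * create/close pair is elided.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	// ... sobind()/soconnect()/sosend()/soreceive() as needed ...
 *	error = soclose(so);	// releases the single socreate() reference
 */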
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>

#ifdef COMPAT_IA32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>

extern struct sysentvec ia32_freebsd_sysvec;
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
/* XXX: we don't have SYSCTL_USHORT */
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), somaxconn_sysctl, "I",
    "Maximum pending socket connection queue size");
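/*
 * Usage note (sketch, standard sysctl(8) syntax assumed): the limit above is
 * exported as kern.ipc.somaxconn and may be read or tuned from userland:
 *
 *	sysctl kern.ipc.somaxconn
 *	sysctl kern.ipc.somaxconn=1024
 */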
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects the global so_gencnt, numopensockets, and the
 * per-socket so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate the socket and PCB at the same time,
 * but I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		if (mac_init_socket(so, mflags) != 0) {
			uma_zfree(socket_zone, so);
			return (NULL);
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return (so);
}
/*
 * socreate() returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
	    NULL, NULL, NULL);
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
	    NULL, NULL, NULL);
	so->so_count = 1;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		ACCEPT_LOCK();
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}
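/*
 * Example (sketch): the protocol selection above means that
 *
 *	socreate(AF_INET, &so, SOCK_DGRAM, 0, cred, td);
 *
 * resolves via pffindtype() to the default datagram protocol for the
 * domain, while an explicit non-zero proto such as IPPROTO_UDP resolves
 * via pffindproto().  Either way, the resulting pr_type must match the
 * requested type, or EPROTOTYPE is returned.
 */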
int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* Remove the accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the socket
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Callbacks are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
}

int
solisten_proto_check(so)
	struct socket *so;
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(so, backlog)
	struct socket *so;
	int backlog;
{

	SOCK_LOCK_ASSERT(so);

	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	so->so_options |= SO_ACCEPTCONN;
}
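/*
 * Sketch (hypothetical protocol, for illustration only): a pru_listen
 * implementation is expected to use the two callbacks above roughly as
 * follows, taking its own locks in whatever order it requires:
 *
 *	static int
 *	foo_pru_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		FOO_PCB_LOCK(sotofoopcb(so));	// hypothetical protocol lock
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0)
 *			solisten_proto(so, backlog);
 *		SOCK_UNLOCK(so);
 *		FOO_PCB_UNLOCK(sotofoopcb(so));
 *		return (error);
 *	}
 */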
/*
 * Attempt to free a socket.  This should really be sotryfree().
 *
 * sofree() will succeed if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * Otherwise, it will quietly abort so that a future call to sofree(), when
 * conditions are right, can succeed.
 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so we just
		 * return here if this socket is currently on the completed
		 * connection queue.  Otherwise, accept(2) may hang after
		 * select(2) has indicated that a listening socket was
		 * ready.  If it's an incomplete connection, we remove it
		 * from the queue and free it; otherwise, it won't be
		 * released until the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			SOCK_UNLOCK(so);
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket structure
 * will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	(*so->so_proto->pr_usrreqs->pru_detach)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}
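/*
 * Note on the SO_LINGER wait above: tsleep() takes a timeout in ticks, so a
 * linger time of, say, 5 seconds with hz = 1000 yields a bound of
 * 5 * 1000 = 5000 ticks on the disconnect wait (a sketch of the arithmetic,
 * not an assertion about any particular kernel's hz value).
 */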
/*
 * soabort() allows the socket code or protocol code to detach a socket that
 * has been in an incomplete or completed listen queue, but has not yet been
 * accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 *
 * XXXRW: Why do we maintain a distinction between pru_abort() and
 * pru_detach()?
 */
void
soabort(so)
	struct socket *so;
{

	/*
	 * In as much as is possible, assert that no references to this
	 * socket are held.  This is not quite the same as asserting that the
	 * current thread is responsible for arranging for no references, but
	 * is as close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT(!(so->so_state & SS_PROTOREF), ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));

	(*so->so_proto->pr_usrreqs->pru_abort)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sofree(so);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If the protocol is connection-based, we can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * the user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from a previous connection
		 * from biting us.
		 */
		so->so_error = 0;
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	}

	return (error);
}

int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}
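/*
 * Usage note (sketch): the "connect to a null address" disconnect described
 * in soconnect() above is what userland sees when it calls connect(2) on a
 * datagram socket with an address family of AF_UNSPEC, dissolving any
 * existing association before a new one is attempted.
 */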
#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats {
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0, 0, 0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /* ZERO_COPY_SOCKETS */

/*
 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 * all of the data referenced by the uio.  If desired, it uses zero-copy.
 * *space will be updated to reflect data copied in.
 *
 * NB: If atomic I/O is requested, the caller must already have checked that
 * space can hold resid bytes.
 *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *retmp.  The contents of both *uio and *space may be
 * modified even in the case of an error.
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;
	long len, resid;
	int error;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
#ifdef ZERO_COPY_SOCKETS
		cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
		if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
			if (top == NULL) {
				MGETHDR(m, M_TRYWAIT, MT_DATA);
				if (m == NULL) {
					error = ENOBUFS;
					goto out;
				}
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else {
				MGET(m, M_TRYWAIT, MT_DATA);
				if (m == NULL) {
					error = ENOBUFS;
					goto out;
				}
			}
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				MCLGET(m, M_TRYWAIT);
				if ((m->m_flags & M_EXT) == 0) {
					m_free(m);
					m = NULL;
				} else {
					len = min(min(MCLBYTES, resid),
					    *space);
				}
			}
#else /* ZERO_COPY_SOCKETS */
			if (top == NULL) {
				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
			len = min(min(MCLBYTES, resid), *space);
#endif /* ZERO_COPY_SOCKETS */
		} else {
			if (top == NULL) {
				m = m_gethdr(M_TRYWAIT, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_TRYWAIT, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
#ifdef ZERO_COPY_SOCKETS
		if (cow_send)
			error = 0;
		else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	*retmp = top;
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
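/*
 * Sketch (illustrative, kernel-internal callers vary): the uio consumed by
 * sosend_copyin() and the send routines below typically describes a kernel
 * or user buffer like so:
 *
 *	struct iovec aiov;
 *	struct uio auio;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = buflen;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = buflen;
 *	auio.uio_segflg = UIO_SYSSPACE;	// or UIO_USERSPACE
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_td = td;
 */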
int
sosend_dgram(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	long space, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN
		 * if not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	if (resid > space) {
		/* Don't leak the socket buffer lock on the error path. */
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EMSGSIZE;
		goto out;
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt
	 * or maybe we slept while doing page faults in uiomove() etc.  We
	 * could probably recheck again inside the locking protection here,
	 * but there are probably other places that this also happens.  We
	 * must rethink this.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	    /*
	     * If the user set MSG_EOF, the protocol understands this flag,
	     * and there is nothing left to send, then use PRU_SEND_EOF
	     * instead of PRU_SEND.
	     */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
	    /* If there is more to send, set PRUS_MORETOCOME. */
	    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
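/*
 * Summary of the pru_send() flag selection used above and in sosend()
 * below (a restatement for clarity, not new policy):
 *
 *	MSG_OOB set                          -> PRUS_OOB
 *	MSG_EOF set, PR_IMPLOPCL, resid == 0 -> PRUS_EOF
 *	more data queued and space remains   -> PRUS_MORETOCOME
 *	otherwise                            -> 0
 */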
/*
 * Send on a socket.  If send must go all at once and message is larger
 * than send buffering, then hard error.  Lock against other senders.  If
 * must go all at once and not enough room now, then inform user that this
 * would block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null if uio is not).
 * Data provided in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for
 * short counts if EINTR/ERESTART are returned.  Data and control buffers
 * are freed on return.
 */
#define	snderr(errno)	{ error = (errno); goto release; }
int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	long space, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a
			 * connection-based socket if it supports implied
			 * connect.  Return ENOTCONN if not connected and
			 * no address is supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag, and there is nothing
			     * left to send, then use PRU_SEND_EOF instead
			     * of PRU_SEND.
			     */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			    /* If there is more to send, set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
#undef snderr
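/*
 * Usage sketch (illustrative only, building on the uio example above): a
 * kernel caller with a prepared "auio" would send with
 *
 *	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);
 *
 * passing a destination sockaddr instead of NULL for unconnected datagram
 * sockets, and checking uio_resid for short counts on EINTR/ERESTART as
 * the comment above sosend() requires.
 */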
/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			    && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *),
			    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf
 * chain of a socket buffer, push necessary state changes back into the
 * socket buffer so that other consumers see the values consistently.
 * 'nextrecord' is the caller's locally stored value of the original value
 * of sb->sb_mb->m_nextpkt which must be restored when the lead mbuf
 * changes.  NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
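/*
 * For reference, a sketch of the sockbuf layout whose invariants
 * sockbuf_pushsync() maintains: records are chained through m_nextpkt,
 * and mbufs within a record through m_next, e.g.:
 *
 *	sb_mb --> [MT_SONAME] -> [MT_CONTROL] -> [MT_DATA] -> ...  (m_next)
 *	              |
 *	          m_nextpkt
 *	              v
 *	          [MT_SONAME] -> [MT_DATA] -> ...                  (m_next)
 *	              |
 *	          m_nextpkt == NULL  (this record is sb_lastrecord)
 */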
/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend*.  In particular, each
 * record (mbufs linked through m_next) must begin with an address if the
 * protocol so specifies, followed by an optional mbuf or mbufs containing
 * ancillary data, and then zero or more mbufs of data.  In order to avoid
 * blocking network interrupts for the entire time here, we splx() while
 * doing the actual copy to user space.  Although the sockbuf is locked,
 * new data may still be appended, and thus we must maintain consistency
 * of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only
 * for the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
	    && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), and
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive
	 * buffer, we have to do the receive in sections, and thus risk
	 * returning a short count if a timeout or signal occurs after we
	 * start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data
	 * mbufs in the first mbuf chain on the socket buffer.  If MSG_PEEK,
	 * we just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		if (so->so_rcv.sb_mb)
			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		else
			nextrecord = NULL;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of
	 * the socket buffer until the read request is satisfied.  Note that
	 * 'type' is used to store the type of any mbuf reads that have
	 * happened so far such that soreceive() can stop reading if the
	 * type changes, which causes soreceive() to return only one of
	 * regular data and inline out-of-band data in a single socket
	 * receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  Sockbuf must be
		 * consistent here (points to current mbuf, it points to
		 * next record) when we drop priority; we must note any
		 * additions to the sockbuf when we block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				int disposable;

				if ((m->m_flags & M_EXT)
				    && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
				error = uiomove(mtod(m, char *) + moff,
				    (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				sockbuf_pushsync(&so->so_rcv, nextrecord);
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT)
						copy_flag = M_DONTWAIT;
					else
						copy_flag = M_TRYWAIT;
					if (copy_flag == M_TRYWAIT)
						SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, copy_flag);
					if (copy_flag == M_TRYWAIT)
						SOCKBUF_LOCK(&so->so_rcv);
					if (*mp == NULL) {
						/*
						 * m_copym() couldn't
						 * allocate an mbuf.  Adjust
						 * uio_resid back (it was
						 * adjusted down by len
						 * bytes, which we didn't
						 * end up "copying" over).
						 */
						uio->uio_resid += len;
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return with a
		 * short count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			error = sbwait(&so->so_rcv);
			if (error)
				goto release;
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * If soreceive() is being done from the socket callback,
		 * then we don't need to generate an ACK to the peer to
		 * update the window, since the ACK will be generated on
		 * return to TCP.
		 */
		if (!(flags & MSG_SOCALLBCK) &&
		    (pr->pr_flags & PR_WANTRCVD)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}
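/*
 * Usage sketch (illustrative only): a kernel caller reading into a prepared
 * uio (see the sosend() example above, with uio_rw = UIO_READ) would do
 * something like:
 *
 *	int rcvflags = 0;	// hypothetical local, in/out flags word
 *
 *	error = soreceive(so, NULL, &auio, NULL, NULL, &rcvflags);
 *
 * passing non-NULL psa/mp0/controlp pointers only when the source address,
 * the raw mbuf chain, or ancillary data are actually wanted.
 */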
int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  Previously, this code made a copy of
	 * the socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
	 * and mutex data unchanged.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
 * additional variant to handle the case where the option value needs to be
 * some kind of integer, but not a specific size.  In addition to their use
 * here, these functions are also called by the protocol-level
 * pr_ctloutput() routines.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct sockopt *sopt;
	void *buf;
	size_t len;
	size_t minlen;
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it, but if we
	 * don't get the minimum length the caller wants, we return EINVAL.
	 * On success, sopt->sopt_valsize is set to however much we actually
	 * retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return (0);
}

/*
 * Kernel version of setsockopt(2).
 *
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;
	return (sosetopt(so, &sopt));
}
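/*
 * Usage sketch (illustrative only): a kernel consumer enabling an option on
 * a socket it owns, with the option value living in kernel space (note that
 * sopt_td is left NULL above, so bcopy() rather than copyin() is used):
 *
 *	int on = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
 */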
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)(so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these options,
			 * so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than the
			 * high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
#ifdef COMPAT_IA32
			if (curthread->td_proc->p_sysent ==
			    &ia32_freebsd_sysvec) {
				struct timeval32 tv32;

				error = sooptcopyin(sopt, &tv32, sizeof tv32,
				    sizeof tv32);
				CP(tv32, tv, tv_sec);
				CP(tv32, tv, tv_usec);
			} else
#endif
				error = sooptcopyin(sopt, &tv, sizeof tv,
				    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > INT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)(so, sopt));
		}
	}
bad:
	return (error);
}
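/*
 * Worked example for the SO_SNDTIMEO/SO_RCVTIMEO conversion above: with
 * hz = 1000 (so the kernel's tick = 1000000 / hz = 1000 microseconds per
 * tick), a request of tv = { 2, 500000 } becomes
 *
 *	val = 2 * 1000 + 500000 / 1000 = 2500 ticks,
 *
 * i.e. a 2.5 second timeout.  A sub-tick request such as { 0, 100 } is
 * rounded up to 1 tick rather than silently becoming "no timeout".
 */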
/* Helper routine for getsockopt. */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int error;
	size_t valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value, possibly
	 * truncated to fit in the user's buffer.  Traditional behavior is
	 * that we always tell the user precisely how much we copied, rather
	 * than something useful like the total amount we had available for
	 * her.  Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return (error);
}
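/*
 * Example (illustrative sketch): inside a hypothetical protocol
 * pr_ctloutput() get path, an integer result is returned through
 * sooptcopyout() above; truncation to the caller's buffer size is handled
 * inside sooptcopyout() itself.  "example_state" is a made-up name for
 * some piece of protocol state.
 */
#if 0
	int optval;

	optval = example_state;
	error = sooptcopyout(sopt, &optval, sizeof(optval));
#endif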
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)(so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_getopt_accept_filter(so, sopt);
			break;
#endif
		case SO_LINGER:
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
#ifdef COMPAT_IA32
			if (curthread->td_proc->p_sysent ==
			    &ia32_freebsd_sysvec) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32,
				    sizeof tv32);
			} else
#endif
				error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_LISTENQLIMIT:
			optval = so->so_qlimit;
			goto integer;

		case SO_LISTENQLEN:
			optval = so->so_qlen;
			goto integer;

		case SO_LISTENINCQLEN:
			optval = so->so_incqlen;
			goto integer;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return (ENOBUFS);
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
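/*
 * Example (illustrative sketch): soopt_getm() above and soopt_mcopyin()
 * below are paired roughly as the IPv6 option code uses them: allocate an
 * mbuf chain sized to sopt_valsize, then fill it from the sockopt, with
 * soopt_mcopyin() freeing the chain itself on copyin failure.
 */
#if 0
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);		/* allocate the chain */
	if (error == 0)
		error = soopt_mcopyin(sopt, m);	/* fill it */
#endif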
/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL)
		/* The chain should have been allocated large enough. */
		panic("ip6_sooptmcopyin");
	return (0);
}

/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* The caller should have supplied a large enough buffer. */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}

void
sohasoutofband(so)
	struct socket *so;
{

	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}
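/*
 * Example (illustrative sketch): a protocol that has just noted urgent
 * data would record the out-of-band mark and then call sohasoutofband()
 * above, roughly as TCP does, to raise SIGURG and wake select/poll
 * waiters.  The oobmark placement here is simplified.
 */
#if 0
	so->so_oobmark = so->so_rcv.sb_cc;	/* simplified placement */
	sohasoutofband(so);
#endif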
int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	SOCKBUF_LOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		    POLLRDBAND)) {
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	SOCKBUF_UNLOCK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_snd);
	return (revents);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	knlist_add(&sb->sb_sel.si_note, kn, 1);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}

int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid != uid)
		return (EPERM);
	return (0);
}

static int
somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
	if (error || !req->newptr)
		return (error);

	if (val < 1 || val > USHRT_MAX)
		return (EINVAL);

	somaxconn = val;
	return (0);
}
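/*
 * Example: adjusting the listen backlog limit from userland; values
 * outside [1, USHRT_MAX] are rejected with EINVAL by somaxconn_sysctl()
 * above.
 *
 *	int val = 1024;
 *
 *	if (sysctlbyname("kern.ipc.somaxconn", NULL, NULL, &val,
 *	    sizeof(val)) == -1)
 *		err(1, "sysctlbyname");
 */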