1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 2004 The FreeBSD Foundation 7 * Copyright (c) 2004-2008 Robert N. M. Watson 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * Comments on the socket life cycle: 37 * 38 * soalloc() sets of socket layer state for a socket, called only by 39 * socreate() and sonewconn(). Socket layer private. 40 * 41 * sodealloc() tears down socket layer state for a socket, called only by 42 * sofree() and sonewconn(). Socket layer private. 43 * 44 * pru_attach() associates protocol layer state with an allocated socket; 45 * called only once, may fail, aborting socket allocation. This is called 46 * from socreate() and sonewconn(). Socket layer private. 47 * 48 * pru_detach() disassociates protocol layer state from an attached socket, 49 * and will be called exactly once for sockets in which pru_attach() has 50 * been successfully called. If pru_attach() returned an error, 51 * pru_detach() will not be called. Socket layer private. 52 * 53 * pru_abort() and pru_close() notify the protocol layer that the last 54 * consumer of a socket is starting to tear down the socket, and that the 55 * protocol should terminate the connection. Historically, pru_abort() also 56 * detached protocol state from the socket state, but this is no longer the 57 * case. 58 * 59 * socreate() creates a socket and attaches protocol state. This is a public 60 * interface that may be used by socket layer consumers to create new 61 * sockets. 62 * 63 * sonewconn() creates a socket and attaches protocol state. This is a 64 * public interface that may be used by protocols to create new sockets when 65 * a new connection is received and will be available for accept() on a 66 * listen socket. 67 * 68 * soclose() destroys a socket after possibly waiting for it to disconnect. 69 * This is a public interface that socket consumers should use to close and 70 * release a socket when done with it. 
71 * 72 * soabort() destroys a socket without waiting for it to disconnect (used 73 * only for incoming connections that are already partially or fully 74 * connected). This is used internally by the socket layer when clearing 75 * listen socket queues (due to overflow or close on the listen socket), but 76 * is also a public interface protocols may use to abort connections in 77 * their incomplete listen queues should they no longer be required. Sockets 78 * placed in completed connection listen queues should not be aborted for 79 * reasons described in the comment above the soclose() implementation. This 80 * is not a general purpose close routine, and except in the specific 81 * circumstances described here, should not be used. 82 * 83 * sofree() will free a socket and its protocol state if all references on 84 * the socket have been released, and is the public interface to attempt to 85 * free a socket when a reference is removed. This is a socket layer private 86 * interface. 87 * 88 * NOTE: In addition to socreate() and soclose(), which provide a single 89 * socket reference to the consumer to be managed as required, there are two 90 * calls to explicitly manage socket references, soref(), and sorele(). 91 * Currently, these are generally required only when transitioning a socket 92 * from a listen queue to a file descriptor, in order to prevent garbage 93 * collection of the socket at an untimely moment. For a number of reasons, 94 * these interfaces are not preferred, and should be avoided. 95 * 96 * NOTE: With regard to VNETs the general rule is that callers do not set 97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(), 98 * sofree(), sorele(), sonewconn() and sorflush(), which are usually called 99 * from a pre-set VNET context. sopoll() currently does not need a VNET 100 * context to be set. 
101 */ 102 103 #include <sys/cdefs.h> 104 #include "opt_inet.h" 105 #include "opt_inet6.h" 106 #include "opt_kern_tls.h" 107 #include "opt_ktrace.h" 108 #include "opt_sctp.h" 109 110 #include <sys/param.h> 111 #include <sys/systm.h> 112 #include <sys/capsicum.h> 113 #include <sys/fcntl.h> 114 #include <sys/limits.h> 115 #include <sys/lock.h> 116 #include <sys/mac.h> 117 #include <sys/malloc.h> 118 #include <sys/mbuf.h> 119 #include <sys/mutex.h> 120 #include <sys/domain.h> 121 #include <sys/file.h> /* for struct knote */ 122 #include <sys/hhook.h> 123 #include <sys/kernel.h> 124 #include <sys/khelp.h> 125 #include <sys/kthread.h> 126 #include <sys/ktls.h> 127 #include <sys/event.h> 128 #include <sys/eventhandler.h> 129 #include <sys/poll.h> 130 #include <sys/proc.h> 131 #include <sys/protosw.h> 132 #include <sys/sbuf.h> 133 #include <sys/socket.h> 134 #include <sys/socketvar.h> 135 #include <sys/resourcevar.h> 136 #include <net/route.h> 137 #include <sys/sched.h> 138 #include <sys/signalvar.h> 139 #include <sys/smp.h> 140 #include <sys/stat.h> 141 #include <sys/sx.h> 142 #include <sys/sysctl.h> 143 #include <sys/taskqueue.h> 144 #include <sys/uio.h> 145 #include <sys/un.h> 146 #include <sys/unpcb.h> 147 #include <sys/jail.h> 148 #include <sys/syslog.h> 149 #include <netinet/in.h> 150 #include <netinet/in_pcb.h> 151 #include <netinet/tcp.h> 152 153 #include <net/vnet.h> 154 155 #include <security/mac/mac_framework.h> 156 157 #include <vm/uma.h> 158 159 #ifdef COMPAT_FREEBSD32 160 #include <sys/mount.h> 161 #include <sys/sysent.h> 162 #include <compat/freebsd32/freebsd32.h> 163 #endif 164 165 static int soreceive_generic_locked(struct socket *so, 166 struct sockaddr **psa, struct uio *uio, struct mbuf **mp, 167 struct mbuf **controlp, int *flagsp); 168 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 169 int flags); 170 static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, 171 struct sockaddr **psa, struct uio *uio, struct mbuf **mp, 172 struct mbuf **controlp, int flags); 173 static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, 174 struct uio *uio, struct mbuf *top, struct mbuf *control, 175 int flags, struct thread *td); 176 static void so_rdknl_lock(void *); 177 static void so_rdknl_unlock(void *); 178 static void so_rdknl_assert_lock(void *, int); 179 static void so_wrknl_lock(void *); 180 static void so_wrknl_unlock(void *); 181 static void so_wrknl_assert_lock(void *, int); 182 183 static void filt_sordetach(struct knote *kn); 184 static int filt_soread(struct knote *kn, long hint); 185 static void filt_sowdetach(struct knote *kn); 186 static int filt_sowrite(struct knote *kn, long hint); 187 static int filt_soempty(struct knote *kn, long hint); 188 fo_kqfilter_t soo_kqfilter; 189 190 static struct filterops soread_filtops = { 191 .f_isfd = 1, 192 .f_detach = filt_sordetach, 193 .f_event = filt_soread, 194 }; 195 static struct filterops sowrite_filtops = { 196 .f_isfd = 1, 197 .f_detach = filt_sowdetach, 198 .f_event = filt_sowrite, 199 }; 200 static struct filterops soempty_filtops = { 201 .f_isfd = 1, 202 .f_detach = filt_sowdetach, 203 .f_event = filt_soempty, 204 }; 205 206 so_gen_t so_gencnt; /* generation count for sockets */ 207 208 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 209 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 210 211 #define VNET_SO_ASSERT(so) \ 212 VNET_ASSERT(curvnet != NULL, \ 213 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 214 215 #ifdef SOCKET_HHOOK 216 
VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 217 #define V_socket_hhh VNET(socket_hhh) 218 static inline int hhook_run_socket(struct socket *, void *, int32_t); 219 #endif 220 221 #ifdef COMPAT_FREEBSD32 222 #ifdef __amd64__ 223 /* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. */ 224 #define __splice32_packed __packed 225 #else 226 #define __splice32_packed 227 #endif 228 struct splice32 { 229 int32_t sp_fd; 230 int64_t sp_max; 231 struct timeval32 sp_idle; 232 } __splice32_packed; 233 #undef __splice32_packed 234 #endif 235 236 /* 237 * Limit on the number of connections in the listen queue waiting 238 * for accept(2). 239 * NB: The original sysctl somaxconn is still available but hidden 240 * to prevent confusion about the actual purpose of this number. 241 */ 242 static u_int somaxconn = SOMAXCONN; 243 244 static int 245 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 246 { 247 int error; 248 int val; 249 250 val = somaxconn; 251 error = sysctl_handle_int(oidp, &val, 0, req); 252 if (error || !req->newptr ) 253 return (error); 254 255 /* 256 * The purpose of the UINT_MAX / 3 limit, is so that the formula 257 * 3 * so_qlimit / 2 258 * below, will not overflow. 259 */ 260 261 if (val < 1 || val > UINT_MAX / 3) 262 return (EINVAL); 263 264 somaxconn = val; 265 return (0); 266 } 267 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, 268 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), 269 sysctl_somaxconn, "I", 270 "Maximum listen socket pending connection accept queue size"); 271 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 272 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, 273 sizeof(int), sysctl_somaxconn, "I", 274 "Maximum listen socket pending connection accept queue size (compat)"); 275 276 static int numopensockets; 277 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 278 &numopensockets, 0, "Number of open sockets"); 279 280 /* 281 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 282 * so_gencnt field. 283 */ 284 static struct mtx so_global_mtx; 285 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 286 287 /* 288 * General IPC sysctl name space, used by sockets and a variety of other IPC 289 * types. 290 */ 291 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 292 "IPC"); 293 294 /* 295 * Initialize the socket subsystem and set up the socket 296 * memory allocator. 
297 */ 298 static uma_zone_t socket_zone; 299 int maxsockets; 300 301 static void 302 socket_zone_change(void *tag) 303 { 304 305 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 306 } 307 308 static int splice_init_state; 309 static struct sx splice_init_lock; 310 SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init"); 311 312 static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0, 313 "Settings relating to the SO_SPLICE socket option"); 314 315 static bool splice_receive_stream = true; 316 SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN, 317 &splice_receive_stream, 0, 318 "Use soreceive_stream() for stream splices"); 319 320 static uma_zone_t splice_zone; 321 static struct proc *splice_proc; 322 struct splice_wq { 323 struct mtx mtx; 324 STAILQ_HEAD(, so_splice) head; 325 bool running; 326 } __aligned(CACHE_LINE_SIZE); 327 static struct splice_wq *splice_wq; 328 static uint32_t splice_index = 0; 329 330 static void so_splice_timeout(void *arg, int pending); 331 static void so_splice_xfer(struct so_splice *s); 332 static int so_unsplice(struct socket *so, bool timeout); 333 334 static void 335 splice_work_thread(void *ctx) 336 { 337 struct splice_wq *wq = ctx; 338 struct so_splice *s, *s_temp; 339 STAILQ_HEAD(, so_splice) local_head; 340 int cpu; 341 342 cpu = wq - splice_wq; 343 if (bootverbose) 344 printf("starting so_splice worker thread for CPU %d\n", cpu); 345 346 for (;;) { 347 mtx_lock(&wq->mtx); 348 while (STAILQ_EMPTY(&wq->head)) { 349 wq->running = false; 350 mtx_sleep(wq, &wq->mtx, 0, "-", 0); 351 wq->running = true; 352 } 353 STAILQ_INIT(&local_head); 354 STAILQ_CONCAT(&local_head, &wq->head); 355 STAILQ_INIT(&wq->head); 356 mtx_unlock(&wq->mtx); 357 STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) { 358 mtx_lock(&s->mtx); 359 CURVNET_SET(s->src->so_vnet); 360 so_splice_xfer(s); 361 CURVNET_RESTORE(); 362 } 363 } 364 } 365 366 static void 367 so_splice_dispatch_async(struct so_splice *sp) 368 { 369 struct splice_wq *wq; 370 bool running; 371 372 wq = &splice_wq[sp->wq_index]; 373 mtx_lock(&wq->mtx); 374 STAILQ_INSERT_TAIL(&wq->head, sp, next); 375 running = wq->running; 376 mtx_unlock(&wq->mtx); 377 if (!running) 378 wakeup(wq); 379 } 380 381 void 382 so_splice_dispatch(struct so_splice *sp) 383 { 384 mtx_assert(&sp->mtx, MA_OWNED); 385 386 if (sp->state != SPLICE_IDLE) { 387 mtx_unlock(&sp->mtx); 388 } else { 389 sp->state = SPLICE_QUEUED; 390 mtx_unlock(&sp->mtx); 391 so_splice_dispatch_async(sp); 392 } 393 } 394 395 static int 396 splice_zinit(void *mem, int size __unused, int flags __unused) 397 { 398 struct so_splice *s; 399 400 s = (struct so_splice *)mem; 401 mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF); 402 return (0); 403 } 404 405 static void 406 splice_zfini(void *mem, int size) 407 { 408 struct so_splice *s; 409 410 s = (struct so_splice *)mem; 411 mtx_destroy(&s->mtx); 412 } 413 414 static int 415 splice_init(void) 416 { 417 struct thread *td; 418 int error, i, state; 419 420 state = atomic_load_acq_int(&splice_init_state); 421 if (__predict_true(state > 0)) 422 return (0); 423 if (state < 0) 424 return (ENXIO); 425 sx_xlock(&splice_init_lock); 426 if (splice_init_state != 0) { 427 sx_xunlock(&splice_init_lock); 428 return (0); 429 } 430 431 splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL, 432 NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0); 433 434 splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP, 435 M_WAITOK | M_ZERO); 436 437 /* 438 * Initialize the workqueues to run the 
splice work. We create a 439 * work queue for each CPU. 440 */ 441 CPU_FOREACH(i) { 442 STAILQ_INIT(&splice_wq[i].head); 443 mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF); 444 } 445 446 /* Start kthreads for each workqueue. */ 447 error = 0; 448 CPU_FOREACH(i) { 449 error = kproc_kthread_add(splice_work_thread, &splice_wq[i], 450 &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i); 451 if (error) { 452 printf("Can't add so_splice thread %d error %d\n", 453 i, error); 454 break; 455 } 456 457 /* 458 * It's possible to create loops with SO_SPLICE; ensure that 459 * worker threads aren't able to starve the system too easily. 460 */ 461 thread_lock(td); 462 sched_prio(td, PUSER); 463 thread_unlock(td); 464 } 465 466 splice_init_state = error != 0 ? -1 : 1; 467 sx_xunlock(&splice_init_lock); 468 469 return (error); 470 } 471 472 /* 473 * Lock a pair of socket's I/O locks for splicing. Avoid blocking while holding 474 * one lock in order to avoid potential deadlocks in case there is some other 475 * code path which acquires more than one I/O lock at a time. 476 */ 477 static void 478 splice_lock_pair(struct socket *so_src, struct socket *so_dst) 479 { 480 int error; 481 482 for (;;) { 483 error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR); 484 KASSERT(error == 0, 485 ("%s: failed to lock send I/O lock: %d", __func__, error)); 486 error = SOCK_IO_RECV_LOCK(so_src, 0); 487 KASSERT(error == 0 || error == EWOULDBLOCK, 488 ("%s: failed to lock recv I/O lock: %d", __func__, error)); 489 if (error == 0) 490 break; 491 SOCK_IO_SEND_UNLOCK(so_dst); 492 493 error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR); 494 KASSERT(error == 0, 495 ("%s: failed to lock recv I/O lock: %d", __func__, error)); 496 error = SOCK_IO_SEND_LOCK(so_dst, 0); 497 KASSERT(error == 0 || error == EWOULDBLOCK, 498 ("%s: failed to lock send I/O lock: %d", __func__, error)); 499 if (error == 0) 500 break; 501 SOCK_IO_RECV_UNLOCK(so_src); 502 } 503 } 504 505 static void 506 splice_unlock_pair(struct socket *so_src, struct socket *so_dst) 507 { 508 SOCK_IO_RECV_UNLOCK(so_src); 509 SOCK_IO_SEND_UNLOCK(so_dst); 510 } 511 512 /* 513 * Move data from the source to the sink. Assumes that both of the relevant 514 * socket I/O locks are held. 
515 */ 516 static int 517 so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max, 518 ssize_t *lenp) 519 { 520 struct uio uio; 521 struct mbuf *m; 522 struct sockbuf *sb_src, *sb_dst; 523 ssize_t len; 524 long space; 525 int error, flags; 526 527 SOCK_IO_RECV_ASSERT_LOCKED(so_src); 528 SOCK_IO_SEND_ASSERT_LOCKED(so_dst); 529 530 error = 0; 531 m = NULL; 532 memset(&uio, 0, sizeof(uio)); 533 534 sb_src = &so_src->so_rcv; 535 sb_dst = &so_dst->so_snd; 536 537 space = sbspace(sb_dst); 538 if (space < 0) 539 space = 0; 540 len = MIN(max, MIN(space, sbavail(sb_src))); 541 if (len == 0) { 542 SOCK_RECVBUF_LOCK(so_src); 543 if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0) 544 error = EPIPE; 545 SOCK_RECVBUF_UNLOCK(so_src); 546 } else { 547 flags = MSG_DONTWAIT; 548 uio.uio_resid = len; 549 if (splice_receive_stream && sb_src->sb_tls_info == NULL) { 550 error = soreceive_stream_locked(so_src, sb_src, NULL, 551 &uio, &m, NULL, flags); 552 } else { 553 error = soreceive_generic_locked(so_src, NULL, 554 &uio, &m, NULL, &flags); 555 } 556 if (error != 0 && m != NULL) { 557 m_freem(m); 558 m = NULL; 559 } 560 } 561 if (m != NULL) { 562 len -= uio.uio_resid; 563 error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL, 564 MSG_DONTWAIT, curthread); 565 } else if (error == 0) { 566 len = 0; 567 SOCK_SENDBUF_LOCK(so_dst); 568 if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0) 569 error = EPIPE; 570 SOCK_SENDBUF_UNLOCK(so_dst); 571 } 572 if (error == 0) 573 *lenp = len; 574 return (error); 575 } 576 577 /* 578 * Transfer data from the source to the sink. 579 * 580 * If "direct" is true, the transfer is done in the context of whichever thread 581 * is operating on one of the socket buffers. We do not know which locks are 582 * held, so we can only trylock the socket buffers; if this fails, we fall back 583 * to the worker thread, which invokes this routine with "direct" set to false. 584 */ 585 static void 586 so_splice_xfer(struct so_splice *sp) 587 { 588 struct socket *so_src, *so_dst; 589 off_t max; 590 ssize_t len; 591 int error; 592 593 mtx_assert(&sp->mtx, MA_OWNED); 594 KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING, 595 ("so_splice_xfer: invalid state %d", sp->state)); 596 KASSERT(sp->max != 0, ("so_splice_xfer: max == 0")); 597 598 if (sp->state == SPLICE_CLOSING) { 599 /* Userspace asked us to close the splice. */ 600 goto closing; 601 } 602 603 sp->state = SPLICE_RUNNING; 604 so_src = sp->src; 605 so_dst = sp->dst; 606 max = sp->max > 0 ? sp->max - so_src->so_splice_sent : OFF_MAX; 607 if (max < 0) 608 max = 0; 609 610 /* 611 * Lock the sockets in order to block userspace from doing anything 612 * sneaky. If an error occurs or one of the sockets can no longer 613 * transfer data, we will automatically unsplice. 614 */ 615 mtx_unlock(&sp->mtx); 616 splice_lock_pair(so_src, so_dst); 617 618 error = so_splice_xfer_data(so_src, so_dst, max, &len); 619 620 mtx_lock(&sp->mtx); 621 622 /* 623 * Update our stats while still holding the socket locks. This 624 * synchronizes with getsockopt(SO_SPLICE), see the comment there. 
625 */ 626 if (error == 0) { 627 KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len)); 628 so_src->so_splice_sent += len; 629 } 630 splice_unlock_pair(so_src, so_dst); 631 632 switch (sp->state) { 633 case SPLICE_CLOSING: 634 closing: 635 sp->state = SPLICE_CLOSED; 636 wakeup(sp); 637 mtx_unlock(&sp->mtx); 638 break; 639 case SPLICE_RUNNING: 640 if (error != 0 || 641 (sp->max > 0 && so_src->so_splice_sent >= sp->max)) { 642 sp->state = SPLICE_EXCEPTION; 643 soref(so_src); 644 mtx_unlock(&sp->mtx); 645 (void)so_unsplice(so_src, false); 646 sorele(so_src); 647 } else { 648 /* 649 * Locklessly check for additional bytes in the source's 650 * receive buffer and queue more work if possible. We 651 * may end up queuing needless work, but that's ok, and 652 * if we race with a thread inserting more data into the 653 * buffer and observe sbavail() == 0, the splice mutex 654 * ensures that splice_push() will queue more work for 655 * us. 656 */ 657 if (sbavail(&so_src->so_rcv) > 0 && 658 sbspace(&so_dst->so_snd) > 0) { 659 sp->state = SPLICE_QUEUED; 660 mtx_unlock(&sp->mtx); 661 so_splice_dispatch_async(sp); 662 } else { 663 sp->state = SPLICE_IDLE; 664 mtx_unlock(&sp->mtx); 665 } 666 } 667 break; 668 default: 669 __assert_unreachable(); 670 } 671 } 672 673 static void 674 socket_init(void *tag) 675 { 676 677 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 678 NULL, NULL, UMA_ALIGN_PTR, 0); 679 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 680 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 681 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 682 EVENTHANDLER_PRI_FIRST); 683 } 684 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 685 686 #ifdef SOCKET_HHOOK 687 static void 688 socket_hhook_register(int subtype) 689 { 690 691 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 692 &V_socket_hhh[subtype], 693 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 694 printf("%s: WARNING: unable to register hook\n", __func__); 695 } 696 697 static void 698 socket_hhook_deregister(int subtype) 699 { 700 701 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 702 printf("%s: WARNING: unable to deregister hook\n", __func__); 703 } 704 705 static void 706 socket_vnet_init(const void *unused __unused) 707 { 708 int i; 709 710 /* We expect a contiguous range */ 711 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 712 socket_hhook_register(i); 713 } 714 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 715 socket_vnet_init, NULL); 716 717 static void 718 socket_vnet_uninit(const void *unused __unused) 719 { 720 int i; 721 722 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 723 socket_hhook_deregister(i); 724 } 725 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 726 socket_vnet_uninit, NULL); 727 #endif /* SOCKET_HHOOK */ 728 729 /* 730 * Initialise maxsockets. This SYSINIT must be run after 731 * tunable_mbinit(). 732 */ 733 static void 734 init_maxsockets(void *ignored) 735 { 736 737 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 738 maxsockets = imax(maxsockets, maxfiles); 739 } 740 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 741 742 /* 743 * Sysctl to get and set the maximum global sockets limit. Notify protocols 744 * of the change so that they can update their dependent limits as required. 
745 */ 746 static int 747 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 748 { 749 int error, newmaxsockets; 750 751 newmaxsockets = maxsockets; 752 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 753 if (error == 0 && req->newptr && newmaxsockets != maxsockets) { 754 if (newmaxsockets > maxsockets && 755 newmaxsockets <= maxfiles) { 756 maxsockets = newmaxsockets; 757 EVENTHANDLER_INVOKE(maxsockets_change); 758 } else 759 error = EINVAL; 760 } 761 return (error); 762 } 763 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, 764 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 765 &maxsockets, 0, sysctl_maxsockets, "IU", 766 "Maximum number of sockets available"); 767 768 /* 769 * Socket operation routines. These routines are called by the routines in 770 * sys_socket.c or from a system process, and implement the semantics of 771 * socket operations by switching out to the protocol specific routines. 772 */ 773 774 /* 775 * Get a socket structure from our zone, and initialize it. Note that it 776 * would probably be better to allocate socket and PCB at the same time, but 777 * I'm not convinced that all the protocols can be easily modified to do 778 * this. 779 * 780 * soalloc() returns a socket with a ref count of 0. 781 */ 782 static struct socket * 783 soalloc(struct vnet *vnet) 784 { 785 struct socket *so; 786 787 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 788 if (so == NULL) 789 return (NULL); 790 #ifdef MAC 791 if (mac_socket_init(so, M_NOWAIT) != 0) { 792 uma_zfree(socket_zone, so); 793 return (NULL); 794 } 795 #endif 796 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { 797 uma_zfree(socket_zone, so); 798 return (NULL); 799 } 800 801 /* 802 * The socket locking protocol allows to lock 2 sockets at a time, 803 * however, the first one must be a listening socket. WITNESS lacks 804 * a feature to change class of an existing lock, so we use DUPOK. 805 */ 806 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); 807 mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); 808 mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); 809 so->so_rcv.sb_sel = &so->so_rdsel; 810 so->so_snd.sb_sel = &so->so_wrsel; 811 sx_init(&so->so_snd_sx, "so_snd_sx"); 812 sx_init(&so->so_rcv_sx, "so_rcv_sx"); 813 TAILQ_INIT(&so->so_snd.sb_aiojobq); 814 TAILQ_INIT(&so->so_rcv.sb_aiojobq); 815 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); 816 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); 817 #ifdef VIMAGE 818 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 819 __func__, __LINE__, so)); 820 so->so_vnet = vnet; 821 #endif 822 #ifdef SOCKET_HHOOK 823 /* We shouldn't need the so_global_mtx */ 824 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { 825 /* Do we need more comprehensive error returns? */ 826 uma_zfree(socket_zone, so); 827 return (NULL); 828 } 829 #endif 830 mtx_lock(&so_global_mtx); 831 so->so_gencnt = ++so_gencnt; 832 ++numopensockets; 833 #ifdef VIMAGE 834 vnet->vnet_sockcnt++; 835 #endif 836 mtx_unlock(&so_global_mtx); 837 838 return (so); 839 } 840 841 /* 842 * Free the storage associated with a socket at the socket layer, tear down 843 * locks, labels, etc. All protocol state is assumed already to have been 844 * torn down (and possibly never set up) by the caller. 
845 */ 846 void 847 sodealloc(struct socket *so) 848 { 849 850 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 851 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 852 853 mtx_lock(&so_global_mtx); 854 so->so_gencnt = ++so_gencnt; 855 --numopensockets; /* Could be below, but faster here. */ 856 #ifdef VIMAGE 857 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 858 __func__, __LINE__, so)); 859 so->so_vnet->vnet_sockcnt--; 860 #endif 861 mtx_unlock(&so_global_mtx); 862 #ifdef MAC 863 mac_socket_destroy(so); 864 #endif 865 #ifdef SOCKET_HHOOK 866 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); 867 #endif 868 869 khelp_destroy_osd(&so->osd); 870 if (SOLISTENING(so)) { 871 if (so->sol_accept_filter != NULL) 872 accept_filt_setopt(so, NULL); 873 } else { 874 if (so->so_rcv.sb_hiwat) 875 (void)chgsbsize(so->so_cred->cr_uidinfo, 876 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 877 if (so->so_snd.sb_hiwat) 878 (void)chgsbsize(so->so_cred->cr_uidinfo, 879 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 880 sx_destroy(&so->so_snd_sx); 881 sx_destroy(&so->so_rcv_sx); 882 mtx_destroy(&so->so_snd_mtx); 883 mtx_destroy(&so->so_rcv_mtx); 884 } 885 crfree(so->so_cred); 886 mtx_destroy(&so->so_lock); 887 uma_zfree(socket_zone, so); 888 } 889 890 /* 891 * socreate returns a socket with a ref count of 1 and a file descriptor 892 * reference. The socket should be closed with soclose(). 893 */ 894 int 895 socreate(int dom, struct socket **aso, int type, int proto, 896 struct ucred *cred, struct thread *td) 897 { 898 struct protosw *prp; 899 struct socket *so; 900 int error; 901 902 /* 903 * XXX: divert(4) historically abused PF_INET. Keep this compatibility 904 * shim until all applications have been updated. 905 */ 906 if (__predict_false(dom == PF_INET && type == SOCK_RAW && 907 proto == IPPROTO_DIVERT)) { 908 dom = PF_DIVERT; 909 printf("%s uses obsolete way to create divert(4) socket\n", 910 td->td_proc->p_comm); 911 } 912 913 prp = pffindproto(dom, type, proto); 914 if (prp == NULL) { 915 /* No support for domain. */ 916 if (pffinddomain(dom) == NULL) 917 return (EAFNOSUPPORT); 918 /* No support for socket type. */ 919 if (proto == 0 && type != 0) 920 return (EPROTOTYPE); 921 return (EPROTONOSUPPORT); 922 } 923 924 MPASS(prp->pr_attach); 925 926 if ((prp->pr_flags & PR_CAPATTACH) == 0) { 927 if (CAP_TRACING(td)) 928 ktrcapfail(CAPFAIL_PROTO, &proto); 929 if (IN_CAPABILITY_MODE(td)) 930 return (ECAPMODE); 931 } 932 933 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) 934 return (EPROTONOSUPPORT); 935 936 so = soalloc(CRED_TO_VNET(cred)); 937 if (so == NULL) 938 return (ENOBUFS); 939 940 so->so_type = type; 941 so->so_cred = crhold(cred); 942 if ((prp->pr_domain->dom_family == PF_INET) || 943 (prp->pr_domain->dom_family == PF_INET6) || 944 (prp->pr_domain->dom_family == PF_ROUTE)) 945 so->so_fibnum = td->td_proc->p_fibnum; 946 else 947 so->so_fibnum = 0; 948 so->so_proto = prp; 949 #ifdef MAC 950 mac_socket_create(cred, so); 951 #endif 952 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 953 so_rdknl_assert_lock); 954 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 955 so_wrknl_assert_lock); 956 if ((prp->pr_flags & PR_SOCKBUF) == 0) { 957 so->so_snd.sb_mtx = &so->so_snd_mtx; 958 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 959 } 960 /* 961 * Auto-sizing of socket buffers is managed by the protocols and 962 * the appropriate flags must be set in the pru_attach function. 
963 */ 964 CURVNET_SET(so->so_vnet); 965 error = prp->pr_attach(so, proto, td); 966 CURVNET_RESTORE(); 967 if (error) { 968 sodealloc(so); 969 return (error); 970 } 971 soref(so); 972 *aso = so; 973 return (0); 974 } 975 976 #ifdef REGRESSION 977 static int regression_sonewconn_earlytest = 1; 978 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 979 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 980 #endif 981 982 static int sooverprio = LOG_DEBUG; 983 SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW, 984 &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable"); 985 986 static struct timeval overinterval = { 60, 0 }; 987 SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, 988 &overinterval, 989 "Delay in seconds between warnings for listen socket overflows"); 990 991 /* 992 * When an attempt at a new connection is noted on a socket which supports 993 * accept(2), the protocol has two options: 994 * 1) Call legacy sonewconn() function, which would call protocol attach 995 * method, same as used for socket(2). 996 * 2) Call solisten_clone(), do attach that is specific to a cloned connection, 997 * and then call solisten_enqueue(). 998 * 999 * Note: the ref count on the socket is 0 on return. 1000 */ 1001 struct socket * 1002 solisten_clone(struct socket *head) 1003 { 1004 struct sbuf descrsb; 1005 struct socket *so; 1006 int len, overcount; 1007 u_int qlen; 1008 const char localprefix[] = "local:"; 1009 char descrbuf[SUNPATHLEN + sizeof(localprefix)]; 1010 #if defined(INET6) 1011 char addrbuf[INET6_ADDRSTRLEN]; 1012 #elif defined(INET) 1013 char addrbuf[INET_ADDRSTRLEN]; 1014 #endif 1015 bool dolog, over; 1016 1017 SOLISTEN_LOCK(head); 1018 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 1019 #ifdef REGRESSION 1020 if (regression_sonewconn_earlytest && over) { 1021 #else 1022 if (over) { 1023 #endif 1024 head->sol_overcount++; 1025 dolog = (sooverprio >= 0) && 1026 !!ratecheck(&head->sol_lastover, &overinterval); 1027 1028 /* 1029 * If we're going to log, copy the overflow count and queue 1030 * length from the listen socket before dropping the lock. 1031 * Also, reset the overflow count. 1032 */ 1033 if (dolog) { 1034 overcount = head->sol_overcount; 1035 head->sol_overcount = 0; 1036 qlen = head->sol_qlen; 1037 } 1038 SOLISTEN_UNLOCK(head); 1039 1040 if (dolog) { 1041 /* 1042 * Try to print something descriptive about the 1043 * socket for the error message. 
1044 */ 1045 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), 1046 SBUF_FIXEDLEN); 1047 switch (head->so_proto->pr_domain->dom_family) { 1048 #if defined(INET) || defined(INET6) 1049 #ifdef INET 1050 case AF_INET: 1051 #endif 1052 #ifdef INET6 1053 case AF_INET6: 1054 if (head->so_proto->pr_domain->dom_family == 1055 AF_INET6 || 1056 (sotoinpcb(head)->inp_inc.inc_flags & 1057 INC_ISIPV6)) { 1058 ip6_sprintf(addrbuf, 1059 &sotoinpcb(head)->inp_inc.inc6_laddr); 1060 sbuf_printf(&descrsb, "[%s]", addrbuf); 1061 } else 1062 #endif 1063 { 1064 #ifdef INET 1065 inet_ntoa_r( 1066 sotoinpcb(head)->inp_inc.inc_laddr, 1067 addrbuf); 1068 sbuf_cat(&descrsb, addrbuf); 1069 #endif 1070 } 1071 sbuf_printf(&descrsb, ":%hu (proto %u)", 1072 ntohs(sotoinpcb(head)->inp_inc.inc_lport), 1073 head->so_proto->pr_protocol); 1074 break; 1075 #endif /* INET || INET6 */ 1076 case AF_UNIX: 1077 sbuf_cat(&descrsb, localprefix); 1078 if (sotounpcb(head)->unp_addr != NULL) 1079 len = 1080 sotounpcb(head)->unp_addr->sun_len - 1081 offsetof(struct sockaddr_un, 1082 sun_path); 1083 else 1084 len = 0; 1085 if (len > 0) 1086 sbuf_bcat(&descrsb, 1087 sotounpcb(head)->unp_addr->sun_path, 1088 len); 1089 else 1090 sbuf_cat(&descrsb, "(unknown)"); 1091 break; 1092 } 1093 1094 /* 1095 * If we can't print something more specific, at least 1096 * print the domain name. 1097 */ 1098 if (sbuf_finish(&descrsb) != 0 || 1099 sbuf_len(&descrsb) <= 0) { 1100 sbuf_clear(&descrsb); 1101 sbuf_cat(&descrsb, 1102 head->so_proto->pr_domain->dom_name ?: 1103 "unknown"); 1104 sbuf_finish(&descrsb); 1105 } 1106 KASSERT(sbuf_len(&descrsb) > 0, 1107 ("%s: sbuf creation failed", __func__)); 1108 /* 1109 * Preserve the historic listen queue overflow log 1110 * message, that starts with "sonewconn:". It has 1111 * been known to sysadmins for years and also test 1112 * sys/kern/sonewconn_overflow checks for it. 1113 */ 1114 if (head->so_cred == 0) { 1115 log(LOG_PRI(sooverprio), 1116 "sonewconn: pcb %p (%s): " 1117 "Listen queue overflow: %i already in " 1118 "queue awaiting acceptance (%d " 1119 "occurrences)\n", head->so_pcb, 1120 sbuf_data(&descrsb), 1121 qlen, overcount); 1122 } else { 1123 log(LOG_PRI(sooverprio), 1124 "sonewconn: pcb %p (%s): " 1125 "Listen queue overflow: " 1126 "%i already in queue awaiting acceptance " 1127 "(%d occurrences), euid %d, rgid %d, jail %s\n", 1128 head->so_pcb, sbuf_data(&descrsb), qlen, 1129 overcount, head->so_cred->cr_uid, 1130 head->so_cred->cr_rgid, 1131 head->so_cred->cr_prison ? 1132 head->so_cred->cr_prison->pr_name : 1133 "not_jailed"); 1134 } 1135 sbuf_delete(&descrsb); 1136 1137 overcount = 0; 1138 } 1139 1140 return (NULL); 1141 } 1142 SOLISTEN_UNLOCK(head); 1143 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 1144 __func__, head)); 1145 so = soalloc(head->so_vnet); 1146 if (so == NULL) { 1147 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 1148 "limit reached or out of memory\n", 1149 __func__, head->so_pcb); 1150 return (NULL); 1151 } 1152 so->so_listen = head; 1153 so->so_type = head->so_type; 1154 /* 1155 * POSIX is ambiguous on what options an accept(2)ed socket should 1156 * inherit from the listener. Words "create a new socket" may be 1157 * interpreted as not inheriting anything. Best programming practice 1158 * for application developers is to not rely on such inheritance. 
1159 * FreeBSD had historically inherited all so_options excluding 1160 * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options, 1161 * including those completely irrelevant to a new born socket. For 1162 * compatibility with older versions we will inherit a list of 1163 * meaningful options. 1164 * The crucial bit to inherit is SO_ACCEPTFILTER. We need it present 1165 * in the child socket for soisconnected() promoting socket from the 1166 * incomplete queue to complete. It will be cleared before the child 1167 * gets available to accept(2). 1168 */ 1169 so->so_options = head->so_options & (SO_ACCEPTFILTER | SO_KEEPALIVE | 1170 SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE); 1171 so->so_linger = head->so_linger; 1172 so->so_state = head->so_state; 1173 so->so_fibnum = head->so_fibnum; 1174 so->so_proto = head->so_proto; 1175 so->so_cred = crhold(head->so_cred); 1176 #ifdef SOCKET_HHOOK 1177 if (V_socket_hhh[HHOOK_SOCKET_NEWCONN]->hhh_nhooks > 0) { 1178 if (hhook_run_socket(so, head, HHOOK_SOCKET_NEWCONN)) { 1179 sodealloc(so); 1180 log(LOG_DEBUG, "%s: hhook run failed\n", __func__); 1181 return (NULL); 1182 } 1183 } 1184 #endif 1185 #ifdef MAC 1186 mac_socket_newconn(head, so); 1187 #endif 1188 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1189 so_rdknl_assert_lock); 1190 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1191 so_wrknl_assert_lock); 1192 VNET_SO_ASSERT(head); 1193 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { 1194 sodealloc(so); 1195 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 1196 __func__, head->so_pcb); 1197 return (NULL); 1198 } 1199 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; 1200 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 1201 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 1202 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 1203 so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; 1204 so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; 1205 if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 1206 so->so_snd.sb_mtx = &so->so_snd_mtx; 1207 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 1208 } 1209 1210 return (so); 1211 } 1212 1213 /* Connstatus may be 0 or SS_ISCONNECTED. */ 1214 struct socket * 1215 sonewconn(struct socket *head, int connstatus) 1216 { 1217 struct socket *so; 1218 1219 if ((so = solisten_clone(head)) == NULL) 1220 return (NULL); 1221 1222 if (so->so_proto->pr_attach(so, 0, NULL) != 0) { 1223 sodealloc(so); 1224 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", 1225 __func__, head->so_pcb); 1226 return (NULL); 1227 } 1228 1229 (void)solisten_enqueue(so, connstatus); 1230 1231 return (so); 1232 } 1233 1234 /* 1235 * Enqueue socket cloned by solisten_clone() to the listen queue of the 1236 * listener it has been cloned from. 1237 * 1238 * Return 'true' if socket landed on complete queue, otherwise 'false'. 1239 */ 1240 bool 1241 solisten_enqueue(struct socket *so, int connstatus) 1242 { 1243 struct socket *head = so->so_listen; 1244 1245 MPASS(refcount_load(&so->so_count) == 0); 1246 refcount_init(&so->so_count, 1); 1247 1248 SOLISTEN_LOCK(head); 1249 if (head->sol_accept_filter != NULL) 1250 connstatus = 0; 1251 so->so_state |= connstatus; 1252 soref(head); /* A socket on (in)complete queue refs head. 
*/ 1253 if (connstatus) { 1254 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 1255 so->so_qstate = SQ_COMP; 1256 head->sol_qlen++; 1257 solisten_wakeup(head); /* unlocks */ 1258 return (true); 1259 } else { 1260 /* 1261 * Keep removing sockets from the head until there's room for 1262 * us to insert on the tail. In pre-locking revisions, this 1263 * was a simple if(), but as we could be racing with other 1264 * threads and soabort() requires dropping locks, we must 1265 * loop waiting for the condition to be true. 1266 */ 1267 while (head->sol_incqlen > head->sol_qlimit) { 1268 struct socket *sp; 1269 1270 sp = TAILQ_FIRST(&head->sol_incomp); 1271 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 1272 head->sol_incqlen--; 1273 SOCK_LOCK(sp); 1274 sp->so_qstate = SQ_NONE; 1275 sp->so_listen = NULL; 1276 SOCK_UNLOCK(sp); 1277 sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ 1278 soabort(sp); 1279 SOLISTEN_LOCK(head); 1280 } 1281 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 1282 so->so_qstate = SQ_INCOMP; 1283 head->sol_incqlen++; 1284 SOLISTEN_UNLOCK(head); 1285 return (false); 1286 } 1287 } 1288 1289 #if defined(SCTP) || defined(SCTP_SUPPORT) 1290 /* 1291 * Socket part of sctp_peeloff(). Detach a new socket from an 1292 * association. The new socket is returned with a reference. 1293 * 1294 * XXXGL: reduce copy-paste with solisten_clone(). 1295 */ 1296 struct socket * 1297 sopeeloff(struct socket *head) 1298 { 1299 struct socket *so; 1300 1301 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 1302 __func__, __LINE__, head)); 1303 so = soalloc(head->so_vnet); 1304 if (so == NULL) { 1305 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 1306 "limit reached or out of memory\n", 1307 __func__, head->so_pcb); 1308 return (NULL); 1309 } 1310 so->so_type = head->so_type; 1311 so->so_options = head->so_options; 1312 so->so_linger = head->so_linger; 1313 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 1314 so->so_fibnum = head->so_fibnum; 1315 so->so_proto = head->so_proto; 1316 so->so_cred = crhold(head->so_cred); 1317 #ifdef MAC 1318 mac_socket_newconn(head, so); 1319 #endif 1320 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1321 so_rdknl_assert_lock); 1322 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1323 so_wrknl_assert_lock); 1324 VNET_SO_ASSERT(head); 1325 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 1326 sodealloc(so); 1327 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 1328 __func__, head->so_pcb); 1329 return (NULL); 1330 } 1331 if ((*so->so_proto->pr_attach)(so, 0, NULL)) { 1332 sodealloc(so); 1333 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 1334 __func__, head->so_pcb); 1335 return (NULL); 1336 } 1337 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 1338 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 1339 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 1340 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 1341 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 1342 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 1343 if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 1344 so->so_snd.sb_mtx = &so->so_snd_mtx; 1345 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 1346 } 1347 1348 soref(so); 1349 1350 return (so); 1351 } 1352 #endif /* SCTP */ 1353 1354 int 1355 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 1356 { 1357 int error; 1358 1359 CURVNET_SET(so->so_vnet); 1360 error = so->so_proto->pr_bind(so, nam, td); 1361 
CURVNET_RESTORE(); 1362 return (error); 1363 } 1364 1365 int 1366 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 1367 { 1368 int error; 1369 1370 CURVNET_SET(so->so_vnet); 1371 error = so->so_proto->pr_bindat(fd, so, nam, td); 1372 CURVNET_RESTORE(); 1373 return (error); 1374 } 1375 1376 /* 1377 * solisten() transitions a socket from a non-listening state to a listening 1378 * state, but can also be used to update the listen queue depth on an 1379 * existing listen socket. The protocol will call back into the sockets 1380 * layer using solisten_proto_check() and solisten_proto() to check and set 1381 * socket-layer listen state. Call backs are used so that the protocol can 1382 * acquire both protocol and socket layer locks in whatever order is required 1383 * by the protocol. 1384 * 1385 * Protocol implementors are advised to hold the socket lock across the 1386 * socket-layer test and set to avoid races at the socket layer. 1387 */ 1388 int 1389 solisten(struct socket *so, int backlog, struct thread *td) 1390 { 1391 int error; 1392 1393 CURVNET_SET(so->so_vnet); 1394 error = so->so_proto->pr_listen(so, backlog, td); 1395 CURVNET_RESTORE(); 1396 return (error); 1397 } 1398 1399 /* 1400 * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in 1401 * order to interlock with socket I/O. 1402 */ 1403 int 1404 solisten_proto_check(struct socket *so) 1405 { 1406 SOCK_LOCK_ASSERT(so); 1407 1408 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1409 SS_ISDISCONNECTING)) != 0) 1410 return (EINVAL); 1411 1412 /* 1413 * Sleeping is not permitted here, so simply fail if userspace is 1414 * attempting to transmit or receive on the socket. This kind of 1415 * transient failure is not ideal, but it should occur only if userspace 1416 * is misusing the socket interfaces. 1417 */ 1418 if (!sx_try_xlock(&so->so_snd_sx)) 1419 return (EAGAIN); 1420 if (!sx_try_xlock(&so->so_rcv_sx)) { 1421 sx_xunlock(&so->so_snd_sx); 1422 return (EAGAIN); 1423 } 1424 mtx_lock(&so->so_snd_mtx); 1425 mtx_lock(&so->so_rcv_mtx); 1426 1427 /* Interlock with soo_aio_queue() and KTLS. */ 1428 if (!SOLISTENING(so)) { 1429 bool ktls; 1430 1431 #ifdef KERN_TLS 1432 ktls = so->so_snd.sb_tls_info != NULL || 1433 so->so_rcv.sb_tls_info != NULL; 1434 #else 1435 ktls = false; 1436 #endif 1437 if (ktls || 1438 (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || 1439 (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { 1440 solisten_proto_abort(so); 1441 return (EINVAL); 1442 } 1443 } 1444 1445 return (0); 1446 } 1447 1448 /* 1449 * Undo the setup done by solisten_proto_check(). 1450 */ 1451 void 1452 solisten_proto_abort(struct socket *so) 1453 { 1454 mtx_unlock(&so->so_snd_mtx); 1455 mtx_unlock(&so->so_rcv_mtx); 1456 sx_xunlock(&so->so_snd_sx); 1457 sx_xunlock(&so->so_rcv_sx); 1458 } 1459 1460 void 1461 solisten_proto(struct socket *so, int backlog) 1462 { 1463 int sbrcv_lowat, sbsnd_lowat; 1464 u_int sbrcv_hiwat, sbsnd_hiwat; 1465 short sbrcv_flags, sbsnd_flags; 1466 sbintime_t sbrcv_timeo, sbsnd_timeo; 1467 1468 SOCK_LOCK_ASSERT(so); 1469 KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1470 SS_ISDISCONNECTING)) == 0, 1471 ("%s: bad socket state %p", __func__, so)); 1472 1473 if (SOLISTENING(so)) 1474 goto listening; 1475 1476 /* 1477 * Change this socket to listening state. 
1478 */ 1479 sbrcv_lowat = so->so_rcv.sb_lowat; 1480 sbsnd_lowat = so->so_snd.sb_lowat; 1481 sbrcv_hiwat = so->so_rcv.sb_hiwat; 1482 sbsnd_hiwat = so->so_snd.sb_hiwat; 1483 sbrcv_flags = so->so_rcv.sb_flags; 1484 sbsnd_flags = so->so_snd.sb_flags; 1485 sbrcv_timeo = so->so_rcv.sb_timeo; 1486 sbsnd_timeo = so->so_snd.sb_timeo; 1487 1488 if (!(so->so_proto->pr_flags & PR_SOCKBUF)) { 1489 sbdestroy(so, SO_SND); 1490 sbdestroy(so, SO_RCV); 1491 } 1492 1493 #ifdef INVARIANTS 1494 bzero(&so->so_rcv, 1495 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 1496 #endif 1497 1498 so->sol_sbrcv_lowat = sbrcv_lowat; 1499 so->sol_sbsnd_lowat = sbsnd_lowat; 1500 so->sol_sbrcv_hiwat = sbrcv_hiwat; 1501 so->sol_sbsnd_hiwat = sbsnd_hiwat; 1502 so->sol_sbrcv_flags = sbrcv_flags; 1503 so->sol_sbsnd_flags = sbsnd_flags; 1504 so->sol_sbrcv_timeo = sbrcv_timeo; 1505 so->sol_sbsnd_timeo = sbsnd_timeo; 1506 1507 so->sol_qlen = so->sol_incqlen = 0; 1508 TAILQ_INIT(&so->sol_incomp); 1509 TAILQ_INIT(&so->sol_comp); 1510 1511 so->sol_accept_filter = NULL; 1512 so->sol_accept_filter_arg = NULL; 1513 so->sol_accept_filter_str = NULL; 1514 1515 so->sol_upcall = NULL; 1516 so->sol_upcallarg = NULL; 1517 1518 so->so_options |= SO_ACCEPTCONN; 1519 1520 listening: 1521 if (backlog < 0 || backlog > somaxconn) 1522 backlog = somaxconn; 1523 so->sol_qlimit = backlog; 1524 1525 mtx_unlock(&so->so_snd_mtx); 1526 mtx_unlock(&so->so_rcv_mtx); 1527 sx_xunlock(&so->so_snd_sx); 1528 sx_xunlock(&so->so_rcv_sx); 1529 } 1530 1531 /* 1532 * Wakeup listeners/subsystems once we have a complete connection. 1533 * Enters with lock, returns unlocked. 1534 */ 1535 void 1536 solisten_wakeup(struct socket *sol) 1537 { 1538 1539 if (sol->sol_upcall != NULL) 1540 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 1541 else { 1542 selwakeuppri(&sol->so_rdsel, PSOCK); 1543 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 1544 } 1545 SOLISTEN_UNLOCK(sol); 1546 wakeup_one(&sol->sol_comp); 1547 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) 1548 pgsigio(&sol->so_sigio, SIGIO, 0); 1549 } 1550 1551 /* 1552 * Return single connection off a listening socket queue. Main consumer of 1553 * the function is kern_accept4(). Some modules, that do their own accept 1554 * management also use the function. The socket reference held by the 1555 * listen queue is handed to the caller. 1556 * 1557 * Listening socket must be locked on entry and is returned unlocked on 1558 * return. 1559 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 
1560 */ 1561 int 1562 solisten_dequeue(struct socket *head, struct socket **ret, int flags) 1563 { 1564 struct socket *so; 1565 int error; 1566 1567 SOLISTEN_LOCK_ASSERT(head); 1568 1569 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && 1570 head->so_error == 0) { 1571 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, 1572 "accept", 0); 1573 if (error != 0) { 1574 SOLISTEN_UNLOCK(head); 1575 return (error); 1576 } 1577 } 1578 if (head->so_error) { 1579 error = head->so_error; 1580 head->so_error = 0; 1581 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) 1582 error = EWOULDBLOCK; 1583 else 1584 error = 0; 1585 if (error) { 1586 SOLISTEN_UNLOCK(head); 1587 return (error); 1588 } 1589 so = TAILQ_FIRST(&head->sol_comp); 1590 SOCK_LOCK(so); 1591 KASSERT(so->so_qstate == SQ_COMP, 1592 ("%s: so %p not SQ_COMP", __func__, so)); 1593 head->sol_qlen--; 1594 so->so_qstate = SQ_NONE; 1595 so->so_listen = NULL; 1596 TAILQ_REMOVE(&head->sol_comp, so, so_list); 1597 if (flags & ACCEPT4_INHERIT) 1598 so->so_state |= (head->so_state & SS_NBIO); 1599 else 1600 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 1601 SOCK_UNLOCK(so); 1602 sorele_locked(head); 1603 1604 *ret = so; 1605 return (0); 1606 } 1607 1608 static struct so_splice * 1609 so_splice_alloc(off_t max) 1610 { 1611 struct so_splice *sp; 1612 1613 sp = uma_zalloc(splice_zone, M_WAITOK); 1614 sp->src = NULL; 1615 sp->dst = NULL; 1616 sp->max = max > 0 ? max : -1; 1617 do { 1618 sp->wq_index = atomic_fetchadd_32(&splice_index, 1) % 1619 (mp_maxid + 1); 1620 } while (CPU_ABSENT(sp->wq_index)); 1621 sp->state = SPLICE_IDLE; 1622 TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout, 1623 sp); 1624 return (sp); 1625 } 1626 1627 static void 1628 so_splice_free(struct so_splice *sp) 1629 { 1630 KASSERT(sp->state == SPLICE_CLOSED, 1631 ("so_splice_free: sp %p not closed", sp)); 1632 uma_zfree(splice_zone, sp); 1633 } 1634 1635 static void 1636 so_splice_timeout(void *arg, int pending __unused) 1637 { 1638 struct so_splice *sp; 1639 1640 sp = arg; 1641 (void)so_unsplice(sp->src, true); 1642 } 1643 1644 /* 1645 * Splice the output from so to the input of so2. 1646 */ 1647 static int 1648 so_splice(struct socket *so, struct socket *so2, struct splice *splice) 1649 { 1650 struct so_splice *sp; 1651 int error; 1652 1653 if (splice->sp_max < 0) 1654 return (EINVAL); 1655 /* Handle only TCP for now; TODO: other streaming protos */ 1656 if (so->so_proto->pr_protocol != IPPROTO_TCP || 1657 so2->so_proto->pr_protocol != IPPROTO_TCP) 1658 return (EPROTONOSUPPORT); 1659 if (so->so_vnet != so2->so_vnet) 1660 return (EINVAL); 1661 1662 /* so_splice_xfer() assumes that we're using these implementations. 
*/ 1663 KASSERT(so->so_proto->pr_sosend == sosend_generic, 1664 ("so_splice: sosend not sosend_generic")); 1665 KASSERT(so2->so_proto->pr_soreceive == soreceive_generic || 1666 so2->so_proto->pr_soreceive == soreceive_stream, 1667 ("so_splice: soreceive not soreceive_generic/stream")); 1668 1669 sp = so_splice_alloc(splice->sp_max); 1670 so->so_splice_sent = 0; 1671 sp->src = so; 1672 sp->dst = so2; 1673 1674 error = 0; 1675 SOCK_LOCK(so); 1676 if (SOLISTENING(so)) 1677 error = EINVAL; 1678 else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) 1679 error = ENOTCONN; 1680 else if (so->so_splice != NULL) 1681 error = EBUSY; 1682 if (error != 0) { 1683 SOCK_UNLOCK(so); 1684 uma_zfree(splice_zone, sp); 1685 return (error); 1686 } 1687 soref(so); 1688 so->so_splice = sp; 1689 SOCK_RECVBUF_LOCK(so); 1690 so->so_rcv.sb_flags |= SB_SPLICED; 1691 SOCK_RECVBUF_UNLOCK(so); 1692 SOCK_UNLOCK(so); 1693 1694 error = 0; 1695 SOCK_LOCK(so2); 1696 if (SOLISTENING(so2)) 1697 error = EINVAL; 1698 else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) 1699 error = ENOTCONN; 1700 else if (so2->so_splice_back != NULL) 1701 error = EBUSY; 1702 if (error != 0) { 1703 SOCK_UNLOCK(so2); 1704 SOCK_LOCK(so); 1705 so->so_splice = NULL; 1706 SOCK_RECVBUF_LOCK(so); 1707 so->so_rcv.sb_flags &= ~SB_SPLICED; 1708 SOCK_RECVBUF_UNLOCK(so); 1709 SOCK_UNLOCK(so); 1710 sorele(so); 1711 uma_zfree(splice_zone, sp); 1712 return (error); 1713 } 1714 soref(so2); 1715 so2->so_splice_back = sp; 1716 SOCK_SENDBUF_LOCK(so2); 1717 so2->so_snd.sb_flags |= SB_SPLICED; 1718 mtx_lock(&sp->mtx); 1719 SOCK_SENDBUF_UNLOCK(so2); 1720 SOCK_UNLOCK(so2); 1721 1722 if (splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) { 1723 taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout, 1724 tvtosbt(splice->sp_idle), 0, C_PREL(4)); 1725 } 1726 1727 /* 1728 * Transfer any data already present in the socket buffer. 1729 */ 1730 sp->state = SPLICE_QUEUED; 1731 so_splice_xfer(sp); 1732 return (0); 1733 } 1734 1735 static int 1736 so_unsplice(struct socket *so, bool timeout) 1737 { 1738 struct socket *so2; 1739 struct so_splice *sp; 1740 bool drain; 1741 1742 /* 1743 * First unset SB_SPLICED and hide the splice structure so that 1744 * wakeup routines will stop enqueuing work. This also ensures that 1745 * a only a single thread will proceed with the unsplice. 1746 */ 1747 SOCK_LOCK(so); 1748 if (SOLISTENING(so)) { 1749 SOCK_UNLOCK(so); 1750 return (EINVAL); 1751 } 1752 SOCK_RECVBUF_LOCK(so); 1753 if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) { 1754 SOCK_RECVBUF_UNLOCK(so); 1755 SOCK_UNLOCK(so); 1756 return (ENOTCONN); 1757 } 1758 so->so_rcv.sb_flags &= ~SB_SPLICED; 1759 sp = so->so_splice; 1760 so->so_splice = NULL; 1761 SOCK_RECVBUF_UNLOCK(so); 1762 SOCK_UNLOCK(so); 1763 1764 so2 = sp->dst; 1765 SOCK_LOCK(so2); 1766 KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__)); 1767 SOCK_SENDBUF_LOCK(so2); 1768 KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0, 1769 ("%s: so2 is not spliced", __func__)); 1770 KASSERT(so2->so_splice_back == sp, 1771 ("%s: so_splice_back != sp", __func__)); 1772 so2->so_snd.sb_flags &= ~SB_SPLICED; 1773 so2->so_splice_back = NULL; 1774 SOCK_SENDBUF_UNLOCK(so2); 1775 SOCK_UNLOCK(so2); 1776 1777 /* 1778 * No new work is being enqueued. The worker thread might be 1779 * splicing data right now, in which case we want to wait for it to 1780 * finish before proceeding. 
1781 */ 1782 mtx_lock(&sp->mtx); 1783 switch (sp->state) { 1784 case SPLICE_QUEUED: 1785 case SPLICE_RUNNING: 1786 sp->state = SPLICE_CLOSING; 1787 while (sp->state == SPLICE_CLOSING) 1788 msleep(sp, &sp->mtx, PSOCK, "unsplice", 0); 1789 break; 1790 case SPLICE_IDLE: 1791 case SPLICE_EXCEPTION: 1792 sp->state = SPLICE_CLOSED; 1793 break; 1794 default: 1795 __assert_unreachable(); 1796 } 1797 if (!timeout) { 1798 drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout, 1799 NULL) != 0; 1800 } else { 1801 drain = false; 1802 } 1803 mtx_unlock(&sp->mtx); 1804 if (drain) 1805 taskqueue_drain_timeout(taskqueue_thread, &sp->timeout); 1806 1807 /* 1808 * Now we hold the sole reference to the splice structure. 1809 * Clean up: signal userspace and release socket references. 1810 */ 1811 sorwakeup(so); 1812 CURVNET_SET(so->so_vnet); 1813 sorele(so); 1814 sowwakeup(so2); 1815 sorele(so2); 1816 CURVNET_RESTORE(); 1817 so_splice_free(sp); 1818 return (0); 1819 } 1820 1821 /* 1822 * Free socket upon release of the very last reference. 1823 */ 1824 static void 1825 sofree(struct socket *so) 1826 { 1827 struct protosw *pr = so->so_proto; 1828 1829 SOCK_LOCK_ASSERT(so); 1830 KASSERT(refcount_load(&so->so_count) == 0, 1831 ("%s: so %p has references", __func__, so)); 1832 KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, 1833 ("%s: so %p is on listen queue", __func__, so)); 1834 KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0, 1835 ("%s: so %p rcvbuf is spliced", __func__, so)); 1836 KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0, 1837 ("%s: so %p sndbuf is spliced", __func__, so)); 1838 KASSERT(so->so_splice == NULL && so->so_splice_back == NULL, 1839 ("%s: so %p has spliced data", __func__, so)); 1840 1841 SOCK_UNLOCK(so); 1842 1843 if (so->so_dtor != NULL) 1844 so->so_dtor(so); 1845 1846 VNET_SO_ASSERT(so); 1847 if (pr->pr_detach != NULL) 1848 pr->pr_detach(so); 1849 1850 /* 1851 * From this point on, we assume that no other references to this 1852 * socket exist anywhere else in the stack. Therefore, no locks need 1853 * to be acquired or held. 1854 */ 1855 if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { 1856 sbdestroy(so, SO_SND); 1857 sbdestroy(so, SO_RCV); 1858 } 1859 seldrain(&so->so_rdsel); 1860 seldrain(&so->so_wrsel); 1861 knlist_destroy(&so->so_rdsel.si_note); 1862 knlist_destroy(&so->so_wrsel.si_note); 1863 sodealloc(so); 1864 } 1865 1866 /* 1867 * Release a reference on a socket while holding the socket lock. 1868 * Unlocks the socket lock before returning. 1869 */ 1870 void 1871 sorele_locked(struct socket *so) 1872 { 1873 SOCK_LOCK_ASSERT(so); 1874 if (refcount_release(&so->so_count)) 1875 sofree(so); 1876 else 1877 SOCK_UNLOCK(so); 1878 } 1879 1880 /* 1881 * Close a socket on last file table reference removal. Initiate disconnect 1882 * if connected. Free socket when disconnect complete. 1883 * 1884 * This function will sorele() the socket. Note that soclose() may be called 1885 * prior to the ref count reaching zero. The actual socket structure will 1886 * not be freed until the ref count reaches zero. 
1887 */ 1888 int 1889 soclose(struct socket *so) 1890 { 1891 struct accept_queue lqueue; 1892 int error = 0; 1893 bool listening, last __diagused; 1894 1895 CURVNET_SET(so->so_vnet); 1896 funsetown(&so->so_sigio); 1897 if (so->so_state & SS_ISCONNECTED) { 1898 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1899 error = sodisconnect(so); 1900 if (error) { 1901 if (error == ENOTCONN) 1902 error = 0; 1903 goto drop; 1904 } 1905 } 1906 1907 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { 1908 if ((so->so_state & SS_ISDISCONNECTING) && 1909 (so->so_state & SS_NBIO)) 1910 goto drop; 1911 while (so->so_state & SS_ISCONNECTED) { 1912 error = tsleep(&so->so_timeo, 1913 PSOCK | PCATCH, "soclos", 1914 so->so_linger * hz); 1915 if (error) 1916 break; 1917 } 1918 } 1919 } 1920 1921 drop: 1922 if (so->so_proto->pr_close != NULL) 1923 so->so_proto->pr_close(so); 1924 1925 SOCK_LOCK(so); 1926 if ((listening = SOLISTENING(so))) { 1927 struct socket *sp; 1928 1929 TAILQ_INIT(&lqueue); 1930 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 1931 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 1932 1933 so->sol_qlen = so->sol_incqlen = 0; 1934 1935 TAILQ_FOREACH(sp, &lqueue, so_list) { 1936 SOCK_LOCK(sp); 1937 sp->so_qstate = SQ_NONE; 1938 sp->so_listen = NULL; 1939 SOCK_UNLOCK(sp); 1940 last = refcount_release(&so->so_count); 1941 KASSERT(!last, ("%s: released last reference for %p", 1942 __func__, so)); 1943 } 1944 } 1945 sorele_locked(so); 1946 if (listening) { 1947 struct socket *sp, *tsp; 1948 1949 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) 1950 soabort(sp); 1951 } 1952 CURVNET_RESTORE(); 1953 return (error); 1954 } 1955 1956 /* 1957 * soabort() is used to abruptly tear down a connection, such as when a 1958 * resource limit is reached (listen queue depth exceeded), or if a listen 1959 * socket is closed while there are sockets waiting to be accepted. 1960 * 1961 * This interface is tricky, because it is called on an unreferenced socket, 1962 * and must be called only by a thread that has actually removed the socket 1963 * from the listen queue it was on. Likely this thread holds the last 1964 * reference on the socket and soabort() will proceed with sofree(). But 1965 * it might be not the last, as the sockets on the listen queues are seen 1966 * from the protocol side. 1967 * 1968 * This interface will call into the protocol code, so must not be called 1969 * with any socket locks held. Protocols do call it while holding their own 1970 * recursible protocol mutexes, but this is something that should be subject 1971 * to review in the future. 1972 * 1973 * Usually socket should have a single reference left, but this is not a 1974 * requirement. In the past, when we have had named references for file 1975 * descriptor and protocol, we asserted that none of them are being held. 
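 *
 * Illustrative call pattern (it mirrors the listen queue flush done in
 * soclose() above and is not a separate interface):
 *
 *	SOCK_LOCK(sp);
 *	sp->so_qstate = SQ_NONE;
 *	sp->so_listen = NULL;
 *	SOCK_UNLOCK(sp);
 *	...				queue reference already dropped
 *	soabort(sp);			protocol abort, then sorele()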
1976 */ 1977 void 1978 soabort(struct socket *so) 1979 { 1980 1981 VNET_SO_ASSERT(so); 1982 1983 if (so->so_proto->pr_abort != NULL) 1984 so->so_proto->pr_abort(so); 1985 SOCK_LOCK(so); 1986 sorele_locked(so); 1987 } 1988 1989 int 1990 soaccept(struct socket *so, struct sockaddr *sa) 1991 { 1992 #ifdef INVARIANTS 1993 u_char len = sa->sa_len; 1994 #endif 1995 int error; 1996 1997 CURVNET_SET(so->so_vnet); 1998 error = so->so_proto->pr_accept(so, sa); 1999 KASSERT(sa->sa_len <= len, 2000 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2001 CURVNET_RESTORE(); 2002 return (error); 2003 } 2004 2005 int 2006 sopeeraddr(struct socket *so, struct sockaddr *sa) 2007 { 2008 #ifdef INVARIANTS 2009 u_char len = sa->sa_len; 2010 #endif 2011 int error; 2012 2013 CURVNET_SET(so->so_vnet); 2014 error = so->so_proto->pr_peeraddr(so, sa); 2015 KASSERT(sa->sa_len <= len, 2016 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2017 CURVNET_RESTORE(); 2018 2019 return (error); 2020 } 2021 2022 int 2023 sosockaddr(struct socket *so, struct sockaddr *sa) 2024 { 2025 #ifdef INVARIANTS 2026 u_char len = sa->sa_len; 2027 #endif 2028 int error; 2029 2030 CURVNET_SET(so->so_vnet); 2031 error = so->so_proto->pr_sockaddr(so, sa); 2032 KASSERT(sa->sa_len <= len, 2033 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2034 CURVNET_RESTORE(); 2035 2036 return (error); 2037 } 2038 2039 int 2040 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 2041 { 2042 2043 return (soconnectat(AT_FDCWD, so, nam, td)); 2044 } 2045 2046 int 2047 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 2048 { 2049 int error; 2050 2051 CURVNET_SET(so->so_vnet); 2052 2053 /* 2054 * If protocol is connection-based, can only connect once. 2055 * Otherwise, if connected, try to disconnect first. This allows 2056 * user to disconnect by connecting to, e.g., a null address. 2057 * 2058 * Note, this check is racy and may need to be re-evaluated at the 2059 * protocol layer. 2060 */ 2061 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 2062 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 2063 (error = sodisconnect(so)))) { 2064 error = EISCONN; 2065 } else { 2066 /* 2067 * Prevent accumulated error from previous connection from 2068 * biting us. 
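 *
 * From userland, the "null address" disconnect mentioned above is
 * conventionally issued as below (illustrative only; whether a given
 * protocol accepts an AF_UNSPEC address is protocol-dependent):
 *
 *	struct sockaddr sa = {
 *		.sa_len = sizeof(sa),
 *		.sa_family = AF_UNSPEC,
 *	};
 *
 *	(void)connect(fd, &sa, sizeof(sa));
 *
 * Clearing so_error below additionally keeps an error latched by the
 * previous association (e.g. an asynchronous ICMP error) from failing
 * the new attempt spuriously.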
2069 */ 2070 so->so_error = 0; 2071 if (fd == AT_FDCWD) { 2072 error = so->so_proto->pr_connect(so, nam, td); 2073 } else { 2074 error = so->so_proto->pr_connectat(fd, so, nam, td); 2075 } 2076 } 2077 CURVNET_RESTORE(); 2078 2079 return (error); 2080 } 2081 2082 int 2083 soconnect2(struct socket *so1, struct socket *so2) 2084 { 2085 int error; 2086 2087 CURVNET_SET(so1->so_vnet); 2088 error = so1->so_proto->pr_connect2(so1, so2); 2089 CURVNET_RESTORE(); 2090 return (error); 2091 } 2092 2093 int 2094 sodisconnect(struct socket *so) 2095 { 2096 int error; 2097 2098 if ((so->so_state & SS_ISCONNECTED) == 0) 2099 return (ENOTCONN); 2100 if (so->so_state & SS_ISDISCONNECTING) 2101 return (EALREADY); 2102 VNET_SO_ASSERT(so); 2103 error = so->so_proto->pr_disconnect(so); 2104 return (error); 2105 } 2106 2107 int 2108 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 2109 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2110 { 2111 long space; 2112 ssize_t resid; 2113 int clen = 0, error, dontroute; 2114 2115 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 2116 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 2117 ("sosend_dgram: !PR_ATOMIC")); 2118 2119 if (uio != NULL) 2120 resid = uio->uio_resid; 2121 else 2122 resid = top->m_pkthdr.len; 2123 /* 2124 * In theory resid should be unsigned. However, space must be 2125 * signed, as it might be less than 0 if we over-committed, and we 2126 * must use a signed comparison of space and resid. On the other 2127 * hand, a negative resid causes us to loop sending 0-length 2128 * segments to the protocol. 2129 */ 2130 if (resid < 0) { 2131 error = EINVAL; 2132 goto out; 2133 } 2134 2135 dontroute = 2136 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 2137 if (td != NULL) 2138 td->td_ru.ru_msgsnd++; 2139 if (control != NULL) 2140 clen = control->m_len; 2141 2142 SOCKBUF_LOCK(&so->so_snd); 2143 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2144 SOCKBUF_UNLOCK(&so->so_snd); 2145 error = EPIPE; 2146 goto out; 2147 } 2148 if (so->so_error) { 2149 error = so->so_error; 2150 so->so_error = 0; 2151 SOCKBUF_UNLOCK(&so->so_snd); 2152 goto out; 2153 } 2154 if ((so->so_state & SS_ISCONNECTED) == 0) { 2155 /* 2156 * `sendto' and `sendmsg' is allowed on a connection-based 2157 * socket if it supports implied connect. Return ENOTCONN if 2158 * not connected and no address is supplied. 2159 */ 2160 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2161 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2162 if (!(resid == 0 && clen != 0)) { 2163 SOCKBUF_UNLOCK(&so->so_snd); 2164 error = ENOTCONN; 2165 goto out; 2166 } 2167 } else if (addr == NULL) { 2168 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2169 error = ENOTCONN; 2170 else 2171 error = EDESTADDRREQ; 2172 SOCKBUF_UNLOCK(&so->so_snd); 2173 goto out; 2174 } 2175 } 2176 2177 /* 2178 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 2179 * problem and need fixing. 2180 */ 2181 space = sbspace(&so->so_snd); 2182 if (flags & MSG_OOB) 2183 space += 1024; 2184 space -= clen; 2185 SOCKBUF_UNLOCK(&so->so_snd); 2186 if (resid > space) { 2187 error = EMSGSIZE; 2188 goto out; 2189 } 2190 if (uio == NULL) { 2191 resid = 0; 2192 if (flags & MSG_EOR) 2193 top->m_flags |= M_EOR; 2194 } else { 2195 /* 2196 * Copy the data from userland into a mbuf chain. 2197 * If no data is to be copied in, a single empty mbuf 2198 * is returned. 2199 */ 2200 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 2201 (M_PKTHDR | ((flags & MSG_EOR) ? 
M_EOR : 0))); 2202 if (top == NULL) { 2203 error = EFAULT; /* only possible error */ 2204 goto out; 2205 } 2206 space -= resid - uio->uio_resid; 2207 resid = uio->uio_resid; 2208 } 2209 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 2210 /* 2211 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 2212 * than with. 2213 */ 2214 if (dontroute) { 2215 SOCK_LOCK(so); 2216 so->so_options |= SO_DONTROUTE; 2217 SOCK_UNLOCK(so); 2218 } 2219 /* 2220 * XXX all the SBS_CANTSENDMORE checks previously done could be out 2221 * of date. We could have received a reset packet in an interrupt or 2222 * maybe we slept while doing page faults in uiomove() etc. We could 2223 * probably recheck again inside the locking protection here, but 2224 * there are probably other places that this also happens. We must 2225 * rethink this. 2226 */ 2227 VNET_SO_ASSERT(so); 2228 error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : 2229 /* 2230 * If the user set MSG_EOF, the protocol understands this flag and 2231 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 2232 */ 2233 ((flags & MSG_EOF) && 2234 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2235 (resid <= 0)) ? 2236 PRUS_EOF : 2237 /* If there is more to send set PRUS_MORETOCOME */ 2238 (flags & MSG_MORETOCOME) || 2239 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 2240 top, addr, control, td); 2241 if (dontroute) { 2242 SOCK_LOCK(so); 2243 so->so_options &= ~SO_DONTROUTE; 2244 SOCK_UNLOCK(so); 2245 } 2246 clen = 0; 2247 control = NULL; 2248 top = NULL; 2249 out: 2250 if (top != NULL) 2251 m_freem(top); 2252 if (control != NULL) 2253 m_freem(control); 2254 return (error); 2255 } 2256 2257 /* 2258 * Send on a socket. If send must go all at once and message is larger than 2259 * send buffering, then hard error. Lock against other senders. If must go 2260 * all at once and not enough room now, then inform user that this would 2261 * block and do nothing. Otherwise, if nonblocking, send as much as 2262 * possible. The data to be sent is described by "uio" if nonzero, otherwise 2263 * by the mbuf chain "top" (which must be null if uio is not). Data provided 2264 * in mbuf chain must be small enough to send all at once. 2265 * 2266 * Returns nonzero on error, timeout or signal; callers must check for short 2267 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 2268 * on return. 2269 */ 2270 static int 2271 sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, 2272 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2273 { 2274 long space; 2275 ssize_t resid; 2276 int clen = 0, error, dontroute; 2277 int atomic = sosendallatonce(so) || top; 2278 int pr_send_flag; 2279 #ifdef KERN_TLS 2280 struct ktls_session *tls; 2281 int tls_enq_cnt, tls_send_flag; 2282 uint8_t tls_rtype; 2283 2284 tls = NULL; 2285 tls_rtype = TLS_RLTYPE_APP; 2286 #endif 2287 2288 SOCK_IO_SEND_ASSERT_LOCKED(so); 2289 2290 if (uio != NULL) 2291 resid = uio->uio_resid; 2292 else if ((top->m_flags & M_PKTHDR) != 0) 2293 resid = top->m_pkthdr.len; 2294 else 2295 resid = m_length(top, NULL); 2296 /* 2297 * In theory resid should be unsigned. However, space must be 2298 * signed, as it might be less than 0 if we over-committed, and we 2299 * must use a signed comparison of space and resid. On the other 2300 * hand, a negative resid causes us to loop sending 0-length 2301 * segments to the protocol. 
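 *
 * For instance, sbspace() can legitimately report a negative value when
 * the send buffer is over-committed (e.g. after SO_SNDBUF is lowered
 * below what is already queued); were 'space' unsigned, the
 * "space < resid + clen" test below would wrap around and wrongly
 * conclude there is room to send.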
2302 * 2303 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 2304 * type sockets since that's an error. 2305 */ 2306 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 2307 error = EINVAL; 2308 goto out; 2309 } 2310 2311 dontroute = 2312 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 2313 (so->so_proto->pr_flags & PR_ATOMIC); 2314 if (td != NULL) 2315 td->td_ru.ru_msgsnd++; 2316 if (control != NULL) 2317 clen = control->m_len; 2318 2319 #ifdef KERN_TLS 2320 tls_send_flag = 0; 2321 tls = ktls_hold(so->so_snd.sb_tls_info); 2322 if (tls != NULL) { 2323 if (tls->mode == TCP_TLS_MODE_SW) 2324 tls_send_flag = PRUS_NOTREADY; 2325 2326 if (control != NULL) { 2327 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 2328 2329 if (clen >= sizeof(*cm) && 2330 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 2331 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 2332 clen = 0; 2333 m_freem(control); 2334 control = NULL; 2335 atomic = 1; 2336 } 2337 } 2338 2339 if (resid == 0 && !ktls_permit_empty_frames(tls)) { 2340 error = EINVAL; 2341 goto out; 2342 } 2343 } 2344 #endif 2345 2346 restart: 2347 do { 2348 SOCKBUF_LOCK(&so->so_snd); 2349 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2350 SOCKBUF_UNLOCK(&so->so_snd); 2351 error = EPIPE; 2352 goto out; 2353 } 2354 if (so->so_error) { 2355 error = so->so_error; 2356 so->so_error = 0; 2357 SOCKBUF_UNLOCK(&so->so_snd); 2358 goto out; 2359 } 2360 if ((so->so_state & SS_ISCONNECTED) == 0) { 2361 /* 2362 * `sendto' and `sendmsg' is allowed on a connection- 2363 * based socket if it supports implied connect. 2364 * Return ENOTCONN if not connected and no address is 2365 * supplied. 2366 */ 2367 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2368 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2369 if (!(resid == 0 && clen != 0)) { 2370 SOCKBUF_UNLOCK(&so->so_snd); 2371 error = ENOTCONN; 2372 goto out; 2373 } 2374 } else if (addr == NULL) { 2375 SOCKBUF_UNLOCK(&so->so_snd); 2376 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2377 error = ENOTCONN; 2378 else 2379 error = EDESTADDRREQ; 2380 goto out; 2381 } 2382 } 2383 space = sbspace(&so->so_snd); 2384 if (flags & MSG_OOB) 2385 space += 1024; 2386 if ((atomic && resid > so->so_snd.sb_hiwat) || 2387 clen > so->so_snd.sb_hiwat) { 2388 SOCKBUF_UNLOCK(&so->so_snd); 2389 error = EMSGSIZE; 2390 goto out; 2391 } 2392 if (space < resid + clen && 2393 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 2394 if ((so->so_state & SS_NBIO) || 2395 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 2396 SOCKBUF_UNLOCK(&so->so_snd); 2397 error = EWOULDBLOCK; 2398 goto out; 2399 } 2400 error = sbwait(so, SO_SND); 2401 SOCKBUF_UNLOCK(&so->so_snd); 2402 if (error) 2403 goto out; 2404 goto restart; 2405 } 2406 SOCKBUF_UNLOCK(&so->so_snd); 2407 space -= clen; 2408 do { 2409 if (uio == NULL) { 2410 resid = 0; 2411 if (flags & MSG_EOR) 2412 top->m_flags |= M_EOR; 2413 #ifdef KERN_TLS 2414 if (tls != NULL) { 2415 ktls_frame(top, tls, &tls_enq_cnt, 2416 tls_rtype); 2417 tls_rtype = TLS_RLTYPE_APP; 2418 } 2419 #endif 2420 } else { 2421 /* 2422 * Copy the data from userland into a mbuf 2423 * chain. If resid is 0, which can happen 2424 * only if we have control to send, then 2425 * a single empty mbuf is returned. This 2426 * is a workaround to prevent protocol send 2427 * methods to panic. 2428 */ 2429 #ifdef KERN_TLS 2430 if (tls != NULL) { 2431 top = m_uiotombuf(uio, M_WAITOK, space, 2432 tls->params.max_frame_len, 2433 M_EXTPG | 2434 ((flags & MSG_EOR) ? 
M_EOR : 0)); 2435 if (top != NULL) { 2436 ktls_frame(top, tls, 2437 &tls_enq_cnt, tls_rtype); 2438 } 2439 tls_rtype = TLS_RLTYPE_APP; 2440 } else 2441 #endif 2442 top = m_uiotombuf(uio, M_WAITOK, space, 2443 (atomic ? max_hdr : 0), 2444 (atomic ? M_PKTHDR : 0) | 2445 ((flags & MSG_EOR) ? M_EOR : 0)); 2446 if (top == NULL) { 2447 error = EFAULT; /* only possible error */ 2448 goto out; 2449 } 2450 space -= resid - uio->uio_resid; 2451 resid = uio->uio_resid; 2452 } 2453 if (dontroute) { 2454 SOCK_LOCK(so); 2455 so->so_options |= SO_DONTROUTE; 2456 SOCK_UNLOCK(so); 2457 } 2458 /* 2459 * XXX all the SBS_CANTSENDMORE checks previously 2460 * done could be out of date. We could have received 2461 * a reset packet in an interrupt or maybe we slept 2462 * while doing page faults in uiomove() etc. We 2463 * could probably recheck again inside the locking 2464 * protection here, but there are probably other 2465 * places that this also happens. We must rethink 2466 * this. 2467 */ 2468 VNET_SO_ASSERT(so); 2469 2470 pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : 2471 /* 2472 * If the user set MSG_EOF, the protocol understands 2473 * this flag and nothing left to send then use 2474 * PRU_SEND_EOF instead of PRU_SEND. 2475 */ 2476 ((flags & MSG_EOF) && 2477 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2478 (resid <= 0)) ? 2479 PRUS_EOF : 2480 /* If there is more to send set PRUS_MORETOCOME. */ 2481 (flags & MSG_MORETOCOME) || 2482 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; 2483 2484 #ifdef KERN_TLS 2485 pr_send_flag |= tls_send_flag; 2486 #endif 2487 2488 error = so->so_proto->pr_send(so, pr_send_flag, top, 2489 addr, control, td); 2490 2491 if (dontroute) { 2492 SOCK_LOCK(so); 2493 so->so_options &= ~SO_DONTROUTE; 2494 SOCK_UNLOCK(so); 2495 } 2496 2497 #ifdef KERN_TLS 2498 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 2499 if (error != 0) { 2500 m_freem(top); 2501 top = NULL; 2502 } else { 2503 soref(so); 2504 ktls_enqueue(top, so, tls_enq_cnt); 2505 } 2506 } 2507 #endif 2508 clen = 0; 2509 control = NULL; 2510 top = NULL; 2511 if (error) 2512 goto out; 2513 } while (resid && space > 0); 2514 } while (resid); 2515 2516 out: 2517 #ifdef KERN_TLS 2518 if (tls != NULL) 2519 ktls_free(tls); 2520 #endif 2521 if (top != NULL) 2522 m_freem(top); 2523 if (control != NULL) 2524 m_freem(control); 2525 return (error); 2526 } 2527 2528 int 2529 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 2530 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2531 { 2532 int error; 2533 2534 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 2535 if (error) 2536 return (error); 2537 error = sosend_generic_locked(so, addr, uio, top, control, flags, td); 2538 SOCK_IO_SEND_UNLOCK(so); 2539 return (error); 2540 } 2541 2542 /* 2543 * Send to a socket from a kernel thread. 2544 * 2545 * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. 2546 * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs 2547 * to be set at all. This function should just boil down to a static inline 2548 * calling the protocol method. 2549 */ 2550 int 2551 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2552 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2553 { 2554 int error; 2555 2556 CURVNET_SET(so->so_vnet); 2557 error = so->so_proto->pr_sosend(so, addr, uio, 2558 top, control, flags, td); 2559 CURVNET_RESTORE(); 2560 return (error); 2561 } 2562 2563 /* 2564 * send(2), write(2) or aio_write(2) on a socket. 
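 *
 * Userland sketch of the SIGPIPE behavior handled here (illustrative
 * only; 'fd', 'buf' and 'len' are assumptions, not part of this file):
 *
 *	ssize_t n = send(fd, buf, len, MSG_NOSIGNAL);
 *
 *	if (n == -1 && errno == EPIPE)
 *		...			peer is gone, but no SIGPIPE was
 *					delivered to the process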
2565 */ 2566 int 2567 sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2568 struct mbuf *control, int flags, struct proc *userproc) 2569 { 2570 struct thread *td; 2571 ssize_t len; 2572 int error; 2573 2574 td = uio->uio_td; 2575 len = uio->uio_resid; 2576 CURVNET_SET(so->so_vnet); 2577 error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, 2578 td); 2579 CURVNET_RESTORE(); 2580 if (error != 0) { 2581 /* 2582 * Clear transient errors for stream protocols if they made 2583 * some progress. Make exclusion for aio(4) that would 2584 * schedule a new write in case of EWOULDBLOCK and clear 2585 * error itself. See soaio_process_job(). 2586 */ 2587 if (uio->uio_resid != len && 2588 (so->so_proto->pr_flags & PR_ATOMIC) == 0 && 2589 userproc == NULL && 2590 (error == ERESTART || error == EINTR || 2591 error == EWOULDBLOCK)) 2592 error = 0; 2593 /* Generation of SIGPIPE can be controlled per socket. */ 2594 if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && 2595 (flags & MSG_NOSIGNAL) == 0) { 2596 if (userproc != NULL) { 2597 /* aio(4) job */ 2598 PROC_LOCK(userproc); 2599 kern_psignal(userproc, SIGPIPE); 2600 PROC_UNLOCK(userproc); 2601 } else { 2602 PROC_LOCK(td->td_proc); 2603 tdsignal(td, SIGPIPE); 2604 PROC_UNLOCK(td->td_proc); 2605 } 2606 } 2607 } 2608 return (error); 2609 } 2610 2611 /* 2612 * The part of soreceive() that implements reading non-inline out-of-band 2613 * data from a socket. For more complete comments, see soreceive(), from 2614 * which this code originated. 2615 * 2616 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 2617 * unable to return an mbuf chain to the caller. 2618 */ 2619 static int 2620 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 2621 { 2622 struct protosw *pr = so->so_proto; 2623 struct mbuf *m; 2624 int error; 2625 2626 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 2627 VNET_SO_ASSERT(so); 2628 2629 m = m_get(M_WAITOK, MT_DATA); 2630 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 2631 if (error) 2632 goto bad; 2633 do { 2634 error = uiomove(mtod(m, void *), 2635 (int) min(uio->uio_resid, m->m_len), uio); 2636 m = m_free(m); 2637 } while (uio->uio_resid && error == 0 && m); 2638 bad: 2639 if (m != NULL) 2640 m_freem(m); 2641 return (error); 2642 } 2643 2644 /* 2645 * Following replacement or removal of the first mbuf on the first mbuf chain 2646 * of a socket buffer, push necessary state changes back into the socket 2647 * buffer so that other consumers see the values consistently. 'nextrecord' 2648 * is the callers locally stored value of the original value of 2649 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 2650 * NOTE: 'nextrecord' may be NULL. 2651 */ 2652 static __inline void 2653 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 2654 { 2655 2656 SOCKBUF_LOCK_ASSERT(sb); 2657 /* 2658 * First, update for the new value of nextrecord. If necessary, make 2659 * it the first record. 2660 */ 2661 if (sb->sb_mb != NULL) 2662 sb->sb_mb->m_nextpkt = nextrecord; 2663 else 2664 sb->sb_mb = nextrecord; 2665 2666 /* 2667 * Now update any dependent socket buffer fields to reflect the new 2668 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 2669 * addition of a second clause that takes care of the case where 2670 * sb_mb has been updated, but remains the last record. 
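 *
 * Illustration (assuming a buffer that held two records): after the
 * caller frees the leading mbuf of the first record, the update above
 * re-links sb_mb either to the remainder of that record or to
 * 'nextrecord', and the clause below repairs sb_mbtail/sb_lastrecord if
 * the buffer became empty or if the surviving record is now also the
 * last one.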
2671 */ 2672 if (sb->sb_mb == NULL) { 2673 sb->sb_mbtail = NULL; 2674 sb->sb_lastrecord = NULL; 2675 } else if (sb->sb_mb->m_nextpkt == NULL) 2676 sb->sb_lastrecord = sb->sb_mb; 2677 } 2678 2679 /* 2680 * Implement receive operations on a socket. We depend on the way that 2681 * records are added to the sockbuf by sbappend. In particular, each record 2682 * (mbufs linked through m_next) must begin with an address if the protocol 2683 * so specifies, followed by an optional mbuf or mbufs containing ancillary 2684 * data, and then zero or more mbufs of data. In order to allow parallelism 2685 * between network receive and copying to user space, as well as avoid 2686 * sleeping with a mutex held, we release the socket buffer mutex during the 2687 * user space copy. Although the sockbuf is locked, new data may still be 2688 * appended, and thus we must maintain consistency of the sockbuf during that 2689 * time. 2690 * 2691 * The caller may receive the data as a single mbuf chain by supplying an 2692 * mbuf **mp0 for use in returning the chain. The uio is then used only for 2693 * the count in uio_resid. 2694 */ 2695 static int 2696 soreceive_generic_locked(struct socket *so, struct sockaddr **psa, 2697 struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) 2698 { 2699 struct mbuf *m; 2700 int flags, error, offset; 2701 ssize_t len; 2702 struct protosw *pr = so->so_proto; 2703 struct mbuf *nextrecord; 2704 int moff, type = 0; 2705 ssize_t orig_resid = uio->uio_resid; 2706 bool report_real_len = false; 2707 2708 SOCK_IO_RECV_ASSERT_LOCKED(so); 2709 2710 error = 0; 2711 if (flagsp != NULL) { 2712 report_real_len = *flagsp & MSG_TRUNC; 2713 *flagsp &= ~MSG_TRUNC; 2714 flags = *flagsp &~ MSG_EOR; 2715 } else 2716 flags = 0; 2717 2718 restart: 2719 SOCKBUF_LOCK(&so->so_rcv); 2720 m = so->so_rcv.sb_mb; 2721 /* 2722 * If we have less data than requested, block awaiting more (subject 2723 * to any timeout) if: 2724 * 1. the current count is less than the low water mark, or 2725 * 2. 
MSG_DONTWAIT is not set 2726 */ 2727 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 2728 sbavail(&so->so_rcv) < uio->uio_resid) && 2729 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 2730 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 2731 KASSERT(m != NULL || !sbavail(&so->so_rcv), 2732 ("receive: m == %p sbavail == %u", 2733 m, sbavail(&so->so_rcv))); 2734 if (so->so_error || so->so_rerror) { 2735 if (m != NULL) 2736 goto dontblock; 2737 if (so->so_error) 2738 error = so->so_error; 2739 else 2740 error = so->so_rerror; 2741 if ((flags & MSG_PEEK) == 0) { 2742 if (so->so_error) 2743 so->so_error = 0; 2744 else 2745 so->so_rerror = 0; 2746 } 2747 SOCKBUF_UNLOCK(&so->so_rcv); 2748 goto release; 2749 } 2750 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2751 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2752 if (m != NULL) 2753 goto dontblock; 2754 #ifdef KERN_TLS 2755 else if (so->so_rcv.sb_tlsdcc == 0 && 2756 so->so_rcv.sb_tlscc == 0) { 2757 #else 2758 else { 2759 #endif 2760 SOCKBUF_UNLOCK(&so->so_rcv); 2761 goto release; 2762 } 2763 } 2764 for (; m != NULL; m = m->m_next) 2765 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2766 m = so->so_rcv.sb_mb; 2767 goto dontblock; 2768 } 2769 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2770 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2771 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2772 SOCKBUF_UNLOCK(&so->so_rcv); 2773 error = ENOTCONN; 2774 goto release; 2775 } 2776 if (uio->uio_resid == 0 && !report_real_len) { 2777 SOCKBUF_UNLOCK(&so->so_rcv); 2778 goto release; 2779 } 2780 if ((so->so_state & SS_NBIO) || 2781 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2782 SOCKBUF_UNLOCK(&so->so_rcv); 2783 error = EWOULDBLOCK; 2784 goto release; 2785 } 2786 SBLASTRECORDCHK(&so->so_rcv); 2787 SBLASTMBUFCHK(&so->so_rcv); 2788 error = sbwait(so, SO_RCV); 2789 SOCKBUF_UNLOCK(&so->so_rcv); 2790 if (error) 2791 goto release; 2792 goto restart; 2793 } 2794 dontblock: 2795 /* 2796 * From this point onward, we maintain 'nextrecord' as a cache of the 2797 * pointer to the next record in the socket buffer. We must keep the 2798 * various socket buffer pointers and local stack versions of the 2799 * pointers in sync, pushing out modifications before dropping the 2800 * socket buffer mutex, and re-reading them when picking it up. 2801 * 2802 * Otherwise, we will race with the network stack appending new data 2803 * or records onto the socket buffer by using inconsistent/stale 2804 * versions of the field, possibly resulting in socket buffer 2805 * corruption. 2806 * 2807 * By holding the high-level sblock(), we prevent simultaneous 2808 * readers from pulling off the front of the socket buffer. 2809 */ 2810 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2811 if (uio->uio_td) 2812 uio->uio_td->td_ru.ru_msgrcv++; 2813 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2814 SBLASTRECORDCHK(&so->so_rcv); 2815 SBLASTMBUFCHK(&so->so_rcv); 2816 nextrecord = m->m_nextpkt; 2817 if (pr->pr_flags & PR_ADDR) { 2818 KASSERT(m->m_type == MT_SONAME, 2819 ("m->m_type == %d", m->m_type)); 2820 orig_resid = 0; 2821 if (psa != NULL) 2822 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2823 M_NOWAIT); 2824 if (flags & MSG_PEEK) { 2825 m = m->m_next; 2826 } else { 2827 sbfree(&so->so_rcv, m); 2828 so->so_rcv.sb_mb = m_free(m); 2829 m = so->so_rcv.sb_mb; 2830 sockbuf_pushsync(&so->so_rcv, nextrecord); 2831 } 2832 } 2833 2834 /* 2835 * Process one or more MT_CONTROL mbufs present before any data mbufs 2836 * in the first mbuf chain on the socket buffer. 
If MSG_PEEK, we 2837 * just copy the data; if !MSG_PEEK, we call into the protocol to 2838 * perform externalization (or freeing if controlp == NULL). 2839 */ 2840 if (m != NULL && m->m_type == MT_CONTROL) { 2841 struct mbuf *cm = NULL, *cmn; 2842 struct mbuf **cme = &cm; 2843 #ifdef KERN_TLS 2844 struct cmsghdr *cmsg; 2845 struct tls_get_record tgr; 2846 2847 /* 2848 * For MSG_TLSAPPDATA, check for an alert record. 2849 * If found, return ENXIO without removing 2850 * it from the receive queue. This allows a subsequent 2851 * call without MSG_TLSAPPDATA to receive it. 2852 * Note that, for TLS, there should only be a single 2853 * control mbuf with the TLS_GET_RECORD message in it. 2854 */ 2855 if (flags & MSG_TLSAPPDATA) { 2856 cmsg = mtod(m, struct cmsghdr *); 2857 if (cmsg->cmsg_type == TLS_GET_RECORD && 2858 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2859 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2860 if (__predict_false(tgr.tls_type == 2861 TLS_RLTYPE_ALERT)) { 2862 SOCKBUF_UNLOCK(&so->so_rcv); 2863 error = ENXIO; 2864 goto release; 2865 } 2866 } 2867 } 2868 #endif 2869 2870 do { 2871 if (flags & MSG_PEEK) { 2872 if (controlp != NULL) { 2873 *controlp = m_copym(m, 0, m->m_len, 2874 M_NOWAIT); 2875 controlp = &(*controlp)->m_next; 2876 } 2877 m = m->m_next; 2878 } else { 2879 sbfree(&so->so_rcv, m); 2880 so->so_rcv.sb_mb = m->m_next; 2881 m->m_next = NULL; 2882 *cme = m; 2883 cme = &(*cme)->m_next; 2884 m = so->so_rcv.sb_mb; 2885 } 2886 } while (m != NULL && m->m_type == MT_CONTROL); 2887 if ((flags & MSG_PEEK) == 0) 2888 sockbuf_pushsync(&so->so_rcv, nextrecord); 2889 while (cm != NULL) { 2890 cmn = cm->m_next; 2891 cm->m_next = NULL; 2892 if (pr->pr_domain->dom_externalize != NULL) { 2893 SOCKBUF_UNLOCK(&so->so_rcv); 2894 VNET_SO_ASSERT(so); 2895 error = (*pr->pr_domain->dom_externalize) 2896 (cm, controlp, flags); 2897 SOCKBUF_LOCK(&so->so_rcv); 2898 } else if (controlp != NULL) 2899 *controlp = cm; 2900 else 2901 m_freem(cm); 2902 if (controlp != NULL) { 2903 while (*controlp != NULL) 2904 controlp = &(*controlp)->m_next; 2905 } 2906 cm = cmn; 2907 } 2908 if (m != NULL) 2909 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 2910 else 2911 nextrecord = so->so_rcv.sb_mb; 2912 orig_resid = 0; 2913 } 2914 if (m != NULL) { 2915 if ((flags & MSG_PEEK) == 0) { 2916 KASSERT(m->m_nextpkt == nextrecord, 2917 ("soreceive: post-control, nextrecord !sync")); 2918 if (nextrecord == NULL) { 2919 KASSERT(so->so_rcv.sb_mb == m, 2920 ("soreceive: post-control, sb_mb!=m")); 2921 KASSERT(so->so_rcv.sb_lastrecord == m, 2922 ("soreceive: post-control, lastrecord!=m")); 2923 } 2924 } 2925 type = m->m_type; 2926 if (type == MT_OOBDATA) 2927 flags |= MSG_OOB; 2928 } else { 2929 if ((flags & MSG_PEEK) == 0) { 2930 KASSERT(so->so_rcv.sb_mb == nextrecord, 2931 ("soreceive: sb_mb != nextrecord")); 2932 if (so->so_rcv.sb_mb == NULL) { 2933 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2934 ("soreceive: sb_lastercord != NULL")); 2935 } 2936 } 2937 } 2938 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2939 SBLASTRECORDCHK(&so->so_rcv); 2940 SBLASTMBUFCHK(&so->so_rcv); 2941 2942 /* 2943 * Now continue to read any data mbufs off of the head of the socket 2944 * buffer until the read request is satisfied. Note that 'type' is 2945 * used to store the type of any mbuf reads that have happened so far 2946 * such that soreceive() can stop reading if the type changes, which 2947 * causes soreceive() to return only one of regular data and inline 2948 * out-of-band data in a single socket receive operation. 
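 *
 * For example, a receive that begins on MT_DATA mbufs and then meets an
 * MT_OOBDATA (or MT_CONTROL) mbuf stops there and returns a short count;
 * the pending out-of-band or control data is picked up by the next
 * receive call. (Which protocols actually queue MT_OOBDATA in the
 * receive buffer is protocol-dependent.)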
2949 */ 2950 moff = 0; 2951 offset = 0; 2952 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 2953 && error == 0) { 2954 /* 2955 * If the type of mbuf has changed since the last mbuf 2956 * examined ('type'), end the receive operation. 2957 */ 2958 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2959 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 2960 if (type != m->m_type) 2961 break; 2962 } else if (type == MT_OOBDATA) 2963 break; 2964 else 2965 KASSERT(m->m_type == MT_DATA, 2966 ("m->m_type == %d", m->m_type)); 2967 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 2968 len = uio->uio_resid; 2969 if (so->so_oobmark && len > so->so_oobmark - offset) 2970 len = so->so_oobmark - offset; 2971 if (len > m->m_len - moff) 2972 len = m->m_len - moff; 2973 /* 2974 * If mp is set, just pass back the mbufs. Otherwise copy 2975 * them out via the uio, then free. Sockbuf must be 2976 * consistent here (points to current mbuf, it points to next 2977 * record) when we drop priority; we must note any additions 2978 * to the sockbuf when we block interrupts again. 2979 */ 2980 if (mp == NULL) { 2981 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2982 SBLASTRECORDCHK(&so->so_rcv); 2983 SBLASTMBUFCHK(&so->so_rcv); 2984 SOCKBUF_UNLOCK(&so->so_rcv); 2985 if ((m->m_flags & M_EXTPG) != 0) 2986 error = m_unmapped_uiomove(m, moff, uio, 2987 (int)len); 2988 else 2989 error = uiomove(mtod(m, char *) + moff, 2990 (int)len, uio); 2991 SOCKBUF_LOCK(&so->so_rcv); 2992 if (error) { 2993 /* 2994 * The MT_SONAME mbuf has already been removed 2995 * from the record, so it is necessary to 2996 * remove the data mbufs, if any, to preserve 2997 * the invariant in the case of PR_ADDR that 2998 * requires MT_SONAME mbufs at the head of 2999 * each record. 3000 */ 3001 if (pr->pr_flags & PR_ATOMIC && 3002 ((flags & MSG_PEEK) == 0)) 3003 (void)sbdroprecord_locked(&so->so_rcv); 3004 SOCKBUF_UNLOCK(&so->so_rcv); 3005 goto release; 3006 } 3007 } else 3008 uio->uio_resid -= len; 3009 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3010 if (len == m->m_len - moff) { 3011 if (m->m_flags & M_EOR) 3012 flags |= MSG_EOR; 3013 if (flags & MSG_PEEK) { 3014 m = m->m_next; 3015 moff = 0; 3016 } else { 3017 nextrecord = m->m_nextpkt; 3018 sbfree(&so->so_rcv, m); 3019 if (mp != NULL) { 3020 m->m_nextpkt = NULL; 3021 *mp = m; 3022 mp = &m->m_next; 3023 so->so_rcv.sb_mb = m = m->m_next; 3024 *mp = NULL; 3025 } else { 3026 so->so_rcv.sb_mb = m_free(m); 3027 m = so->so_rcv.sb_mb; 3028 } 3029 sockbuf_pushsync(&so->so_rcv, nextrecord); 3030 SBLASTRECORDCHK(&so->so_rcv); 3031 SBLASTMBUFCHK(&so->so_rcv); 3032 } 3033 } else { 3034 if (flags & MSG_PEEK) 3035 moff += len; 3036 else { 3037 if (mp != NULL) { 3038 if (flags & MSG_DONTWAIT) { 3039 *mp = m_copym(m, 0, len, 3040 M_NOWAIT); 3041 if (*mp == NULL) { 3042 /* 3043 * m_copym() couldn't 3044 * allocate an mbuf. 3045 * Adjust uio_resid back 3046 * (it was adjusted 3047 * down by len bytes, 3048 * which we didn't end 3049 * up "copying" over). 
3050 */ 3051 uio->uio_resid += len; 3052 break; 3053 } 3054 } else { 3055 SOCKBUF_UNLOCK(&so->so_rcv); 3056 *mp = m_copym(m, 0, len, 3057 M_WAITOK); 3058 SOCKBUF_LOCK(&so->so_rcv); 3059 } 3060 } 3061 sbcut_locked(&so->so_rcv, len); 3062 } 3063 } 3064 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3065 if (so->so_oobmark) { 3066 if ((flags & MSG_PEEK) == 0) { 3067 so->so_oobmark -= len; 3068 if (so->so_oobmark == 0) { 3069 so->so_rcv.sb_state |= SBS_RCVATMARK; 3070 break; 3071 } 3072 } else { 3073 offset += len; 3074 if (offset == so->so_oobmark) 3075 break; 3076 } 3077 } 3078 if (flags & MSG_EOR) 3079 break; 3080 /* 3081 * If the MSG_WAITALL flag is set (for non-atomic socket), we 3082 * must not quit until "uio->uio_resid == 0" or an error 3083 * termination. If a signal/timeout occurs, return with a 3084 * short count but without error. Keep sockbuf locked 3085 * against other readers. 3086 */ 3087 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 3088 !sosendallatonce(so) && nextrecord == NULL) { 3089 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3090 if (so->so_error || so->so_rerror || 3091 so->so_rcv.sb_state & SBS_CANTRCVMORE) 3092 break; 3093 /* 3094 * Notify the protocol that some data has been 3095 * drained before blocking. 3096 */ 3097 if (pr->pr_flags & PR_WANTRCVD) { 3098 SOCKBUF_UNLOCK(&so->so_rcv); 3099 VNET_SO_ASSERT(so); 3100 pr->pr_rcvd(so, flags); 3101 SOCKBUF_LOCK(&so->so_rcv); 3102 if (__predict_false(so->so_rcv.sb_mb == NULL && 3103 (so->so_error || so->so_rerror || 3104 so->so_rcv.sb_state & SBS_CANTRCVMORE))) 3105 break; 3106 } 3107 SBLASTRECORDCHK(&so->so_rcv); 3108 SBLASTMBUFCHK(&so->so_rcv); 3109 /* 3110 * We could receive some data while was notifying 3111 * the protocol. Skip blocking in this case. 3112 */ 3113 if (so->so_rcv.sb_mb == NULL) { 3114 error = sbwait(so, SO_RCV); 3115 if (error) { 3116 SOCKBUF_UNLOCK(&so->so_rcv); 3117 goto release; 3118 } 3119 } 3120 m = so->so_rcv.sb_mb; 3121 if (m != NULL) 3122 nextrecord = m->m_nextpkt; 3123 } 3124 } 3125 3126 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3127 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 3128 if (report_real_len) 3129 uio->uio_resid -= m_length(m, NULL) - moff; 3130 flags |= MSG_TRUNC; 3131 if ((flags & MSG_PEEK) == 0) 3132 (void) sbdroprecord_locked(&so->so_rcv); 3133 } 3134 if ((flags & MSG_PEEK) == 0) { 3135 if (m == NULL) { 3136 /* 3137 * First part is an inline SB_EMPTY_FIXUP(). Second 3138 * part makes sure sb_lastrecord is up-to-date if 3139 * there is still data in the socket buffer. 3140 */ 3141 so->so_rcv.sb_mb = nextrecord; 3142 if (so->so_rcv.sb_mb == NULL) { 3143 so->so_rcv.sb_mbtail = NULL; 3144 so->so_rcv.sb_lastrecord = NULL; 3145 } else if (nextrecord->m_nextpkt == NULL) 3146 so->so_rcv.sb_lastrecord = nextrecord; 3147 } 3148 SBLASTRECORDCHK(&so->so_rcv); 3149 SBLASTMBUFCHK(&so->so_rcv); 3150 /* 3151 * If soreceive() is being done from the socket callback, 3152 * then don't need to generate ACK to peer to update window, 3153 * since ACK will be generated on return to TCP. 
3154 */ 3155 if (!(flags & MSG_SOCALLBCK) && 3156 (pr->pr_flags & PR_WANTRCVD)) { 3157 SOCKBUF_UNLOCK(&so->so_rcv); 3158 VNET_SO_ASSERT(so); 3159 pr->pr_rcvd(so, flags); 3160 SOCKBUF_LOCK(&so->so_rcv); 3161 } 3162 } 3163 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3164 if (orig_resid == uio->uio_resid && orig_resid && 3165 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 3166 SOCKBUF_UNLOCK(&so->so_rcv); 3167 goto restart; 3168 } 3169 SOCKBUF_UNLOCK(&so->so_rcv); 3170 3171 if (flagsp != NULL) 3172 *flagsp |= flags; 3173 release: 3174 return (error); 3175 } 3176 3177 int 3178 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 3179 struct mbuf **mp, struct mbuf **controlp, int *flagsp) 3180 { 3181 int error, flags; 3182 3183 if (psa != NULL) 3184 *psa = NULL; 3185 if (controlp != NULL) 3186 *controlp = NULL; 3187 if (flagsp != NULL) { 3188 flags = *flagsp; 3189 if ((flags & MSG_OOB) != 0) 3190 return (soreceive_rcvoob(so, uio, flags)); 3191 } else { 3192 flags = 0; 3193 } 3194 if (mp != NULL) 3195 *mp = NULL; 3196 3197 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 3198 if (error) 3199 return (error); 3200 error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp); 3201 SOCK_IO_RECV_UNLOCK(so); 3202 return (error); 3203 } 3204 3205 /* 3206 * Optimized version of soreceive() for stream (TCP) sockets. 3207 */ 3208 static int 3209 soreceive_stream_locked(struct socket *so, struct sockbuf *sb, 3210 struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, 3211 struct mbuf **controlp, int flags) 3212 { 3213 int len = 0, error = 0, oresid; 3214 struct mbuf *m, *n = NULL; 3215 3216 SOCK_IO_RECV_ASSERT_LOCKED(so); 3217 3218 /* Easy one, no space to copyout anything. */ 3219 if (uio->uio_resid == 0) 3220 return (EINVAL); 3221 oresid = uio->uio_resid; 3222 3223 SOCKBUF_LOCK(sb); 3224 /* We will never ever get anything unless we are or were connected. */ 3225 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 3226 error = ENOTCONN; 3227 goto out; 3228 } 3229 3230 restart: 3231 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3232 3233 /* Abort if socket has reported problems. */ 3234 if (so->so_error) { 3235 if (sbavail(sb) > 0) 3236 goto deliver; 3237 if (oresid > uio->uio_resid) 3238 goto out; 3239 error = so->so_error; 3240 if (!(flags & MSG_PEEK)) 3241 so->so_error = 0; 3242 goto out; 3243 } 3244 3245 /* Door is closed. Deliver what is left, if any. */ 3246 if (sb->sb_state & SBS_CANTRCVMORE) { 3247 if (sbavail(sb) > 0) 3248 goto deliver; 3249 else 3250 goto out; 3251 } 3252 3253 /* Socket buffer is empty and we shall not block. */ 3254 if (sbavail(sb) == 0 && 3255 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 3256 error = EAGAIN; 3257 goto out; 3258 } 3259 3260 /* Socket buffer got some data that we shall deliver now. */ 3261 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 3262 ((so->so_state & SS_NBIO) || 3263 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 3264 sbavail(sb) >= sb->sb_lowat || 3265 sbavail(sb) >= uio->uio_resid || 3266 sbavail(sb) >= sb->sb_hiwat) ) { 3267 goto deliver; 3268 } 3269 3270 /* On MSG_WAITALL we must wait until all data or error arrives. */ 3271 if ((flags & MSG_WAITALL) && 3272 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 3273 goto deliver; 3274 3275 /* 3276 * Wait and block until (more) data comes in. 3277 * NB: Drops the sockbuf lock during wait. 
3278 */ 3279 error = sbwait(so, SO_RCV); 3280 if (error) 3281 goto out; 3282 goto restart; 3283 3284 deliver: 3285 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3286 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 3287 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 3288 3289 /* Statistics. */ 3290 if (uio->uio_td) 3291 uio->uio_td->td_ru.ru_msgrcv++; 3292 3293 /* Fill uio until full or current end of socket buffer is reached. */ 3294 len = min(uio->uio_resid, sbavail(sb)); 3295 if (mp0 != NULL) { 3296 /* Dequeue as many mbufs as possible. */ 3297 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 3298 if (*mp0 == NULL) 3299 *mp0 = sb->sb_mb; 3300 else 3301 m_cat(*mp0, sb->sb_mb); 3302 for (m = sb->sb_mb; 3303 m != NULL && m->m_len <= len; 3304 m = m->m_next) { 3305 KASSERT(!(m->m_flags & M_NOTAVAIL), 3306 ("%s: m %p not available", __func__, m)); 3307 len -= m->m_len; 3308 uio->uio_resid -= m->m_len; 3309 sbfree(sb, m); 3310 n = m; 3311 } 3312 n->m_next = NULL; 3313 sb->sb_mb = m; 3314 sb->sb_lastrecord = sb->sb_mb; 3315 if (sb->sb_mb == NULL) 3316 SB_EMPTY_FIXUP(sb); 3317 } 3318 /* Copy the remainder. */ 3319 if (len > 0) { 3320 KASSERT(sb->sb_mb != NULL, 3321 ("%s: len > 0 && sb->sb_mb empty", __func__)); 3322 3323 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 3324 if (m == NULL) 3325 len = 0; /* Don't flush data from sockbuf. */ 3326 else 3327 uio->uio_resid -= len; 3328 if (*mp0 != NULL) 3329 m_cat(*mp0, m); 3330 else 3331 *mp0 = m; 3332 if (*mp0 == NULL) { 3333 error = ENOBUFS; 3334 goto out; 3335 } 3336 } 3337 } else { 3338 /* NB: Must unlock socket buffer as uiomove may sleep. */ 3339 SOCKBUF_UNLOCK(sb); 3340 error = m_mbuftouio(uio, sb->sb_mb, len); 3341 SOCKBUF_LOCK(sb); 3342 if (error) 3343 goto out; 3344 } 3345 SBLASTRECORDCHK(sb); 3346 SBLASTMBUFCHK(sb); 3347 3348 /* 3349 * Remove the delivered data from the socket buffer unless we 3350 * were only peeking. 3351 */ 3352 if (!(flags & MSG_PEEK)) { 3353 if (len > 0) 3354 sbdrop_locked(sb, len); 3355 3356 /* Notify protocol that we drained some data. */ 3357 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 3358 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 3359 !(flags & MSG_SOCALLBCK))) { 3360 SOCKBUF_UNLOCK(sb); 3361 VNET_SO_ASSERT(so); 3362 so->so_proto->pr_rcvd(so, flags); 3363 SOCKBUF_LOCK(sb); 3364 } 3365 } 3366 3367 /* 3368 * For MSG_WAITALL we may have to loop again and wait for 3369 * more data to come in. 3370 */ 3371 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 3372 goto restart; 3373 out: 3374 SBLASTRECORDCHK(sb); 3375 SBLASTMBUFCHK(sb); 3376 SOCKBUF_UNLOCK(sb); 3377 return (error); 3378 } 3379 3380 int 3381 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 3382 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3383 { 3384 struct sockbuf *sb; 3385 int error, flags; 3386 3387 sb = &so->so_rcv; 3388 3389 /* We only do stream sockets. */ 3390 if (so->so_type != SOCK_STREAM) 3391 return (EINVAL); 3392 if (psa != NULL) 3393 *psa = NULL; 3394 if (flagsp != NULL) 3395 flags = *flagsp & ~MSG_EOR; 3396 else 3397 flags = 0; 3398 if (controlp != NULL) 3399 *controlp = NULL; 3400 if (flags & MSG_OOB) 3401 return (soreceive_rcvoob(so, uio, flags)); 3402 if (mp0 != NULL) 3403 *mp0 = NULL; 3404 3405 #ifdef KERN_TLS 3406 /* 3407 * KTLS store TLS records as records with a control message to 3408 * describe the framing. 3409 * 3410 * We check once here before acquiring locks to optimize the 3411 * common case. 
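 *
 * The test is repeated under SOCK_IO_RECV_LOCK() below, since KTLS may
 * be enabled on the socket between this unlocked check and acquisition
 * of the I/O lock.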
3412 */ 3413 if (sb->sb_tls_info != NULL) 3414 return (soreceive_generic(so, psa, uio, mp0, controlp, 3415 flagsp)); 3416 #endif 3417 3418 /* 3419 * Prevent other threads from reading from the socket. This lock may be 3420 * dropped in order to sleep waiting for data to arrive. 3421 */ 3422 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 3423 if (error) 3424 return (error); 3425 #ifdef KERN_TLS 3426 if (__predict_false(sb->sb_tls_info != NULL)) { 3427 SOCK_IO_RECV_UNLOCK(so); 3428 return (soreceive_generic(so, psa, uio, mp0, controlp, 3429 flagsp)); 3430 } 3431 #endif 3432 error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags); 3433 SOCK_IO_RECV_UNLOCK(so); 3434 return (error); 3435 } 3436 3437 /* 3438 * Optimized version of soreceive() for simple datagram cases from userspace. 3439 * Unlike in the stream case, we're able to drop a datagram if copyout() 3440 * fails, and because we handle datagrams atomically, we don't need to use a 3441 * sleep lock to prevent I/O interlacing. 3442 */ 3443 int 3444 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 3445 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3446 { 3447 struct mbuf *m, *m2; 3448 int flags, error; 3449 ssize_t len; 3450 struct protosw *pr = so->so_proto; 3451 struct mbuf *nextrecord; 3452 3453 if (psa != NULL) 3454 *psa = NULL; 3455 if (controlp != NULL) 3456 *controlp = NULL; 3457 if (flagsp != NULL) 3458 flags = *flagsp &~ MSG_EOR; 3459 else 3460 flags = 0; 3461 3462 /* 3463 * For any complicated cases, fall back to the full 3464 * soreceive_generic(). 3465 */ 3466 if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) 3467 return (soreceive_generic(so, psa, uio, mp0, controlp, 3468 flagsp)); 3469 3470 /* 3471 * Enforce restrictions on use. 3472 */ 3473 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 3474 ("soreceive_dgram: wantrcvd")); 3475 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 3476 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 3477 ("soreceive_dgram: SBS_RCVATMARK")); 3478 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 3479 ("soreceive_dgram: P_CONNREQUIRED")); 3480 3481 /* 3482 * Loop blocking while waiting for a datagram. 
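 *
 * Userland view of the non-blocking exit taken below (illustrative only;
 * 'fd' and 'buf' are assumptions, not part of this file):
 *
 *	char buf[2048];
 *	ssize_t n = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
 *
 *	if (n == -1 && errno == EWOULDBLOCK)
 *		...			no datagram queued yet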
3483 */ 3484 SOCKBUF_LOCK(&so->so_rcv); 3485 while ((m = so->so_rcv.sb_mb) == NULL) { 3486 KASSERT(sbavail(&so->so_rcv) == 0, 3487 ("soreceive_dgram: sb_mb NULL but sbavail %u", 3488 sbavail(&so->so_rcv))); 3489 if (so->so_error) { 3490 error = so->so_error; 3491 so->so_error = 0; 3492 SOCKBUF_UNLOCK(&so->so_rcv); 3493 return (error); 3494 } 3495 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 3496 uio->uio_resid == 0) { 3497 SOCKBUF_UNLOCK(&so->so_rcv); 3498 return (0); 3499 } 3500 if ((so->so_state & SS_NBIO) || 3501 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 3502 SOCKBUF_UNLOCK(&so->so_rcv); 3503 return (EWOULDBLOCK); 3504 } 3505 SBLASTRECORDCHK(&so->so_rcv); 3506 SBLASTMBUFCHK(&so->so_rcv); 3507 error = sbwait(so, SO_RCV); 3508 if (error) { 3509 SOCKBUF_UNLOCK(&so->so_rcv); 3510 return (error); 3511 } 3512 } 3513 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3514 3515 if (uio->uio_td) 3516 uio->uio_td->td_ru.ru_msgrcv++; 3517 SBLASTRECORDCHK(&so->so_rcv); 3518 SBLASTMBUFCHK(&so->so_rcv); 3519 nextrecord = m->m_nextpkt; 3520 if (nextrecord == NULL) { 3521 KASSERT(so->so_rcv.sb_lastrecord == m, 3522 ("soreceive_dgram: lastrecord != m")); 3523 } 3524 3525 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 3526 ("soreceive_dgram: m_nextpkt != nextrecord")); 3527 3528 /* 3529 * Pull 'm' and its chain off the front of the packet queue. 3530 */ 3531 so->so_rcv.sb_mb = NULL; 3532 sockbuf_pushsync(&so->so_rcv, nextrecord); 3533 3534 /* 3535 * Walk 'm's chain and free that many bytes from the socket buffer. 3536 */ 3537 for (m2 = m; m2 != NULL; m2 = m2->m_next) 3538 sbfree(&so->so_rcv, m2); 3539 3540 /* 3541 * Do a few last checks before we let go of the lock. 3542 */ 3543 SBLASTRECORDCHK(&so->so_rcv); 3544 SBLASTMBUFCHK(&so->so_rcv); 3545 SOCKBUF_UNLOCK(&so->so_rcv); 3546 3547 if (pr->pr_flags & PR_ADDR) { 3548 KASSERT(m->m_type == MT_SONAME, 3549 ("m->m_type == %d", m->m_type)); 3550 if (psa != NULL) 3551 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 3552 M_WAITOK); 3553 m = m_free(m); 3554 } 3555 KASSERT(m, ("%s: no data or control after soname", __func__)); 3556 3557 /* 3558 * Packet to copyout() is now in 'm' and it is disconnected from the 3559 * queue. 3560 * 3561 * Process one or more MT_CONTROL mbufs present before any data mbufs 3562 * in the first mbuf chain on the socket buffer. We call into the 3563 * protocol to perform externalization (or freeing if controlp == 3564 * NULL). In some cases there can be only MT_CONTROL mbufs without 3565 * MT_DATA mbufs. 
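 *
 * Userland counterpart (an illustrative sketch; the cbuf sizing for a
 * struct in_addr is just an example): control mbufs externalized here
 * surface as ancillary data to recvmsg(2):
 *
 *	char data[1500], cbuf[CMSG_SPACE(sizeof(struct in_addr))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	if (recvmsg(fd, &msg, 0) >= 0)
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
 *		    cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			...		inspect cmsg_level / cmsg_type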
3566 */ 3567 if (m->m_type == MT_CONTROL) { 3568 struct mbuf *cm = NULL, *cmn; 3569 struct mbuf **cme = &cm; 3570 3571 do { 3572 m2 = m->m_next; 3573 m->m_next = NULL; 3574 *cme = m; 3575 cme = &(*cme)->m_next; 3576 m = m2; 3577 } while (m != NULL && m->m_type == MT_CONTROL); 3578 while (cm != NULL) { 3579 cmn = cm->m_next; 3580 cm->m_next = NULL; 3581 if (pr->pr_domain->dom_externalize != NULL) { 3582 error = (*pr->pr_domain->dom_externalize) 3583 (cm, controlp, flags); 3584 } else if (controlp != NULL) 3585 *controlp = cm; 3586 else 3587 m_freem(cm); 3588 if (controlp != NULL) { 3589 while (*controlp != NULL) 3590 controlp = &(*controlp)->m_next; 3591 } 3592 cm = cmn; 3593 } 3594 } 3595 KASSERT(m == NULL || m->m_type == MT_DATA, 3596 ("soreceive_dgram: !data")); 3597 while (m != NULL && uio->uio_resid > 0) { 3598 len = uio->uio_resid; 3599 if (len > m->m_len) 3600 len = m->m_len; 3601 error = uiomove(mtod(m, char *), (int)len, uio); 3602 if (error) { 3603 m_freem(m); 3604 return (error); 3605 } 3606 if (len == m->m_len) 3607 m = m_free(m); 3608 else { 3609 m->m_data += len; 3610 m->m_len -= len; 3611 } 3612 } 3613 if (m != NULL) { 3614 flags |= MSG_TRUNC; 3615 m_freem(m); 3616 } 3617 if (flagsp != NULL) 3618 *flagsp |= flags; 3619 return (0); 3620 } 3621 3622 int 3623 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 3624 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3625 { 3626 int error; 3627 3628 CURVNET_SET(so->so_vnet); 3629 error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); 3630 CURVNET_RESTORE(); 3631 return (error); 3632 } 3633 3634 int 3635 soshutdown(struct socket *so, enum shutdown_how how) 3636 { 3637 int error; 3638 3639 CURVNET_SET(so->so_vnet); 3640 error = so->so_proto->pr_shutdown(so, how); 3641 CURVNET_RESTORE(); 3642 3643 return (error); 3644 } 3645 3646 /* 3647 * Used by several pr_shutdown implementations that use generic socket buffers. 3648 */ 3649 void 3650 sorflush(struct socket *so) 3651 { 3652 int error; 3653 3654 VNET_SO_ASSERT(so); 3655 3656 /* 3657 * Dislodge threads currently blocked in receive and wait to acquire 3658 * a lock against other simultaneous readers before clearing the 3659 * socket buffer. Don't let our acquire be interrupted by a signal 3660 * despite any existing socket disposition on interruptable waiting. 3661 * 3662 * The SOCK_IO_RECV_LOCK() is important here as there some pr_soreceive 3663 * methods that read the top of the socket buffer without acquisition 3664 * of the socket buffer mutex, assuming that top of the buffer 3665 * exclusively belongs to the read(2) syscall. This is handy when 3666 * performing MSG_PEEK. 3667 */ 3668 socantrcvmore(so); 3669 3670 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); 3671 if (error != 0) { 3672 KASSERT(SOLISTENING(so), 3673 ("%s: soiolock(%p) failed", __func__, so)); 3674 return; 3675 } 3676 3677 sbrelease(so, SO_RCV); 3678 SOCK_IO_RECV_UNLOCK(so); 3679 3680 } 3681 3682 #ifdef SOCKET_HHOOK 3683 /* 3684 * Wrapper for Socket established helper hook. 3685 * Parameters: socket, context of the hook point, hook id. 
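 *
 * A nonzero hhook_data.status set by a hook is returned to the caller,
 * which treats it as an errno-style value (see the HHOOK_SOCKET_OPT
 * fallbacks in sosetopt() and sogetopt() below).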
3686 */ 3687 static inline int 3688 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 3689 { 3690 struct socket_hhook_data hhook_data = { 3691 .so = so, 3692 .hctx = hctx, 3693 .m = NULL, 3694 .status = 0 3695 }; 3696 3697 CURVNET_SET(so->so_vnet); 3698 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 3699 CURVNET_RESTORE(); 3700 3701 /* Ugly but needed, since hhooks return void for now */ 3702 return (hhook_data.status); 3703 } 3704 #endif 3705 3706 /* 3707 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 3708 * additional variant to handle the case where the option value needs to be 3709 * some kind of integer, but not a specific size. In addition to their use 3710 * here, these functions are also called by the protocol-level pr_ctloutput() 3711 * routines. 3712 */ 3713 int 3714 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 3715 { 3716 size_t valsize; 3717 3718 /* 3719 * If the user gives us more than we wanted, we ignore it, but if we 3720 * don't get the minimum length the caller wants, we return EINVAL. 3721 * On success, sopt->sopt_valsize is set to however much we actually 3722 * retrieved. 3723 */ 3724 if ((valsize = sopt->sopt_valsize) < minlen) 3725 return EINVAL; 3726 if (valsize > len) 3727 sopt->sopt_valsize = valsize = len; 3728 3729 if (sopt->sopt_td != NULL) 3730 return (copyin(sopt->sopt_val, buf, valsize)); 3731 3732 bcopy(sopt->sopt_val, buf, valsize); 3733 return (0); 3734 } 3735 3736 /* 3737 * Kernel version of setsockopt(2). 3738 * 3739 * XXX: optlen is size_t, not socklen_t 3740 */ 3741 int 3742 so_setsockopt(struct socket *so, int level, int optname, void *optval, 3743 size_t optlen) 3744 { 3745 struct sockopt sopt; 3746 3747 sopt.sopt_level = level; 3748 sopt.sopt_name = optname; 3749 sopt.sopt_dir = SOPT_SET; 3750 sopt.sopt_val = optval; 3751 sopt.sopt_valsize = optlen; 3752 sopt.sopt_td = NULL; 3753 return (sosetopt(so, &sopt)); 3754 } 3755 3756 int 3757 sosetopt(struct socket *so, struct sockopt *sopt) 3758 { 3759 int error, optval; 3760 struct linger l; 3761 struct timeval tv; 3762 sbintime_t val, *valp; 3763 uint32_t val32; 3764 #ifdef MAC 3765 struct mac extmac; 3766 #endif 3767 3768 CURVNET_SET(so->so_vnet); 3769 error = 0; 3770 if (sopt->sopt_level != SOL_SOCKET) { 3771 if (so->so_proto->pr_ctloutput != NULL) 3772 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3773 else 3774 error = ENOPROTOOPT; 3775 } else { 3776 switch (sopt->sopt_name) { 3777 case SO_ACCEPTFILTER: 3778 error = accept_filt_setopt(so, sopt); 3779 if (error) 3780 goto bad; 3781 break; 3782 3783 case SO_LINGER: 3784 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 3785 if (error) 3786 goto bad; 3787 if (l.l_linger < 0 || 3788 l.l_linger > USHRT_MAX || 3789 l.l_linger > (INT_MAX / hz)) { 3790 error = EDOM; 3791 goto bad; 3792 } 3793 SOCK_LOCK(so); 3794 so->so_linger = l.l_linger; 3795 if (l.l_onoff) 3796 so->so_options |= SO_LINGER; 3797 else 3798 so->so_options &= ~SO_LINGER; 3799 SOCK_UNLOCK(so); 3800 break; 3801 3802 case SO_DEBUG: 3803 case SO_KEEPALIVE: 3804 case SO_DONTROUTE: 3805 case SO_USELOOPBACK: 3806 case SO_BROADCAST: 3807 case SO_REUSEADDR: 3808 case SO_REUSEPORT: 3809 case SO_REUSEPORT_LB: 3810 case SO_OOBINLINE: 3811 case SO_TIMESTAMP: 3812 case SO_BINTIME: 3813 case SO_NOSIGPIPE: 3814 case SO_NO_DDP: 3815 case SO_NO_OFFLOAD: 3816 case SO_RERROR: 3817 error = sooptcopyin(sopt, &optval, sizeof optval, 3818 sizeof optval); 3819 if (error) 3820 goto bad; 3821 SOCK_LOCK(so); 3822 if (optval) 3823 
so->so_options |= sopt->sopt_name; 3824 else 3825 so->so_options &= ~sopt->sopt_name; 3826 SOCK_UNLOCK(so); 3827 break; 3828 3829 case SO_SETFIB: 3830 error = sooptcopyin(sopt, &optval, sizeof optval, 3831 sizeof optval); 3832 if (error) 3833 goto bad; 3834 3835 if (optval < 0 || optval >= rt_numfibs) { 3836 error = EINVAL; 3837 goto bad; 3838 } 3839 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 3840 (so->so_proto->pr_domain->dom_family == PF_INET6) || 3841 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 3842 so->so_fibnum = optval; 3843 else 3844 so->so_fibnum = 0; 3845 break; 3846 3847 case SO_USER_COOKIE: 3848 error = sooptcopyin(sopt, &val32, sizeof val32, 3849 sizeof val32); 3850 if (error) 3851 goto bad; 3852 so->so_user_cookie = val32; 3853 break; 3854 3855 case SO_SNDBUF: 3856 case SO_RCVBUF: 3857 case SO_SNDLOWAT: 3858 case SO_RCVLOWAT: 3859 error = so->so_proto->pr_setsbopt(so, sopt); 3860 if (error) 3861 goto bad; 3862 break; 3863 3864 case SO_SNDTIMEO: 3865 case SO_RCVTIMEO: 3866 #ifdef COMPAT_FREEBSD32 3867 if (SV_CURPROC_FLAG(SV_ILP32)) { 3868 struct timeval32 tv32; 3869 3870 error = sooptcopyin(sopt, &tv32, sizeof tv32, 3871 sizeof tv32); 3872 CP(tv32, tv, tv_sec); 3873 CP(tv32, tv, tv_usec); 3874 } else 3875 #endif 3876 error = sooptcopyin(sopt, &tv, sizeof tv, 3877 sizeof tv); 3878 if (error) 3879 goto bad; 3880 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 3881 tv.tv_usec >= 1000000) { 3882 error = EDOM; 3883 goto bad; 3884 } 3885 if (tv.tv_sec > INT32_MAX) 3886 val = SBT_MAX; 3887 else 3888 val = tvtosbt(tv); 3889 SOCK_LOCK(so); 3890 valp = sopt->sopt_name == SO_SNDTIMEO ? 3891 (SOLISTENING(so) ? &so->sol_sbsnd_timeo : 3892 &so->so_snd.sb_timeo) : 3893 (SOLISTENING(so) ? &so->sol_sbrcv_timeo : 3894 &so->so_rcv.sb_timeo); 3895 *valp = val; 3896 SOCK_UNLOCK(so); 3897 break; 3898 3899 case SO_LABEL: 3900 #ifdef MAC 3901 error = sooptcopyin(sopt, &extmac, sizeof extmac, 3902 sizeof extmac); 3903 if (error) 3904 goto bad; 3905 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 3906 so, &extmac); 3907 #else 3908 error = EOPNOTSUPP; 3909 #endif 3910 break; 3911 3912 case SO_TS_CLOCK: 3913 error = sooptcopyin(sopt, &optval, sizeof optval, 3914 sizeof optval); 3915 if (error) 3916 goto bad; 3917 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 3918 error = EINVAL; 3919 goto bad; 3920 } 3921 so->so_ts_clock = optval; 3922 break; 3923 3924 case SO_MAX_PACING_RATE: 3925 error = sooptcopyin(sopt, &val32, sizeof(val32), 3926 sizeof(val32)); 3927 if (error) 3928 goto bad; 3929 so->so_max_pacing_rate = val32; 3930 break; 3931 3932 case SO_SPLICE: { 3933 struct splice splice; 3934 3935 #ifdef COMPAT_FREEBSD32 3936 if (SV_CURPROC_FLAG(SV_ILP32)) { 3937 struct splice32 splice32; 3938 3939 error = sooptcopyin(sopt, &splice32, 3940 sizeof(splice32), sizeof(splice32)); 3941 if (error == 0) { 3942 splice.sp_fd = splice32.sp_fd; 3943 splice.sp_max = splice32.sp_max; 3944 CP(splice32.sp_idle, splice.sp_idle, 3945 tv_sec); 3946 CP(splice32.sp_idle, splice.sp_idle, 3947 tv_usec); 3948 } 3949 } else 3950 #endif 3951 { 3952 error = sooptcopyin(sopt, &splice, 3953 sizeof(splice), sizeof(splice)); 3954 } 3955 if (error) 3956 goto bad; 3957 ktrsplice(&splice); 3958 3959 error = splice_init(); 3960 if (error != 0) 3961 goto bad; 3962 3963 if (splice.sp_fd >= 0) { 3964 struct file *fp; 3965 struct socket *so2; 3966 3967 if (!cap_rights_contains(sopt->sopt_rights, 3968 &cap_recv_rights)) { 3969 error = ENOTCAPABLE; 3970 goto bad; 3971 } 3972 error = getsock(sopt->sopt_td, splice.sp_fd, 3973 
&cap_send_rights, &fp); 3974 if (error != 0) 3975 goto bad; 3976 so2 = fp->f_data; 3977 3978 error = so_splice(so, so2, &splice); 3979 fdrop(fp, sopt->sopt_td); 3980 } else { 3981 error = so_unsplice(so, false); 3982 } 3983 break; 3984 } 3985 default: 3986 #ifdef SOCKET_HHOOK 3987 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3988 error = hhook_run_socket(so, sopt, 3989 HHOOK_SOCKET_OPT); 3990 else 3991 #endif 3992 error = ENOPROTOOPT; 3993 break; 3994 } 3995 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 3996 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 3997 } 3998 bad: 3999 CURVNET_RESTORE(); 4000 return (error); 4001 } 4002 4003 /* 4004 * Helper routine for getsockopt. 4005 */ 4006 int 4007 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 4008 { 4009 int error; 4010 size_t valsize; 4011 4012 error = 0; 4013 4014 /* 4015 * Documented get behavior is that we always return a value, possibly 4016 * truncated to fit in the user's buffer. Traditional behavior is 4017 * that we always tell the user precisely how much we copied, rather 4018 * than something useful like the total amount we had available for 4019 * her. Note that this interface is not idempotent; the entire 4020 * answer must be generated ahead of time. 4021 */ 4022 valsize = min(len, sopt->sopt_valsize); 4023 sopt->sopt_valsize = valsize; 4024 if (sopt->sopt_val != NULL) { 4025 if (sopt->sopt_td != NULL) 4026 error = copyout(buf, sopt->sopt_val, valsize); 4027 else 4028 bcopy(buf, sopt->sopt_val, valsize); 4029 } 4030 return (error); 4031 } 4032 4033 int 4034 sogetopt(struct socket *so, struct sockopt *sopt) 4035 { 4036 int error, optval; 4037 struct linger l; 4038 struct timeval tv; 4039 #ifdef MAC 4040 struct mac extmac; 4041 #endif 4042 4043 CURVNET_SET(so->so_vnet); 4044 error = 0; 4045 if (sopt->sopt_level != SOL_SOCKET) { 4046 if (so->so_proto->pr_ctloutput != NULL) 4047 error = (*so->so_proto->pr_ctloutput)(so, sopt); 4048 else 4049 error = ENOPROTOOPT; 4050 CURVNET_RESTORE(); 4051 return (error); 4052 } else { 4053 switch (sopt->sopt_name) { 4054 case SO_ACCEPTFILTER: 4055 error = accept_filt_getopt(so, sopt); 4056 break; 4057 4058 case SO_LINGER: 4059 SOCK_LOCK(so); 4060 l.l_onoff = so->so_options & SO_LINGER; 4061 l.l_linger = so->so_linger; 4062 SOCK_UNLOCK(so); 4063 error = sooptcopyout(sopt, &l, sizeof l); 4064 break; 4065 4066 case SO_USELOOPBACK: 4067 case SO_DONTROUTE: 4068 case SO_DEBUG: 4069 case SO_KEEPALIVE: 4070 case SO_REUSEADDR: 4071 case SO_REUSEPORT: 4072 case SO_REUSEPORT_LB: 4073 case SO_BROADCAST: 4074 case SO_OOBINLINE: 4075 case SO_ACCEPTCONN: 4076 case SO_TIMESTAMP: 4077 case SO_BINTIME: 4078 case SO_NOSIGPIPE: 4079 case SO_NO_DDP: 4080 case SO_NO_OFFLOAD: 4081 case SO_RERROR: 4082 optval = so->so_options & sopt->sopt_name; 4083 integer: 4084 error = sooptcopyout(sopt, &optval, sizeof optval); 4085 break; 4086 4087 case SO_DOMAIN: 4088 optval = so->so_proto->pr_domain->dom_family; 4089 goto integer; 4090 4091 case SO_TYPE: 4092 optval = so->so_type; 4093 goto integer; 4094 4095 case SO_PROTOCOL: 4096 optval = so->so_proto->pr_protocol; 4097 goto integer; 4098 4099 case SO_ERROR: 4100 SOCK_LOCK(so); 4101 if (so->so_error) { 4102 optval = so->so_error; 4103 so->so_error = 0; 4104 } else { 4105 optval = so->so_rerror; 4106 so->so_rerror = 0; 4107 } 4108 SOCK_UNLOCK(so); 4109 goto integer; 4110 4111 case SO_SNDBUF: 4112 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 4113 so->so_snd.sb_hiwat; 4114 goto integer; 4115 4116 case SO_RCVBUF: 4117 optval = SOLISTENING(so) ? 
so->sol_sbrcv_hiwat : 4118 so->so_rcv.sb_hiwat; 4119 goto integer; 4120 4121 case SO_SNDLOWAT: 4122 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 4123 so->so_snd.sb_lowat; 4124 goto integer; 4125 4126 case SO_RCVLOWAT: 4127 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 4128 so->so_rcv.sb_lowat; 4129 goto integer; 4130 4131 case SO_SNDTIMEO: 4132 case SO_RCVTIMEO: 4133 SOCK_LOCK(so); 4134 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 4135 (SOLISTENING(so) ? so->sol_sbsnd_timeo : 4136 so->so_snd.sb_timeo) : 4137 (SOLISTENING(so) ? so->sol_sbrcv_timeo : 4138 so->so_rcv.sb_timeo)); 4139 SOCK_UNLOCK(so); 4140 #ifdef COMPAT_FREEBSD32 4141 if (SV_CURPROC_FLAG(SV_ILP32)) { 4142 struct timeval32 tv32; 4143 4144 CP(tv, tv32, tv_sec); 4145 CP(tv, tv32, tv_usec); 4146 error = sooptcopyout(sopt, &tv32, sizeof tv32); 4147 } else 4148 #endif 4149 error = sooptcopyout(sopt, &tv, sizeof tv); 4150 break; 4151 4152 case SO_LABEL: 4153 #ifdef MAC 4154 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 4155 sizeof(extmac)); 4156 if (error) 4157 goto bad; 4158 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 4159 so, &extmac); 4160 if (error) 4161 goto bad; 4162 /* Don't copy out extmac, it is unchanged. */ 4163 #else 4164 error = EOPNOTSUPP; 4165 #endif 4166 break; 4167 4168 case SO_PEERLABEL: 4169 #ifdef MAC 4170 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 4171 sizeof(extmac)); 4172 if (error) 4173 goto bad; 4174 error = mac_getsockopt_peerlabel( 4175 sopt->sopt_td->td_ucred, so, &extmac); 4176 if (error) 4177 goto bad; 4178 /* Don't copy out extmac, it is unchanged. */ 4179 #else 4180 error = EOPNOTSUPP; 4181 #endif 4182 break; 4183 4184 case SO_LISTENQLIMIT: 4185 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 4186 goto integer; 4187 4188 case SO_LISTENQLEN: 4189 optval = SOLISTENING(so) ? so->sol_qlen : 0; 4190 goto integer; 4191 4192 case SO_LISTENINCQLEN: 4193 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 4194 goto integer; 4195 4196 case SO_TS_CLOCK: 4197 optval = so->so_ts_clock; 4198 goto integer; 4199 4200 case SO_MAX_PACING_RATE: 4201 optval = so->so_max_pacing_rate; 4202 goto integer; 4203 4204 case SO_SPLICE: { 4205 off_t n; 4206 4207 /* 4208 * Acquire the I/O lock to serialize with 4209 * so_splice_xfer(). This is not required for 4210 * correctness, but makes testing simpler: once a byte 4211 * has been transmitted to the sink and observed (e.g., 4212 * by reading from the socket to which the sink is 4213 * connected), a subsequent getsockopt(SO_SPLICE) will 4214 * return an up-to-date value. 4215 */ 4216 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT); 4217 if (error != 0) 4218 goto bad; 4219 SOCK_LOCK(so); 4220 if (SOLISTENING(so)) { 4221 n = 0; 4222 } else { 4223 n = so->so_splice_sent; 4224 } 4225 SOCK_UNLOCK(so); 4226 SOCK_IO_RECV_UNLOCK(so); 4227 error = sooptcopyout(sopt, &n, sizeof(n)); 4228 break; 4229 } 4230 4231 default: 4232 #ifdef SOCKET_HHOOK 4233 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 4234 error = hhook_run_socket(so, sopt, 4235 HHOOK_SOCKET_OPT); 4236 else 4237 #endif 4238 error = ENOPROTOOPT; 4239 break; 4240 } 4241 } 4242 bad: 4243 CURVNET_RESTORE(); 4244 return (error); 4245 } 4246 4247 int 4248 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 4249 { 4250 struct mbuf *m, *m_prev; 4251 int sopt_size = sopt->sopt_valsize; 4252 4253 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 4254 if (m == NULL) 4255 return ENOBUFS; 4256 if (sopt_size > MLEN) { 4257 MCLGET(m, sopt->sopt_td ? 
M_WAITOK : M_NOWAIT); 4258 if ((m->m_flags & M_EXT) == 0) { 4259 m_free(m); 4260 return ENOBUFS; 4261 } 4262 m->m_len = min(MCLBYTES, sopt_size); 4263 } else { 4264 m->m_len = min(MLEN, sopt_size); 4265 } 4266 sopt_size -= m->m_len; 4267 *mp = m; 4268 m_prev = m; 4269 4270 while (sopt_size) { 4271 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 4272 if (m == NULL) { 4273 m_freem(*mp); 4274 return ENOBUFS; 4275 } 4276 if (sopt_size > MLEN) { 4277 MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : 4278 M_NOWAIT); 4279 if ((m->m_flags & M_EXT) == 0) { 4280 m_freem(m); 4281 m_freem(*mp); 4282 return ENOBUFS; 4283 } 4284 m->m_len = min(MCLBYTES, sopt_size); 4285 } else { 4286 m->m_len = min(MLEN, sopt_size); 4287 } 4288 sopt_size -= m->m_len; 4289 m_prev->m_next = m; 4290 m_prev = m; 4291 } 4292 return (0); 4293 } 4294 4295 int 4296 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 4297 { 4298 struct mbuf *m0 = m; 4299 4300 if (sopt->sopt_val == NULL) 4301 return (0); 4302 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 4303 if (sopt->sopt_td != NULL) { 4304 int error; 4305 4306 error = copyin(sopt->sopt_val, mtod(m, char *), 4307 m->m_len); 4308 if (error != 0) { 4309 m_freem(m0); 4310 return(error); 4311 } 4312 } else 4313 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 4314 sopt->sopt_valsize -= m->m_len; 4315 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 4316 m = m->m_next; 4317 } 4318 if (m != NULL) /* should have been allocated with enough space at ip6_sooptmcopyin() */ 4319 panic("ip6_sooptmcopyin"); 4320 return (0); 4321 } 4322 4323 int 4324 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 4325 { 4326 struct mbuf *m0 = m; 4327 size_t valsize = 0; 4328 4329 if (sopt->sopt_val == NULL) 4330 return (0); 4331 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 4332 if (sopt->sopt_td != NULL) { 4333 int error; 4334 4335 error = copyout(mtod(m, char *), sopt->sopt_val, 4336 m->m_len); 4337 if (error != 0) { 4338 m_freem(m0); 4339 return(error); 4340 } 4341 } else 4342 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 4343 sopt->sopt_valsize -= m->m_len; 4344 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 4345 valsize += m->m_len; 4346 m = m->m_next; 4347 } 4348 if (m != NULL) { 4349 /* the user-supplied option buffer should have been large enough */ 4350 m_freem(m0); 4351 return(EINVAL); 4352 } 4353 sopt->sopt_valsize = valsize; 4354 return (0); 4355 } 4356 4357 /* 4358 * sohasoutofband(): protocol notifies socket layer of the arrival of new 4359 * out-of-band data, which will then notify socket consumers. 4360 */ 4361 void 4362 sohasoutofband(struct socket *so) 4363 { 4364 4365 if (so->so_sigio != NULL) 4366 pgsigio(&so->so_sigio, SIGURG, 0); 4367 selwakeuppri(&so->so_rdsel, PSOCK); 4368 } 4369 4370 int 4371 sopoll(struct socket *so, int events, struct ucred *active_cred, 4372 struct thread *td) 4373 { 4374 4375 /* 4376 * We do not need to set or assert curvnet as long as everyone uses 4377 * sopoll_generic().
4378 */ 4379 return (so->so_proto->pr_sopoll(so, events, active_cred, td)); 4380 } 4381 4382 int 4383 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 4384 struct thread *td) 4385 { 4386 int revents; 4387 4388 SOCK_LOCK(so); 4389 if (SOLISTENING(so)) { 4390 if (!(events & (POLLIN | POLLRDNORM))) 4391 revents = 0; 4392 else if (!TAILQ_EMPTY(&so->sol_comp)) 4393 revents = events & (POLLIN | POLLRDNORM); 4394 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 4395 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 4396 else { 4397 selrecord(td, &so->so_rdsel); 4398 revents = 0; 4399 } 4400 } else { 4401 revents = 0; 4402 SOCK_SENDBUF_LOCK(so); 4403 SOCK_RECVBUF_LOCK(so); 4404 if (events & (POLLIN | POLLRDNORM)) 4405 if (soreadabledata(so) && !isspliced(so)) 4406 revents |= events & (POLLIN | POLLRDNORM); 4407 if (events & (POLLOUT | POLLWRNORM)) 4408 if (sowriteable(so) && !issplicedback(so)) 4409 revents |= events & (POLLOUT | POLLWRNORM); 4410 if (events & (POLLPRI | POLLRDBAND)) 4411 if (so->so_oobmark || 4412 (so->so_rcv.sb_state & SBS_RCVATMARK)) 4413 revents |= events & (POLLPRI | POLLRDBAND); 4414 if ((events & POLLINIGNEOF) == 0) { 4415 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 4416 revents |= events & (POLLIN | POLLRDNORM); 4417 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 4418 revents |= POLLHUP; 4419 } 4420 } 4421 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4422 revents |= events & POLLRDHUP; 4423 if (revents == 0) { 4424 if (events & 4425 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { 4426 selrecord(td, &so->so_rdsel); 4427 so->so_rcv.sb_flags |= SB_SEL; 4428 } 4429 if (events & (POLLOUT | POLLWRNORM)) { 4430 selrecord(td, &so->so_wrsel); 4431 so->so_snd.sb_flags |= SB_SEL; 4432 } 4433 } 4434 SOCK_RECVBUF_UNLOCK(so); 4435 SOCK_SENDBUF_UNLOCK(so); 4436 } 4437 SOCK_UNLOCK(so); 4438 return (revents); 4439 } 4440 4441 int 4442 soo_kqfilter(struct file *fp, struct knote *kn) 4443 { 4444 struct socket *so = kn->kn_fp->f_data; 4445 struct sockbuf *sb; 4446 sb_which which; 4447 struct knlist *knl; 4448 4449 switch (kn->kn_filter) { 4450 case EVFILT_READ: 4451 kn->kn_fop = &soread_filtops; 4452 knl = &so->so_rdsel.si_note; 4453 sb = &so->so_rcv; 4454 which = SO_RCV; 4455 break; 4456 case EVFILT_WRITE: 4457 kn->kn_fop = &sowrite_filtops; 4458 knl = &so->so_wrsel.si_note; 4459 sb = &so->so_snd; 4460 which = SO_SND; 4461 break; 4462 case EVFILT_EMPTY: 4463 kn->kn_fop = &soempty_filtops; 4464 knl = &so->so_wrsel.si_note; 4465 sb = &so->so_snd; 4466 which = SO_SND; 4467 break; 4468 default: 4469 return (EINVAL); 4470 } 4471 4472 SOCK_LOCK(so); 4473 if (SOLISTENING(so)) { 4474 knlist_add(knl, kn, 1); 4475 } else { 4476 SOCK_BUF_LOCK(so, which); 4477 knlist_add(knl, kn, 1); 4478 sb->sb_flags |= SB_KNOTE; 4479 SOCK_BUF_UNLOCK(so, which); 4480 } 4481 SOCK_UNLOCK(so); 4482 return (0); 4483 } 4484 4485 static void 4486 filt_sordetach(struct knote *kn) 4487 { 4488 struct socket *so = kn->kn_fp->f_data; 4489 4490 so_rdknl_lock(so); 4491 knlist_remove(&so->so_rdsel.si_note, kn, 1); 4492 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 4493 so->so_rcv.sb_flags &= ~SB_KNOTE; 4494 so_rdknl_unlock(so); 4495 } 4496 4497 /*ARGSUSED*/ 4498 static int 4499 filt_soread(struct knote *kn, long hint) 4500 { 4501 struct socket *so; 4502 4503 so = kn->kn_fp->f_data; 4504 4505 if (SOLISTENING(so)) { 4506 SOCK_LOCK_ASSERT(so); 4507 kn->kn_data = so->sol_qlen; 4508 if (so->so_error) { 4509 kn->kn_flags |= EV_EOF; 4510 kn->kn_fflags = so->so_error; 4511 return (1); 
4512 } 4513 return (!TAILQ_EMPTY(&so->sol_comp)); 4514 } 4515 4516 if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) 4517 return (0); 4518 4519 SOCK_RECVBUF_LOCK_ASSERT(so); 4520 4521 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 4522 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 4523 kn->kn_flags |= EV_EOF; 4524 kn->kn_fflags = so->so_error; 4525 return (1); 4526 } else if (so->so_error || so->so_rerror) 4527 return (1); 4528 4529 if (kn->kn_sfflags & NOTE_LOWAT) { 4530 if (kn->kn_data >= kn->kn_sdata) 4531 return (1); 4532 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 4533 return (1); 4534 4535 #ifdef SOCKET_HHOOK 4536 /* This hook returning non-zero indicates an event, not error */ 4537 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 4538 #else 4539 return (0); 4540 #endif 4541 } 4542 4543 static void 4544 filt_sowdetach(struct knote *kn) 4545 { 4546 struct socket *so = kn->kn_fp->f_data; 4547 4548 so_wrknl_lock(so); 4549 knlist_remove(&so->so_wrsel.si_note, kn, 1); 4550 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 4551 so->so_snd.sb_flags &= ~SB_KNOTE; 4552 so_wrknl_unlock(so); 4553 } 4554 4555 /*ARGSUSED*/ 4556 static int 4557 filt_sowrite(struct knote *kn, long hint) 4558 { 4559 struct socket *so; 4560 4561 so = kn->kn_fp->f_data; 4562 4563 if (SOLISTENING(so)) 4564 return (0); 4565 4566 SOCK_SENDBUF_LOCK_ASSERT(so); 4567 kn->kn_data = sbspace(&so->so_snd); 4568 4569 #ifdef SOCKET_HHOOK 4570 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 4571 #endif 4572 4573 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 4574 kn->kn_flags |= EV_EOF; 4575 kn->kn_fflags = so->so_error; 4576 return (1); 4577 } else if (so->so_error) /* temporary udp error */ 4578 return (1); 4579 else if (((so->so_state & SS_ISCONNECTED) == 0) && 4580 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 4581 return (0); 4582 else if (kn->kn_sfflags & NOTE_LOWAT) 4583 return (kn->kn_data >= kn->kn_sdata); 4584 else 4585 return (kn->kn_data >= so->so_snd.sb_lowat); 4586 } 4587 4588 static int 4589 filt_soempty(struct knote *kn, long hint) 4590 { 4591 struct socket *so; 4592 4593 so = kn->kn_fp->f_data; 4594 4595 if (SOLISTENING(so)) 4596 return (1); 4597 4598 SOCK_SENDBUF_LOCK_ASSERT(so); 4599 kn->kn_data = sbused(&so->so_snd); 4600 4601 if (kn->kn_data == 0) 4602 return (1); 4603 else 4604 return (0); 4605 } 4606 4607 int 4608 socheckuid(struct socket *so, uid_t uid) 4609 { 4610 4611 if (so == NULL) 4612 return (EPERM); 4613 if (so->so_cred->cr_uid != uid) 4614 return (EPERM); 4615 return (0); 4616 } 4617 4618 /* 4619 * These functions are used by protocols to notify the socket layer (and its 4620 * consumers) of state changes in the sockets driven by protocol-side events. 4621 */ 4622 4623 /* 4624 * Procedures to manipulate state flags of socket and do appropriate wakeups. 4625 * 4626 * Normal sequence from the active (originating) side is that 4627 * soisconnecting() is called during processing of connect() call, resulting 4628 * in an eventual call to soisconnected() if/when the connection is 4629 * established. When the connection is torn down soisdisconnecting() is 4630 * called during processing of disconnect() call, and soisdisconnected() is 4631 * called when the connection to the peer is totally severed. The semantics 4632 * of these routines are such that connectionless protocols can call 4633 * soisconnected() and soisdisconnected() only, bypassing the in-progress 4634 * calls when setting up a ``connection'' takes no time. 
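 *
 * Purely as an illustration (not drawn from any particular protocol), a
 * minimal connection-oriented protocol would drive these transitions
 * roughly as follows; pr_example_connect(), pr_example_disconnect() and
 * the input-path events named here are hypothetical and exist only for
 * this sketch:
 *
 *	pr_example_connect():			soisconnecting(so);
 *						... transmit a connection request ...
 *	input: handshake completed		soisconnected(so);
 *	pr_example_disconnect():		soisdisconnecting(so);
 *	input: peer acknowledged the close	soisdisconnected(so);
 *
 * A connectionless protocol would instead call soisconnected() directly
 * from its connect entry point and soisdisconnected() when the
 * association is dropped, as described above.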
4635 * 4636 * From the passive side, a socket is created with two queues of sockets: 4637 * so_incomp for connections in progress and so_comp for connections already 4638 * made and awaiting user acceptance. As a protocol is preparing incoming 4639 * connections, it creates a socket structure queued on so_incomp by calling 4640 * sonewconn(). When the connection is established, soisconnected() is 4641 * called, and transfers the socket structure to so_comp, making it available 4642 * to accept(). 4643 * 4644 * If a socket is closed with sockets on either so_incomp or so_comp, these 4645 * sockets are dropped. 4646 * 4647 * If higher-level protocols are implemented in the kernel, the wakeups done 4648 * here will sometimes cause software-interrupt process scheduling. 4649 */ 4650 void 4651 soisconnecting(struct socket *so) 4652 { 4653 4654 SOCK_LOCK(so); 4655 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 4656 so->so_state |= SS_ISCONNECTING; 4657 SOCK_UNLOCK(so); 4658 } 4659 4660 void 4661 soisconnected(struct socket *so) 4662 { 4663 bool last __diagused; 4664 4665 SOCK_LOCK(so); 4666 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); 4667 so->so_state |= SS_ISCONNECTED; 4668 4669 if (so->so_qstate == SQ_INCOMP) { 4670 struct socket *head = so->so_listen; 4671 int ret; 4672 4673 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 4674 /* 4675 * When promoting a socket from the incomplete queue to the 4676 * complete queue, we must take the locks in reverse order. We 4677 * first trylock, and if that fails we go the hard way: leave a 4678 * reference, take the locks in the proper order, and recheck 4679 * consistency afterwards. 4680 */ 4681 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 4682 soref(head); 4683 SOCK_UNLOCK(so); 4684 SOLISTEN_LOCK(head); 4685 SOCK_LOCK(so); 4686 if (__predict_false(head != so->so_listen)) { 4687 /* 4688 * The socket went off the listen queue, most 4689 * likely because we lost a race with close(2) 4690 * on the listening socket; it is about to be aborted with soabort().
4691 */ 4692 SOCK_UNLOCK(so); 4693 sorele_locked(head); 4694 return; 4695 } 4696 last = refcount_release(&head->so_count); 4697 KASSERT(!last, ("%s: released last reference for %p", 4698 __func__, head)); 4699 } 4700 again: 4701 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 4702 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 4703 head->sol_incqlen--; 4704 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 4705 head->sol_qlen++; 4706 so->so_qstate = SQ_COMP; 4707 SOCK_UNLOCK(so); 4708 solisten_wakeup(head); /* unlocks */ 4709 } else { 4710 SOCK_RECVBUF_LOCK(so); 4711 soupcall_set(so, SO_RCV, 4712 head->sol_accept_filter->accf_callback, 4713 head->sol_accept_filter_arg); 4714 so->so_options &= ~SO_ACCEPTFILTER; 4715 ret = head->sol_accept_filter->accf_callback(so, 4716 head->sol_accept_filter_arg, M_NOWAIT); 4717 if (ret == SU_ISCONNECTED) { 4718 soupcall_clear(so, SO_RCV); 4719 SOCK_RECVBUF_UNLOCK(so); 4720 goto again; 4721 } 4722 SOCK_RECVBUF_UNLOCK(so); 4723 SOCK_UNLOCK(so); 4724 SOLISTEN_UNLOCK(head); 4725 } 4726 return; 4727 } 4728 SOCK_UNLOCK(so); 4729 wakeup(&so->so_timeo); 4730 sorwakeup(so); 4731 sowwakeup(so); 4732 } 4733 4734 void 4735 soisdisconnecting(struct socket *so) 4736 { 4737 4738 SOCK_LOCK(so); 4739 so->so_state &= ~SS_ISCONNECTING; 4740 so->so_state |= SS_ISDISCONNECTING; 4741 4742 if (!SOLISTENING(so)) { 4743 SOCK_RECVBUF_LOCK(so); 4744 socantrcvmore_locked(so); 4745 SOCK_SENDBUF_LOCK(so); 4746 socantsendmore_locked(so); 4747 } 4748 SOCK_UNLOCK(so); 4749 wakeup(&so->so_timeo); 4750 } 4751 4752 void 4753 soisdisconnected(struct socket *so) 4754 { 4755 4756 SOCK_LOCK(so); 4757 4758 /* 4759 * There is at least one reader of so_state that does not 4760 * acquire socket lock, namely soreceive_generic(). Ensure 4761 * that it never sees all flags that track connection status 4762 * cleared, by ordering the update with a barrier semantic of 4763 * our release thread fence. 4764 */ 4765 so->so_state |= SS_ISDISCONNECTED; 4766 atomic_thread_fence_rel(); 4767 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 4768 4769 if (!SOLISTENING(so)) { 4770 SOCK_UNLOCK(so); 4771 SOCK_RECVBUF_LOCK(so); 4772 socantrcvmore_locked(so); 4773 SOCK_SENDBUF_LOCK(so); 4774 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 4775 socantsendmore_locked(so); 4776 } else 4777 SOCK_UNLOCK(so); 4778 wakeup(&so->so_timeo); 4779 } 4780 4781 int 4782 soiolock(struct socket *so, struct sx *sx, int flags) 4783 { 4784 int error; 4785 4786 KASSERT((flags & SBL_VALID) == flags, 4787 ("soiolock: invalid flags %#x", flags)); 4788 4789 if ((flags & SBL_WAIT) != 0) { 4790 if ((flags & SBL_NOINTR) != 0) { 4791 sx_xlock(sx); 4792 } else { 4793 error = sx_xlock_sig(sx); 4794 if (error != 0) 4795 return (error); 4796 } 4797 } else if (!sx_try_xlock(sx)) { 4798 return (EWOULDBLOCK); 4799 } 4800 4801 if (__predict_false(SOLISTENING(so))) { 4802 sx_xunlock(sx); 4803 return (ENOTCONN); 4804 } 4805 return (0); 4806 } 4807 4808 void 4809 soiounlock(struct sx *sx) 4810 { 4811 sx_xunlock(sx); 4812 } 4813 4814 /* 4815 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 4816 */ 4817 struct sockaddr * 4818 sodupsockaddr(const struct sockaddr *sa, int mflags) 4819 { 4820 struct sockaddr *sa2; 4821 4822 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 4823 if (sa2) 4824 bcopy(sa, sa2, sa->sa_len); 4825 return sa2; 4826 } 4827 4828 /* 4829 * Register per-socket destructor. 
4830 */ 4831 void 4832 sodtor_set(struct socket *so, so_dtor_t *func) 4833 { 4834 4835 SOCK_LOCK_ASSERT(so); 4836 so->so_dtor = func; 4837 } 4838 4839 /* 4840 * Register per-socket buffer upcalls. 4841 */ 4842 void 4843 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) 4844 { 4845 struct sockbuf *sb; 4846 4847 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4848 4849 switch (which) { 4850 case SO_RCV: 4851 sb = &so->so_rcv; 4852 break; 4853 case SO_SND: 4854 sb = &so->so_snd; 4855 break; 4856 } 4857 SOCK_BUF_LOCK_ASSERT(so, which); 4858 sb->sb_upcall = func; 4859 sb->sb_upcallarg = arg; 4860 sb->sb_flags |= SB_UPCALL; 4861 } 4862 4863 void 4864 soupcall_clear(struct socket *so, sb_which which) 4865 { 4866 struct sockbuf *sb; 4867 4868 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4869 4870 switch (which) { 4871 case SO_RCV: 4872 sb = &so->so_rcv; 4873 break; 4874 case SO_SND: 4875 sb = &so->so_snd; 4876 break; 4877 } 4878 SOCK_BUF_LOCK_ASSERT(so, which); 4879 KASSERT(sb->sb_upcall != NULL, 4880 ("%s: so %p no upcall to clear", __func__, so)); 4881 sb->sb_upcall = NULL; 4882 sb->sb_upcallarg = NULL; 4883 sb->sb_flags &= ~SB_UPCALL; 4884 } 4885 4886 void 4887 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) 4888 { 4889 4890 SOLISTEN_LOCK_ASSERT(so); 4891 so->sol_upcall = func; 4892 so->sol_upcallarg = arg; 4893 } 4894 4895 static void 4896 so_rdknl_lock(void *arg) 4897 { 4898 struct socket *so = arg; 4899 4900 retry: 4901 if (SOLISTENING(so)) { 4902 SOLISTEN_LOCK(so); 4903 } else { 4904 SOCK_RECVBUF_LOCK(so); 4905 if (__predict_false(SOLISTENING(so))) { 4906 SOCK_RECVBUF_UNLOCK(so); 4907 goto retry; 4908 } 4909 } 4910 } 4911 4912 static void 4913 so_rdknl_unlock(void *arg) 4914 { 4915 struct socket *so = arg; 4916 4917 if (SOLISTENING(so)) 4918 SOLISTEN_UNLOCK(so); 4919 else 4920 SOCK_RECVBUF_UNLOCK(so); 4921 } 4922 4923 static void 4924 so_rdknl_assert_lock(void *arg, int what) 4925 { 4926 struct socket *so = arg; 4927 4928 if (what == LA_LOCKED) { 4929 if (SOLISTENING(so)) 4930 SOLISTEN_LOCK_ASSERT(so); 4931 else 4932 SOCK_RECVBUF_LOCK_ASSERT(so); 4933 } else { 4934 if (SOLISTENING(so)) 4935 SOLISTEN_UNLOCK_ASSERT(so); 4936 else 4937 SOCK_RECVBUF_UNLOCK_ASSERT(so); 4938 } 4939 } 4940 4941 static void 4942 so_wrknl_lock(void *arg) 4943 { 4944 struct socket *so = arg; 4945 4946 retry: 4947 if (SOLISTENING(so)) { 4948 SOLISTEN_LOCK(so); 4949 } else { 4950 SOCK_SENDBUF_LOCK(so); 4951 if (__predict_false(SOLISTENING(so))) { 4952 SOCK_SENDBUF_UNLOCK(so); 4953 goto retry; 4954 } 4955 } 4956 } 4957 4958 static void 4959 so_wrknl_unlock(void *arg) 4960 { 4961 struct socket *so = arg; 4962 4963 if (SOLISTENING(so)) 4964 SOLISTEN_UNLOCK(so); 4965 else 4966 SOCK_SENDBUF_UNLOCK(so); 4967 } 4968 4969 static void 4970 so_wrknl_assert_lock(void *arg, int what) 4971 { 4972 struct socket *so = arg; 4973 4974 if (what == LA_LOCKED) { 4975 if (SOLISTENING(so)) 4976 SOLISTEN_LOCK_ASSERT(so); 4977 else 4978 SOCK_SENDBUF_LOCK_ASSERT(so); 4979 } else { 4980 if (SOLISTENING(so)) 4981 SOLISTEN_UNLOCK_ASSERT(so); 4982 else 4983 SOCK_SENDBUF_UNLOCK_ASSERT(so); 4984 } 4985 } 4986 4987 /* 4988 * Create an external-format (``xsocket'') structure using the information in 4989 * the kernel-format socket structure pointed to by so. 
This is done to 4990 * reduce the spew of irrelevant information over this interface, to isolate 4991 * user code from changes in the kernel structure, and potentially to provide 4992 * information-hiding if we decide that some of this information should be 4993 * hidden from users. 4994 */ 4995 void 4996 sotoxsocket(struct socket *so, struct xsocket *xso) 4997 { 4998 4999 bzero(xso, sizeof(*xso)); 5000 xso->xso_len = sizeof *xso; 5001 xso->xso_so = (uintptr_t)so; 5002 xso->so_type = so->so_type; 5003 xso->so_options = so->so_options; 5004 xso->so_linger = so->so_linger; 5005 xso->so_state = so->so_state; 5006 xso->so_pcb = (uintptr_t)so->so_pcb; 5007 xso->xso_protocol = so->so_proto->pr_protocol; 5008 xso->xso_family = so->so_proto->pr_domain->dom_family; 5009 xso->so_timeo = so->so_timeo; 5010 xso->so_error = so->so_error; 5011 xso->so_uid = so->so_cred->cr_uid; 5012 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 5013 SOCK_LOCK(so); 5014 if (SOLISTENING(so)) { 5015 xso->so_qlen = so->sol_qlen; 5016 xso->so_incqlen = so->sol_incqlen; 5017 xso->so_qlimit = so->sol_qlimit; 5018 xso->so_oobmark = 0; 5019 } else { 5020 xso->so_state |= so->so_qstate; 5021 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; 5022 xso->so_oobmark = so->so_oobmark; 5023 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 5024 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 5025 if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) 5026 xso->so_splice_so = (uintptr_t)so->so_splice->dst; 5027 } 5028 SOCK_UNLOCK(so); 5029 } 5030 5031 int 5032 so_options_get(const struct socket *so) 5033 { 5034 5035 return (so->so_options); 5036 } 5037 5038 void 5039 so_options_set(struct socket *so, int val) 5040 { 5041 5042 so->so_options = val; 5043 } 5044 5045 int 5046 so_error_get(const struct socket *so) 5047 { 5048 5049 return (so->so_error); 5050 } 5051 5052 void 5053 so_error_set(struct socket *so, int val) 5054 { 5055 5056 so->so_error = val; 5057 } 5058
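
/*
 * Illustrative example: in-kernel consumers that hold a reference to a
 * socket can use so_setsockopt(), defined earlier in this file, rather
 * than filling in a struct sockopt by hand.  A minimal sketch, where
 * example_enable_keepalive() is a hypothetical helper and not an
 * existing interface:
 *
 *	static int
 *	example_enable_keepalive(struct socket *so)
 *	{
 *		int one = 1;
 *
 *		return (so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
 *		    sizeof(one)));
 *	}
 *
 * Because so_setsockopt() leaves sopt_td set to NULL, sooptcopyin() uses
 * bcopy() rather than copyin(), so the option value may live in kernel
 * memory rather than in the calling process's address space.
 */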