1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 2004 The FreeBSD Foundation 7 * Copyright (c) 2004-2008 Robert N. M. Watson 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * Comments on the socket life cycle: 37 * 38 * soalloc() sets up socket layer state for a socket, called only by 39 * socreate() and sonewconn(). Socket layer private. 40 * 41 * sodealloc() tears down socket layer state for a socket, called only by 42 * sofree() and sonewconn(). Socket layer private. 43 * 44 * pru_attach() associates protocol layer state with an allocated socket; 45 * called only once, may fail, aborting socket allocation. This is called 46 * from socreate() and sonewconn(). Socket layer private. 47 * 48 * pru_detach() disassociates protocol layer state from an attached socket, 49 * and will be called exactly once for sockets in which pru_attach() has 50 * been successfully called. If pru_attach() returned an error, 51 * pru_detach() will not be called. Socket layer private. 52 * 53 * pru_abort() and pru_close() notify the protocol layer that the last 54 * consumer of a socket is starting to tear down the socket, and that the 55 * protocol should terminate the connection. Historically, pru_abort() also 56 * detached protocol state from the socket state, but this is no longer the 57 * case. 58 * 59 * socreate() creates a socket and attaches protocol state. This is a public 60 * interface that may be used by socket layer consumers to create new 61 * sockets. 62 * 63 * sonewconn() creates a socket and attaches protocol state. This is a 64 * public interface that may be used by protocols to create new sockets when 65 * a new connection is received and will be available for accept() on a 66 * listen socket. 67 * 68 * soclose() destroys a socket after possibly waiting for it to disconnect. 69 * This is a public interface that socket consumers should use to close and 70 * release a socket when done with it.
71 * 72 * soabort() destroys a socket without waiting for it to disconnect (used 73 * only for incoming connections that are already partially or fully 74 * connected). This is used internally by the socket layer when clearing 75 * listen socket queues (due to overflow or close on the listen socket), but 76 * is also a public interface protocols may use to abort connections in 77 * their incomplete listen queues should they no longer be required. Sockets 78 * placed in completed connection listen queues should not be aborted for 79 * reasons described in the comment above the soclose() implementation. This 80 * is not a general purpose close routine, and except in the specific 81 * circumstances described here, should not be used. 82 * 83 * sofree() will free a socket and its protocol state if all references on 84 * the socket have been released, and is the interface used to attempt to 85 * free a socket when a reference is removed. This is a socket layer private 86 * interface. 87 * 88 * NOTE: In addition to socreate() and soclose(), which provide a single 89 * socket reference to the consumer to be managed as required, there are two 90 * calls to explicitly manage socket references, soref() and sorele(). 91 * Currently, these are generally required only when transitioning a socket 92 * from a listen queue to a file descriptor, in order to prevent garbage 93 * collection of the socket at an untimely moment. For a number of reasons, 94 * these interfaces are not preferred, and should be avoided. 95 * 96 * NOTE: With regard to VNETs, the general rule is that callers do not set 97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(), 98 * sofree(), sorele(), sonewconn() and sorflush(), which are usually called 99 * from a pre-set VNET context. sopoll() currently does not need a VNET 100 * context to be set.
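 *
 * As a rough sketch only (error handling is abbreviated, and the sockaddr
 * 'sin' and thread pointer 'td' are assumed to exist in the caller), an
 * in-kernel consumer of these interfaces typically pairs the calls like so:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	error = sobind(so, (struct sockaddr *)&sin, td);
 *	...
 *	error = soclose(so);	(drops the reference granted by socreate())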
101 */ 102 103 #include <sys/cdefs.h> 104 #include "opt_inet.h" 105 #include "opt_inet6.h" 106 #include "opt_kern_tls.h" 107 #include "opt_ktrace.h" 108 #include "opt_sctp.h" 109 110 #include <sys/param.h> 111 #include <sys/systm.h> 112 #include <sys/capsicum.h> 113 #include <sys/fcntl.h> 114 #include <sys/limits.h> 115 #include <sys/lock.h> 116 #include <sys/mac.h> 117 #include <sys/malloc.h> 118 #include <sys/mbuf.h> 119 #include <sys/mutex.h> 120 #include <sys/domain.h> 121 #include <sys/file.h> /* for struct knote */ 122 #include <sys/hhook.h> 123 #include <sys/kernel.h> 124 #include <sys/khelp.h> 125 #include <sys/kthread.h> 126 #include <sys/ktls.h> 127 #include <sys/event.h> 128 #include <sys/eventhandler.h> 129 #include <sys/poll.h> 130 #include <sys/proc.h> 131 #include <sys/protosw.h> 132 #include <sys/sbuf.h> 133 #include <sys/socket.h> 134 #include <sys/socketvar.h> 135 #include <sys/resourcevar.h> 136 #include <net/route.h> 137 #include <sys/sched.h> 138 #include <sys/signalvar.h> 139 #include <sys/smp.h> 140 #include <sys/stat.h> 141 #include <sys/sx.h> 142 #include <sys/sysctl.h> 143 #include <sys/taskqueue.h> 144 #include <sys/uio.h> 145 #include <sys/un.h> 146 #include <sys/unpcb.h> 147 #include <sys/jail.h> 148 #include <sys/syslog.h> 149 #include <netinet/in.h> 150 #include <netinet/in_pcb.h> 151 #include <netinet/tcp.h> 152 153 #include <net/vnet.h> 154 155 #include <security/mac/mac_framework.h> 156 #include <security/mac/mac_internal.h> 157 158 #include <vm/uma.h> 159 160 #ifdef COMPAT_FREEBSD32 161 #include <sys/mount.h> 162 #include <sys/sysent.h> 163 #include <compat/freebsd32/freebsd32.h> 164 #endif 165 166 static int soreceive_generic_locked(struct socket *so, 167 struct sockaddr **psa, struct uio *uio, struct mbuf **mp, 168 struct mbuf **controlp, int *flagsp); 169 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 170 int flags); 171 static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, 172 struct sockaddr **psa, struct uio *uio, struct mbuf **mp, 173 struct mbuf **controlp, int flags); 174 static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, 175 struct uio *uio, struct mbuf *top, struct mbuf *control, 176 int flags, struct thread *td); 177 static void so_rdknl_lock(void *); 178 static void so_rdknl_unlock(void *); 179 static void so_rdknl_assert_lock(void *, int); 180 static void so_wrknl_lock(void *); 181 static void so_wrknl_unlock(void *); 182 static void so_wrknl_assert_lock(void *, int); 183 184 static void filt_sordetach(struct knote *kn); 185 static int filt_soread(struct knote *kn, long hint); 186 static void filt_sowdetach(struct knote *kn); 187 static int filt_sowrite(struct knote *kn, long hint); 188 static int filt_soempty(struct knote *kn, long hint); 189 fo_kqfilter_t soo_kqfilter; 190 191 static const struct filterops soread_filtops = { 192 .f_isfd = 1, 193 .f_detach = filt_sordetach, 194 .f_event = filt_soread, 195 }; 196 static const struct filterops sowrite_filtops = { 197 .f_isfd = 1, 198 .f_detach = filt_sowdetach, 199 .f_event = filt_sowrite, 200 }; 201 static const struct filterops soempty_filtops = { 202 .f_isfd = 1, 203 .f_detach = filt_sowdetach, 204 .f_event = filt_soempty, 205 }; 206 207 so_gen_t so_gencnt; /* generation count for sockets */ 208 209 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 210 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 211 212 #define VNET_SO_ASSERT(so) \ 213 VNET_ASSERT(curvnet != NULL, \ 214 ("%s:%d curvnet is NULL, so=%p", __func__, 
__LINE__, (so))); 215 216 #ifdef SOCKET_HHOOK 217 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 218 #define V_socket_hhh VNET(socket_hhh) 219 static inline int hhook_run_socket(struct socket *, void *, int32_t); 220 #endif 221 222 #ifdef COMPAT_FREEBSD32 223 #ifdef __amd64__ 224 /* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. */ 225 #define __splice32_packed __packed 226 #else 227 #define __splice32_packed 228 #endif 229 struct splice32 { 230 int32_t sp_fd; 231 int64_t sp_max; 232 struct timeval32 sp_idle; 233 } __splice32_packed; 234 #undef __splice32_packed 235 #endif 236 237 /* 238 * Limit on the number of connections in the listen queue waiting 239 * for accept(2). 240 * NB: The original sysctl somaxconn is still available but hidden 241 * to prevent confusion about the actual purpose of this number. 242 */ 243 static u_int somaxconn = SOMAXCONN; 244 245 static int 246 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 247 { 248 int error; 249 int val; 250 251 val = somaxconn; 252 error = sysctl_handle_int(oidp, &val, 0, req); 253 if (error || !req->newptr ) 254 return (error); 255 256 /* 257 * The purpose of the UINT_MAX / 3 limit, is so that the formula 258 * 3 * so_qlimit / 2 259 * below, will not overflow. 260 */ 261 262 if (val < 1 || val > UINT_MAX / 3) 263 return (EINVAL); 264 265 somaxconn = val; 266 return (0); 267 } 268 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, 269 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), 270 sysctl_somaxconn, "I", 271 "Maximum listen socket pending connection accept queue size"); 272 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 273 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, 274 sizeof(int), sysctl_somaxconn, "I", 275 "Maximum listen socket pending connection accept queue size (compat)"); 276 277 static int numopensockets; 278 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 279 &numopensockets, 0, "Number of open sockets"); 280 281 /* 282 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 283 * so_gencnt field. 284 */ 285 static struct mtx so_global_mtx; 286 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 287 288 /* 289 * General IPC sysctl name space, used by sockets and a variety of other IPC 290 * types. 291 */ 292 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 293 "IPC"); 294 295 /* 296 * Initialize the socket subsystem and set up the socket 297 * memory allocator. 
298 */ 299 static uma_zone_t socket_zone; 300 int maxsockets; 301 302 static void 303 socket_zone_change(void *tag) 304 { 305 306 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 307 } 308 309 static int splice_init_state; 310 static struct sx splice_init_lock; 311 SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init"); 312 313 static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0, 314 "Settings relating to the SO_SPLICE socket option"); 315 316 static bool splice_receive_stream = true; 317 SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN, 318 &splice_receive_stream, 0, 319 "Use soreceive_stream() for stream splices"); 320 321 static uma_zone_t splice_zone; 322 static struct proc *splice_proc; 323 struct splice_wq { 324 struct mtx mtx; 325 STAILQ_HEAD(, so_splice) head; 326 bool running; 327 } __aligned(CACHE_LINE_SIZE); 328 static struct splice_wq *splice_wq; 329 static uint32_t splice_index = 0; 330 331 static void so_splice_timeout(void *arg, int pending); 332 static void so_splice_xfer(struct so_splice *s); 333 static int so_unsplice(struct socket *so, bool timeout); 334 335 static void 336 splice_work_thread(void *ctx) 337 { 338 struct splice_wq *wq = ctx; 339 struct so_splice *s, *s_temp; 340 STAILQ_HEAD(, so_splice) local_head; 341 int cpu; 342 343 cpu = wq - splice_wq; 344 if (bootverbose) 345 printf("starting so_splice worker thread for CPU %d\n", cpu); 346 347 for (;;) { 348 mtx_lock(&wq->mtx); 349 while (STAILQ_EMPTY(&wq->head)) { 350 wq->running = false; 351 mtx_sleep(wq, &wq->mtx, 0, "-", 0); 352 wq->running = true; 353 } 354 STAILQ_INIT(&local_head); 355 STAILQ_CONCAT(&local_head, &wq->head); 356 STAILQ_INIT(&wq->head); 357 mtx_unlock(&wq->mtx); 358 STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) { 359 mtx_lock(&s->mtx); 360 CURVNET_SET(s->src->so_vnet); 361 so_splice_xfer(s); 362 CURVNET_RESTORE(); 363 } 364 } 365 } 366 367 static void 368 so_splice_dispatch_async(struct so_splice *sp) 369 { 370 struct splice_wq *wq; 371 bool running; 372 373 wq = &splice_wq[sp->wq_index]; 374 mtx_lock(&wq->mtx); 375 STAILQ_INSERT_TAIL(&wq->head, sp, next); 376 running = wq->running; 377 mtx_unlock(&wq->mtx); 378 if (!running) 379 wakeup(wq); 380 } 381 382 void 383 so_splice_dispatch(struct so_splice *sp) 384 { 385 mtx_assert(&sp->mtx, MA_OWNED); 386 387 if (sp->state != SPLICE_IDLE) { 388 mtx_unlock(&sp->mtx); 389 } else { 390 sp->state = SPLICE_QUEUED; 391 mtx_unlock(&sp->mtx); 392 so_splice_dispatch_async(sp); 393 } 394 } 395 396 static int 397 splice_zinit(void *mem, int size __unused, int flags __unused) 398 { 399 struct so_splice *s; 400 401 s = (struct so_splice *)mem; 402 mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF); 403 return (0); 404 } 405 406 static void 407 splice_zfini(void *mem, int size) 408 { 409 struct so_splice *s; 410 411 s = (struct so_splice *)mem; 412 mtx_destroy(&s->mtx); 413 } 414 415 static int 416 splice_init(void) 417 { 418 struct thread *td; 419 int error, i, state; 420 421 state = atomic_load_acq_int(&splice_init_state); 422 if (__predict_true(state > 0)) 423 return (0); 424 if (state < 0) 425 return (ENXIO); 426 sx_xlock(&splice_init_lock); 427 if (splice_init_state != 0) { 428 sx_xunlock(&splice_init_lock); 429 return (0); 430 } 431 432 splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL, 433 NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0); 434 435 splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP, 436 M_WAITOK | M_ZERO); 437 438 /* 439 * Initialize the workqueues to run the 
splice work. We create a 440 * work queue for each CPU. 441 */ 442 CPU_FOREACH(i) { 443 STAILQ_INIT(&splice_wq[i].head); 444 mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF); 445 } 446 447 /* Start kthreads for each workqueue. */ 448 error = 0; 449 CPU_FOREACH(i) { 450 error = kproc_kthread_add(splice_work_thread, &splice_wq[i], 451 &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i); 452 if (error) { 453 printf("Can't add so_splice thread %d error %d\n", 454 i, error); 455 break; 456 } 457 458 /* 459 * It's possible to create loops with SO_SPLICE; ensure that 460 * worker threads aren't able to starve the system too easily. 461 */ 462 thread_lock(td); 463 sched_prio(td, PUSER); 464 thread_unlock(td); 465 } 466 467 splice_init_state = error != 0 ? -1 : 1; 468 sx_xunlock(&splice_init_lock); 469 470 return (error); 471 } 472 473 /* 474 * Lock a pair of socket's I/O locks for splicing. Avoid blocking while holding 475 * one lock in order to avoid potential deadlocks in case there is some other 476 * code path which acquires more than one I/O lock at a time. 477 */ 478 static void 479 splice_lock_pair(struct socket *so_src, struct socket *so_dst) 480 { 481 int error; 482 483 for (;;) { 484 error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR); 485 KASSERT(error == 0, 486 ("%s: failed to lock send I/O lock: %d", __func__, error)); 487 error = SOCK_IO_RECV_LOCK(so_src, 0); 488 KASSERT(error == 0 || error == EWOULDBLOCK, 489 ("%s: failed to lock recv I/O lock: %d", __func__, error)); 490 if (error == 0) 491 break; 492 SOCK_IO_SEND_UNLOCK(so_dst); 493 494 error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR); 495 KASSERT(error == 0, 496 ("%s: failed to lock recv I/O lock: %d", __func__, error)); 497 error = SOCK_IO_SEND_LOCK(so_dst, 0); 498 KASSERT(error == 0 || error == EWOULDBLOCK, 499 ("%s: failed to lock send I/O lock: %d", __func__, error)); 500 if (error == 0) 501 break; 502 SOCK_IO_RECV_UNLOCK(so_src); 503 } 504 } 505 506 static void 507 splice_unlock_pair(struct socket *so_src, struct socket *so_dst) 508 { 509 SOCK_IO_RECV_UNLOCK(so_src); 510 SOCK_IO_SEND_UNLOCK(so_dst); 511 } 512 513 /* 514 * Move data from the source to the sink. Assumes that both of the relevant 515 * socket I/O locks are held. 
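 *
 * A sketch of the expected calling pattern, mirroring so_splice_xfer()
 * below:
 *
 *	splice_lock_pair(so_src, so_dst);
 *	error = so_splice_xfer_data(so_src, so_dst, max, &len);
 *	splice_unlock_pair(so_src, so_dst);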
516 */ 517 static int 518 so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max, 519 ssize_t *lenp) 520 { 521 struct uio uio; 522 struct mbuf *m; 523 struct sockbuf *sb_src, *sb_dst; 524 ssize_t len; 525 long space; 526 int error, flags; 527 528 SOCK_IO_RECV_ASSERT_LOCKED(so_src); 529 SOCK_IO_SEND_ASSERT_LOCKED(so_dst); 530 531 error = 0; 532 m = NULL; 533 memset(&uio, 0, sizeof(uio)); 534 535 sb_src = &so_src->so_rcv; 536 sb_dst = &so_dst->so_snd; 537 538 space = sbspace(sb_dst); 539 if (space < 0) 540 space = 0; 541 len = MIN(max, MIN(space, sbavail(sb_src))); 542 if (len == 0) { 543 SOCK_RECVBUF_LOCK(so_src); 544 if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0) 545 error = EPIPE; 546 SOCK_RECVBUF_UNLOCK(so_src); 547 } else { 548 flags = MSG_DONTWAIT; 549 uio.uio_resid = len; 550 if (splice_receive_stream && sb_src->sb_tls_info == NULL) { 551 error = soreceive_stream_locked(so_src, sb_src, NULL, 552 &uio, &m, NULL, flags); 553 } else { 554 error = soreceive_generic_locked(so_src, NULL, 555 &uio, &m, NULL, &flags); 556 } 557 if (error != 0 && m != NULL) { 558 m_freem(m); 559 m = NULL; 560 } 561 } 562 if (m != NULL) { 563 len -= uio.uio_resid; 564 error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL, 565 MSG_DONTWAIT, curthread); 566 } else if (error == 0) { 567 len = 0; 568 SOCK_SENDBUF_LOCK(so_dst); 569 if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0) 570 error = EPIPE; 571 SOCK_SENDBUF_UNLOCK(so_dst); 572 } 573 if (error == 0) 574 *lenp = len; 575 return (error); 576 } 577 578 /* 579 * Transfer data from the source to the sink. 580 * 581 * If "direct" is true, the transfer is done in the context of whichever thread 582 * is operating on one of the socket buffers. We do not know which locks are 583 * held, so we can only trylock the socket buffers; if this fails, we fall back 584 * to the worker thread, which invokes this routine with "direct" set to false. 585 */ 586 static void 587 so_splice_xfer(struct so_splice *sp) 588 { 589 struct socket *so_src, *so_dst; 590 off_t max; 591 ssize_t len; 592 int error; 593 594 mtx_assert(&sp->mtx, MA_OWNED); 595 KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING, 596 ("so_splice_xfer: invalid state %d", sp->state)); 597 KASSERT(sp->max != 0, ("so_splice_xfer: max == 0")); 598 599 if (sp->state == SPLICE_CLOSING) { 600 /* Userspace asked us to close the splice. */ 601 goto closing; 602 } 603 604 sp->state = SPLICE_RUNNING; 605 so_src = sp->src; 606 so_dst = sp->dst; 607 max = sp->max > 0 ? sp->max - so_src->so_splice_sent : OFF_MAX; 608 if (max < 0) 609 max = 0; 610 611 /* 612 * Lock the sockets in order to block userspace from doing anything 613 * sneaky. If an error occurs or one of the sockets can no longer 614 * transfer data, we will automatically unsplice. 615 */ 616 mtx_unlock(&sp->mtx); 617 splice_lock_pair(so_src, so_dst); 618 619 error = so_splice_xfer_data(so_src, so_dst, max, &len); 620 621 mtx_lock(&sp->mtx); 622 623 /* 624 * Update our stats while still holding the socket locks. This 625 * synchronizes with getsockopt(SO_SPLICE), see the comment there. 
626 */ 627 if (error == 0) { 628 KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len)); 629 so_src->so_splice_sent += len; 630 } 631 splice_unlock_pair(so_src, so_dst); 632 633 switch (sp->state) { 634 case SPLICE_CLOSING: 635 closing: 636 sp->state = SPLICE_CLOSED; 637 wakeup(sp); 638 mtx_unlock(&sp->mtx); 639 break; 640 case SPLICE_RUNNING: 641 if (error != 0 || 642 (sp->max > 0 && so_src->so_splice_sent >= sp->max)) { 643 sp->state = SPLICE_EXCEPTION; 644 soref(so_src); 645 mtx_unlock(&sp->mtx); 646 (void)so_unsplice(so_src, false); 647 sorele(so_src); 648 } else { 649 /* 650 * Locklessly check for additional bytes in the source's 651 * receive buffer and queue more work if possible. We 652 * may end up queuing needless work, but that's ok, and 653 * if we race with a thread inserting more data into the 654 * buffer and observe sbavail() == 0, the splice mutex 655 * ensures that splice_push() will queue more work for 656 * us. 657 */ 658 if (sbavail(&so_src->so_rcv) > 0 && 659 sbspace(&so_dst->so_snd) > 0) { 660 sp->state = SPLICE_QUEUED; 661 mtx_unlock(&sp->mtx); 662 so_splice_dispatch_async(sp); 663 } else { 664 sp->state = SPLICE_IDLE; 665 mtx_unlock(&sp->mtx); 666 } 667 } 668 break; 669 default: 670 __assert_unreachable(); 671 } 672 } 673 674 static void 675 socket_init(void *tag) 676 { 677 678 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 679 NULL, NULL, UMA_ALIGN_PTR, 0); 680 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 681 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 682 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 683 EVENTHANDLER_PRI_FIRST); 684 } 685 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 686 687 #ifdef SOCKET_HHOOK 688 static void 689 socket_hhook_register(int subtype) 690 { 691 692 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 693 &V_socket_hhh[subtype], 694 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 695 printf("%s: WARNING: unable to register hook\n", __func__); 696 } 697 698 static void 699 socket_hhook_deregister(int subtype) 700 { 701 702 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 703 printf("%s: WARNING: unable to deregister hook\n", __func__); 704 } 705 706 static void 707 socket_vnet_init(const void *unused __unused) 708 { 709 int i; 710 711 /* We expect a contiguous range */ 712 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 713 socket_hhook_register(i); 714 } 715 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 716 socket_vnet_init, NULL); 717 718 static void 719 socket_vnet_uninit(const void *unused __unused) 720 { 721 int i; 722 723 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 724 socket_hhook_deregister(i); 725 } 726 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 727 socket_vnet_uninit, NULL); 728 #endif /* SOCKET_HHOOK */ 729 730 /* 731 * Initialise maxsockets. This SYSINIT must be run after 732 * tunable_mbinit(). 733 */ 734 static void 735 init_maxsockets(void *ignored) 736 { 737 738 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 739 maxsockets = imax(maxsockets, maxfiles); 740 } 741 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 742 743 /* 744 * Sysctl to get and set the maximum global sockets limit. Notify protocols 745 * of the change so that they can update their dependent limits as required. 
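 *
 * Administrative example (the value shown is illustrative only): the limit
 * is a loader tunable and may be raised at run time with sysctl(8); the
 * handler below accepts only increases, bounded by maxfiles:
 *
 *	sysctl kern.ipc.maxsockets
 *	sysctl kern.ipc.maxsockets=262144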
746 */ 747 static int 748 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 749 { 750 int error, newmaxsockets; 751 752 newmaxsockets = maxsockets; 753 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 754 if (error == 0 && req->newptr && newmaxsockets != maxsockets) { 755 if (newmaxsockets > maxsockets && 756 newmaxsockets <= maxfiles) { 757 maxsockets = newmaxsockets; 758 EVENTHANDLER_INVOKE(maxsockets_change); 759 } else 760 error = EINVAL; 761 } 762 return (error); 763 } 764 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, 765 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 766 &maxsockets, 0, sysctl_maxsockets, "IU", 767 "Maximum number of sockets available"); 768 769 /* 770 * Socket operation routines. These routines are called by the routines in 771 * sys_socket.c or from a system process, and implement the semantics of 772 * socket operations by switching out to the protocol specific routines. 773 */ 774 775 /* 776 * Get a socket structure from our zone, and initialize it. Note that it 777 * would probably be better to allocate socket and PCB at the same time, but 778 * I'm not convinced that all the protocols can be easily modified to do 779 * this. 780 * 781 * soalloc() returns a socket with a ref count of 0. 782 */ 783 static struct socket * 784 soalloc(struct vnet *vnet) 785 { 786 struct socket *so; 787 788 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 789 if (so == NULL) 790 return (NULL); 791 #ifdef MAC 792 if (mac_socket_init(so, M_NOWAIT) != 0) { 793 uma_zfree(socket_zone, so); 794 return (NULL); 795 } 796 #endif 797 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { 798 uma_zfree(socket_zone, so); 799 return (NULL); 800 } 801 802 /* 803 * The socket locking protocol allows to lock 2 sockets at a time, 804 * however, the first one must be a listening socket. WITNESS lacks 805 * a feature to change class of an existing lock, so we use DUPOK. 806 */ 807 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); 808 mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); 809 mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); 810 so->so_rcv.sb_sel = &so->so_rdsel; 811 so->so_snd.sb_sel = &so->so_wrsel; 812 sx_init(&so->so_snd_sx, "so_snd_sx"); 813 sx_init(&so->so_rcv_sx, "so_rcv_sx"); 814 TAILQ_INIT(&so->so_snd.sb_aiojobq); 815 TAILQ_INIT(&so->so_rcv.sb_aiojobq); 816 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); 817 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); 818 #ifdef VIMAGE 819 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 820 __func__, __LINE__, so)); 821 so->so_vnet = vnet; 822 #endif 823 #ifdef SOCKET_HHOOK 824 /* We shouldn't need the so_global_mtx */ 825 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { 826 /* Do we need more comprehensive error returns? */ 827 uma_zfree(socket_zone, so); 828 return (NULL); 829 } 830 #endif 831 mtx_lock(&so_global_mtx); 832 so->so_gencnt = ++so_gencnt; 833 ++numopensockets; 834 #ifdef VIMAGE 835 vnet->vnet_sockcnt++; 836 #endif 837 mtx_unlock(&so_global_mtx); 838 839 return (so); 840 } 841 842 /* 843 * Free the storage associated with a socket at the socket layer, tear down 844 * locks, labels, etc. All protocol state is assumed already to have been 845 * torn down (and possibly never set up) by the caller. 
846 */ 847 void 848 sodealloc(struct socket *so) 849 { 850 851 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 852 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 853 854 mtx_lock(&so_global_mtx); 855 so->so_gencnt = ++so_gencnt; 856 --numopensockets; /* Could be below, but faster here. */ 857 #ifdef VIMAGE 858 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 859 __func__, __LINE__, so)); 860 so->so_vnet->vnet_sockcnt--; 861 #endif 862 mtx_unlock(&so_global_mtx); 863 #ifdef MAC 864 mac_socket_destroy(so); 865 #endif 866 #ifdef SOCKET_HHOOK 867 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); 868 #endif 869 870 khelp_destroy_osd(&so->osd); 871 if (SOLISTENING(so)) { 872 if (so->sol_accept_filter != NULL) 873 accept_filt_setopt(so, NULL); 874 } else { 875 if (so->so_rcv.sb_hiwat) 876 (void)chgsbsize(so->so_cred->cr_uidinfo, 877 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 878 if (so->so_snd.sb_hiwat) 879 (void)chgsbsize(so->so_cred->cr_uidinfo, 880 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 881 sx_destroy(&so->so_snd_sx); 882 sx_destroy(&so->so_rcv_sx); 883 mtx_destroy(&so->so_snd_mtx); 884 mtx_destroy(&so->so_rcv_mtx); 885 } 886 crfree(so->so_cred); 887 mtx_destroy(&so->so_lock); 888 uma_zfree(socket_zone, so); 889 } 890 891 /* 892 * socreate returns a socket with a ref count of 1 and a file descriptor 893 * reference. The socket should be closed with soclose(). 894 */ 895 int 896 socreate(int dom, struct socket **aso, int type, int proto, 897 struct ucred *cred, struct thread *td) 898 { 899 struct protosw *prp; 900 struct socket *so; 901 int error; 902 903 /* 904 * XXX: divert(4) historically abused PF_INET. Keep this compatibility 905 * shim until all applications have been updated. 906 */ 907 if (__predict_false(dom == PF_INET && type == SOCK_RAW && 908 proto == IPPROTO_DIVERT)) { 909 dom = PF_DIVERT; 910 printf("%s uses obsolete way to create divert(4) socket\n", 911 td->td_proc->p_comm); 912 } 913 914 prp = pffindproto(dom, type, proto); 915 if (prp == NULL) { 916 /* No support for domain. */ 917 if (pffinddomain(dom) == NULL) 918 return (EAFNOSUPPORT); 919 /* No support for socket type. */ 920 if (proto == 0 && type != 0) 921 return (EPROTOTYPE); 922 return (EPROTONOSUPPORT); 923 } 924 925 MPASS(prp->pr_attach); 926 927 if ((prp->pr_flags & PR_CAPATTACH) == 0) { 928 if (CAP_TRACING(td)) 929 ktrcapfail(CAPFAIL_PROTO, &proto); 930 if (IN_CAPABILITY_MODE(td)) 931 return (ECAPMODE); 932 } 933 934 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) 935 return (EPROTONOSUPPORT); 936 937 so = soalloc(CRED_TO_VNET(cred)); 938 if (so == NULL) 939 return (ENOBUFS); 940 941 so->so_type = type; 942 so->so_cred = crhold(cred); 943 if ((prp->pr_domain->dom_family == PF_INET) || 944 (prp->pr_domain->dom_family == PF_INET6) || 945 (prp->pr_domain->dom_family == PF_ROUTE)) 946 so->so_fibnum = td->td_proc->p_fibnum; 947 else 948 so->so_fibnum = 0; 949 so->so_proto = prp; 950 #ifdef MAC 951 mac_socket_create(cred, so); 952 #endif 953 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 954 so_rdknl_assert_lock); 955 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 956 so_wrknl_assert_lock); 957 if ((prp->pr_flags & PR_SOCKBUF) == 0) { 958 so->so_snd.sb_mtx = &so->so_snd_mtx; 959 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 960 } 961 /* 962 * Auto-sizing of socket buffers is managed by the protocols and 963 * the appropriate flags must be set in the pru_attach function. 
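 * As a sketch only (sendspace/recvspace are placeholder limits, and details
 * vary per protocol), an attach routine that wants auto-sizing might do:
 *
 *	error = soreserve(so, sendspace, recvspace);
 *	if (error == 0) {
 *		so->so_snd.sb_flags |= SB_AUTOSIZE;
 *		so->so_rcv.sb_flags |= SB_AUTOSIZE;
 *	}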
964 */ 965 CURVNET_SET(so->so_vnet); 966 error = prp->pr_attach(so, proto, td); 967 CURVNET_RESTORE(); 968 if (error) { 969 sodealloc(so); 970 return (error); 971 } 972 soref(so); 973 *aso = so; 974 return (0); 975 } 976 977 #ifdef REGRESSION 978 static int regression_sonewconn_earlytest = 1; 979 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 980 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 981 #endif 982 983 static int sooverprio = LOG_DEBUG; 984 SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW, 985 &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable"); 986 987 static struct timeval overinterval = { 60, 0 }; 988 SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, 989 &overinterval, 990 "Delay in seconds between warnings for listen socket overflows"); 991 992 /* 993 * When an attempt at a new connection is noted on a socket which supports 994 * accept(2), the protocol has two options: 995 * 1) Call the legacy sonewconn() function, which will call the protocol 996 * attach method, the same as used for socket(2). 997 * 2) Call solisten_clone(), perform an attach that is specific to a cloned 998 * connection, and then call solisten_enqueue(). 999 * 1000 * Note: the ref count on the socket is 0 on return. 1001 */ 1002 struct socket * 1003 solisten_clone(struct socket *head) 1004 { 1005 struct sbuf descrsb; 1006 struct socket *so; 1007 int len, overcount; 1008 u_int qlen; 1009 const char localprefix[] = "local:"; 1010 char descrbuf[SUNPATHLEN + sizeof(localprefix)]; 1011 #if defined(INET6) 1012 char addrbuf[INET6_ADDRSTRLEN]; 1013 #elif defined(INET) 1014 char addrbuf[INET_ADDRSTRLEN]; 1015 #endif 1016 bool dolog, over; 1017 1018 SOLISTEN_LOCK(head); 1019 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 1020 #ifdef REGRESSION 1021 if (regression_sonewconn_earlytest && over) { 1022 #else 1023 if (over) { 1024 #endif 1025 head->sol_overcount++; 1026 dolog = (sooverprio >= 0) && 1027 !!ratecheck(&head->sol_lastover, &overinterval); 1028 1029 /* 1030 * If we're going to log, copy the overflow count and queue 1031 * length from the listen socket before dropping the lock. 1032 * Also, reset the overflow count. 1033 */ 1034 if (dolog) { 1035 overcount = head->sol_overcount; 1036 head->sol_overcount = 0; 1037 qlen = head->sol_qlen; 1038 } 1039 SOLISTEN_UNLOCK(head); 1040 1041 if (dolog) { 1042 /* 1043 * Try to print something descriptive about the 1044 * socket for the error message.
1045 */ 1046 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), 1047 SBUF_FIXEDLEN); 1048 switch (head->so_proto->pr_domain->dom_family) { 1049 #if defined(INET) || defined(INET6) 1050 #ifdef INET 1051 case AF_INET: 1052 #endif 1053 #ifdef INET6 1054 case AF_INET6: 1055 if (head->so_proto->pr_domain->dom_family == 1056 AF_INET6 || 1057 (sotoinpcb(head)->inp_inc.inc_flags & 1058 INC_ISIPV6)) { 1059 ip6_sprintf(addrbuf, 1060 &sotoinpcb(head)->inp_inc.inc6_laddr); 1061 sbuf_printf(&descrsb, "[%s]", addrbuf); 1062 } else 1063 #endif 1064 { 1065 #ifdef INET 1066 inet_ntoa_r( 1067 sotoinpcb(head)->inp_inc.inc_laddr, 1068 addrbuf); 1069 sbuf_cat(&descrsb, addrbuf); 1070 #endif 1071 } 1072 sbuf_printf(&descrsb, ":%hu (proto %u)", 1073 ntohs(sotoinpcb(head)->inp_inc.inc_lport), 1074 head->so_proto->pr_protocol); 1075 break; 1076 #endif /* INET || INET6 */ 1077 case AF_UNIX: 1078 sbuf_cat(&descrsb, localprefix); 1079 if (sotounpcb(head)->unp_addr != NULL) 1080 len = 1081 sotounpcb(head)->unp_addr->sun_len - 1082 offsetof(struct sockaddr_un, 1083 sun_path); 1084 else 1085 len = 0; 1086 if (len > 0) 1087 sbuf_bcat(&descrsb, 1088 sotounpcb(head)->unp_addr->sun_path, 1089 len); 1090 else 1091 sbuf_cat(&descrsb, "(unknown)"); 1092 break; 1093 } 1094 1095 /* 1096 * If we can't print something more specific, at least 1097 * print the domain name. 1098 */ 1099 if (sbuf_finish(&descrsb) != 0 || 1100 sbuf_len(&descrsb) <= 0) { 1101 sbuf_clear(&descrsb); 1102 sbuf_cat(&descrsb, 1103 head->so_proto->pr_domain->dom_name ?: 1104 "unknown"); 1105 sbuf_finish(&descrsb); 1106 } 1107 KASSERT(sbuf_len(&descrsb) > 0, 1108 ("%s: sbuf creation failed", __func__)); 1109 /* 1110 * Preserve the historic listen queue overflow log 1111 * message, that starts with "sonewconn:". It has 1112 * been known to sysadmins for years and also test 1113 * sys/kern/sonewconn_overflow checks for it. 1114 */ 1115 if (head->so_cred == 0) { 1116 log(LOG_PRI(sooverprio), 1117 "sonewconn: pcb %p (%s): " 1118 "Listen queue overflow: %i already in " 1119 "queue awaiting acceptance (%d " 1120 "occurrences)\n", head->so_pcb, 1121 sbuf_data(&descrsb), 1122 qlen, overcount); 1123 } else { 1124 log(LOG_PRI(sooverprio), 1125 "sonewconn: pcb %p (%s): " 1126 "Listen queue overflow: " 1127 "%i already in queue awaiting acceptance " 1128 "(%d occurrences), euid %d, rgid %d, jail %s\n", 1129 head->so_pcb, sbuf_data(&descrsb), qlen, 1130 overcount, head->so_cred->cr_uid, 1131 head->so_cred->cr_rgid, 1132 head->so_cred->cr_prison ? 1133 head->so_cred->cr_prison->pr_name : 1134 "not_jailed"); 1135 } 1136 sbuf_delete(&descrsb); 1137 1138 overcount = 0; 1139 } 1140 1141 return (NULL); 1142 } 1143 SOLISTEN_UNLOCK(head); 1144 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 1145 __func__, head)); 1146 so = soalloc(head->so_vnet); 1147 if (so == NULL) { 1148 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 1149 "limit reached or out of memory\n", 1150 __func__, head->so_pcb); 1151 return (NULL); 1152 } 1153 so->so_listen = head; 1154 so->so_type = head->so_type; 1155 /* 1156 * POSIX is ambiguous on what options an accept(2)ed socket should 1157 * inherit from the listener. Words "create a new socket" may be 1158 * interpreted as not inheriting anything. Best programming practice 1159 * for application developers is to not rely on such inheritance. 
1160 * FreeBSD had historically inherited all so_options excluding 1161 * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options, 1162 * including those completely irrelevant to a new born socket. For 1163 * compatibility with older versions we will inherit a list of 1164 * meaningful options. 1165 * The crucial bit to inherit is SO_ACCEPTFILTER. We need it present 1166 * in the child socket for soisconnected() promoting socket from the 1167 * incomplete queue to complete. It will be cleared before the child 1168 * gets available to accept(2). 1169 */ 1170 so->so_options = head->so_options & (SO_ACCEPTFILTER | SO_KEEPALIVE | 1171 SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE); 1172 so->so_linger = head->so_linger; 1173 so->so_state = head->so_state; 1174 so->so_fibnum = head->so_fibnum; 1175 so->so_proto = head->so_proto; 1176 so->so_cred = crhold(head->so_cred); 1177 #ifdef SOCKET_HHOOK 1178 if (V_socket_hhh[HHOOK_SOCKET_NEWCONN]->hhh_nhooks > 0) { 1179 if (hhook_run_socket(so, head, HHOOK_SOCKET_NEWCONN)) { 1180 sodealloc(so); 1181 log(LOG_DEBUG, "%s: hhook run failed\n", __func__); 1182 return (NULL); 1183 } 1184 } 1185 #endif 1186 #ifdef MAC 1187 mac_socket_newconn(head, so); 1188 #endif 1189 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1190 so_rdknl_assert_lock); 1191 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1192 so_wrknl_assert_lock); 1193 VNET_SO_ASSERT(head); 1194 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { 1195 sodealloc(so); 1196 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 1197 __func__, head->so_pcb); 1198 return (NULL); 1199 } 1200 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; 1201 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 1202 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 1203 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 1204 so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; 1205 so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; 1206 if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 1207 so->so_snd.sb_mtx = &so->so_snd_mtx; 1208 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 1209 } 1210 1211 return (so); 1212 } 1213 1214 /* Connstatus may be 0 or SS_ISCONNECTED. */ 1215 struct socket * 1216 sonewconn(struct socket *head, int connstatus) 1217 { 1218 struct socket *so; 1219 1220 if ((so = solisten_clone(head)) == NULL) 1221 return (NULL); 1222 1223 if (so->so_proto->pr_attach(so, 0, NULL) != 0) { 1224 sodealloc(so); 1225 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", 1226 __func__, head->so_pcb); 1227 return (NULL); 1228 } 1229 1230 (void)solisten_enqueue(so, connstatus); 1231 1232 return (so); 1233 } 1234 1235 /* 1236 * Enqueue socket cloned by solisten_clone() to the listen queue of the 1237 * listener it has been cloned from. 1238 * 1239 * Return 'true' if socket landed on complete queue, otherwise 'false'. 1240 */ 1241 bool 1242 solisten_enqueue(struct socket *so, int connstatus) 1243 { 1244 struct socket *head = so->so_listen; 1245 1246 MPASS(refcount_load(&so->so_count) == 0); 1247 refcount_init(&so->so_count, 1); 1248 1249 SOLISTEN_LOCK(head); 1250 if (head->sol_accept_filter != NULL) 1251 connstatus = 0; 1252 so->so_state |= connstatus; 1253 soref(head); /* A socket on (in)complete queue refs head. 
*/ 1254 if (connstatus) { 1255 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 1256 so->so_qstate = SQ_COMP; 1257 head->sol_qlen++; 1258 solisten_wakeup(head); /* unlocks */ 1259 return (true); 1260 } else { 1261 /* 1262 * Keep removing sockets from the head until there's room for 1263 * us to insert on the tail. In pre-locking revisions, this 1264 * was a simple if(), but as we could be racing with other 1265 * threads and soabort() requires dropping locks, we must 1266 * loop waiting for the condition to be true. 1267 */ 1268 while (head->sol_incqlen > head->sol_qlimit) { 1269 struct socket *sp; 1270 1271 sp = TAILQ_FIRST(&head->sol_incomp); 1272 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 1273 head->sol_incqlen--; 1274 SOCK_LOCK(sp); 1275 sp->so_qstate = SQ_NONE; 1276 sp->so_listen = NULL; 1277 SOCK_UNLOCK(sp); 1278 sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ 1279 soabort(sp); 1280 SOLISTEN_LOCK(head); 1281 } 1282 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 1283 so->so_qstate = SQ_INCOMP; 1284 head->sol_incqlen++; 1285 SOLISTEN_UNLOCK(head); 1286 return (false); 1287 } 1288 } 1289 1290 #if defined(SCTP) || defined(SCTP_SUPPORT) 1291 /* 1292 * Socket part of sctp_peeloff(). Detach a new socket from an 1293 * association. The new socket is returned with a reference. 1294 * 1295 * XXXGL: reduce copy-paste with solisten_clone(). 1296 */ 1297 struct socket * 1298 sopeeloff(struct socket *head) 1299 { 1300 struct socket *so; 1301 1302 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 1303 __func__, __LINE__, head)); 1304 so = soalloc(head->so_vnet); 1305 if (so == NULL) { 1306 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 1307 "limit reached or out of memory\n", 1308 __func__, head->so_pcb); 1309 return (NULL); 1310 } 1311 so->so_type = head->so_type; 1312 so->so_options = head->so_options; 1313 so->so_linger = head->so_linger; 1314 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 1315 so->so_fibnum = head->so_fibnum; 1316 so->so_proto = head->so_proto; 1317 so->so_cred = crhold(head->so_cred); 1318 #ifdef MAC 1319 mac_socket_newconn(head, so); 1320 #endif 1321 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1322 so_rdknl_assert_lock); 1323 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1324 so_wrknl_assert_lock); 1325 VNET_SO_ASSERT(head); 1326 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 1327 sodealloc(so); 1328 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 1329 __func__, head->so_pcb); 1330 return (NULL); 1331 } 1332 if ((*so->so_proto->pr_attach)(so, 0, NULL)) { 1333 sodealloc(so); 1334 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 1335 __func__, head->so_pcb); 1336 return (NULL); 1337 } 1338 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 1339 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 1340 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 1341 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 1342 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 1343 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 1344 if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 1345 so->so_snd.sb_mtx = &so->so_snd_mtx; 1346 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 1347 } 1348 1349 soref(so); 1350 1351 return (so); 1352 } 1353 #endif /* SCTP */ 1354 1355 int 1356 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 1357 { 1358 int error; 1359 1360 CURVNET_SET(so->so_vnet); 1361 error = so->so_proto->pr_bind(so, nam, td); 1362 
CURVNET_RESTORE(); 1363 return (error); 1364 } 1365 1366 int 1367 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 1368 { 1369 int error; 1370 1371 CURVNET_SET(so->so_vnet); 1372 error = so->so_proto->pr_bindat(fd, so, nam, td); 1373 CURVNET_RESTORE(); 1374 return (error); 1375 } 1376 1377 /* 1378 * solisten() transitions a socket from a non-listening state to a listening 1379 * state, but can also be used to update the listen queue depth on an 1380 * existing listen socket. The protocol will call back into the sockets 1381 * layer using solisten_proto_check() and solisten_proto() to check and set 1382 * socket-layer listen state. Call backs are used so that the protocol can 1383 * acquire both protocol and socket layer locks in whatever order is required 1384 * by the protocol. 1385 * 1386 * Protocol implementors are advised to hold the socket lock across the 1387 * socket-layer test and set to avoid races at the socket layer. 1388 */ 1389 int 1390 solisten(struct socket *so, int backlog, struct thread *td) 1391 { 1392 int error; 1393 1394 CURVNET_SET(so->so_vnet); 1395 error = so->so_proto->pr_listen(so, backlog, td); 1396 CURVNET_RESTORE(); 1397 return (error); 1398 } 1399 1400 /* 1401 * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in 1402 * order to interlock with socket I/O. 1403 */ 1404 int 1405 solisten_proto_check(struct socket *so) 1406 { 1407 SOCK_LOCK_ASSERT(so); 1408 1409 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1410 SS_ISDISCONNECTING)) != 0) 1411 return (EINVAL); 1412 1413 /* 1414 * Sleeping is not permitted here, so simply fail if userspace is 1415 * attempting to transmit or receive on the socket. This kind of 1416 * transient failure is not ideal, but it should occur only if userspace 1417 * is misusing the socket interfaces. 1418 */ 1419 if (!sx_try_xlock(&so->so_snd_sx)) 1420 return (EAGAIN); 1421 if (!sx_try_xlock(&so->so_rcv_sx)) { 1422 sx_xunlock(&so->so_snd_sx); 1423 return (EAGAIN); 1424 } 1425 mtx_lock(&so->so_snd_mtx); 1426 mtx_lock(&so->so_rcv_mtx); 1427 1428 /* Interlock with soo_aio_queue() and KTLS. */ 1429 if (!SOLISTENING(so)) { 1430 bool ktls; 1431 1432 #ifdef KERN_TLS 1433 ktls = so->so_snd.sb_tls_info != NULL || 1434 so->so_rcv.sb_tls_info != NULL; 1435 #else 1436 ktls = false; 1437 #endif 1438 if (ktls || 1439 (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || 1440 (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { 1441 solisten_proto_abort(so); 1442 return (EINVAL); 1443 } 1444 } 1445 1446 return (0); 1447 } 1448 1449 /* 1450 * Undo the setup done by solisten_proto_check(). 1451 */ 1452 void 1453 solisten_proto_abort(struct socket *so) 1454 { 1455 mtx_unlock(&so->so_snd_mtx); 1456 mtx_unlock(&so->so_rcv_mtx); 1457 sx_xunlock(&so->so_snd_sx); 1458 sx_xunlock(&so->so_rcv_sx); 1459 } 1460 1461 void 1462 solisten_proto(struct socket *so, int backlog) 1463 { 1464 int sbrcv_lowat, sbsnd_lowat; 1465 u_int sbrcv_hiwat, sbsnd_hiwat; 1466 short sbrcv_flags, sbsnd_flags; 1467 sbintime_t sbrcv_timeo, sbsnd_timeo; 1468 1469 SOCK_LOCK_ASSERT(so); 1470 KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1471 SS_ISDISCONNECTING)) == 0, 1472 ("%s: bad socket state %p", __func__, so)); 1473 1474 if (SOLISTENING(so)) 1475 goto listening; 1476 1477 /* 1478 * Change this socket to listening state. 
1479 */ 1480 sbrcv_lowat = so->so_rcv.sb_lowat; 1481 sbsnd_lowat = so->so_snd.sb_lowat; 1482 sbrcv_hiwat = so->so_rcv.sb_hiwat; 1483 sbsnd_hiwat = so->so_snd.sb_hiwat; 1484 sbrcv_flags = so->so_rcv.sb_flags; 1485 sbsnd_flags = so->so_snd.sb_flags; 1486 sbrcv_timeo = so->so_rcv.sb_timeo; 1487 sbsnd_timeo = so->so_snd.sb_timeo; 1488 1489 #ifdef MAC 1490 mac_socketpeer_label_free(so->so_peerlabel); 1491 #endif 1492 1493 if (!(so->so_proto->pr_flags & PR_SOCKBUF)) { 1494 sbdestroy(so, SO_SND); 1495 sbdestroy(so, SO_RCV); 1496 } 1497 1498 #ifdef INVARIANTS 1499 bzero(&so->so_rcv, 1500 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 1501 #endif 1502 1503 so->sol_sbrcv_lowat = sbrcv_lowat; 1504 so->sol_sbsnd_lowat = sbsnd_lowat; 1505 so->sol_sbrcv_hiwat = sbrcv_hiwat; 1506 so->sol_sbsnd_hiwat = sbsnd_hiwat; 1507 so->sol_sbrcv_flags = sbrcv_flags; 1508 so->sol_sbsnd_flags = sbsnd_flags; 1509 so->sol_sbrcv_timeo = sbrcv_timeo; 1510 so->sol_sbsnd_timeo = sbsnd_timeo; 1511 1512 so->sol_qlen = so->sol_incqlen = 0; 1513 TAILQ_INIT(&so->sol_incomp); 1514 TAILQ_INIT(&so->sol_comp); 1515 1516 so->sol_accept_filter = NULL; 1517 so->sol_accept_filter_arg = NULL; 1518 so->sol_accept_filter_str = NULL; 1519 1520 so->sol_upcall = NULL; 1521 so->sol_upcallarg = NULL; 1522 1523 so->so_options |= SO_ACCEPTCONN; 1524 1525 listening: 1526 if (backlog < 0 || backlog > somaxconn) 1527 backlog = somaxconn; 1528 so->sol_qlimit = backlog; 1529 1530 mtx_unlock(&so->so_snd_mtx); 1531 mtx_unlock(&so->so_rcv_mtx); 1532 sx_xunlock(&so->so_snd_sx); 1533 sx_xunlock(&so->so_rcv_sx); 1534 } 1535 1536 /* 1537 * Wakeup listeners/subsystems once we have a complete connection. 1538 * Enters with lock, returns unlocked. 1539 */ 1540 void 1541 solisten_wakeup(struct socket *sol) 1542 { 1543 1544 if (sol->sol_upcall != NULL) 1545 (void)sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 1546 else { 1547 selwakeuppri(&sol->so_rdsel, PSOCK); 1548 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 1549 } 1550 SOLISTEN_UNLOCK(sol); 1551 wakeup_one(&sol->sol_comp); 1552 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) 1553 pgsigio(&sol->so_sigio, SIGIO, 0); 1554 } 1555 1556 /* 1557 * Return a single connection off a listening socket queue. The main consumer 1558 * of the function is kern_accept4(). Some modules that do their own accept 1559 * management also use the function. The socket reference held by the 1560 * listen queue is handed to the caller. 1561 * 1562 * Listening socket must be locked on entry and is returned unlocked on 1563 * return. 1564 * The flags argument is a set of accept4(2) flags and ACCEPT4_INHERIT.
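 *
 * A consumer other than kern_accept4() might use it roughly as follows
 * (sketch only; error handling and address retrieval are omitted):
 *
 *	SOLISTEN_LOCK(head);
 *	error = solisten_dequeue(head, &so, 0);
 *	if (error != 0)
 *		return (error);
 *	... use 'so', then dispose of it with soclose() when finished ...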
1565 */ 1566 int 1567 solisten_dequeue(struct socket *head, struct socket **ret, int flags) 1568 { 1569 struct socket *so; 1570 int error; 1571 1572 SOLISTEN_LOCK_ASSERT(head); 1573 1574 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && 1575 head->so_error == 0) { 1576 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, 1577 "accept", 0); 1578 if (error != 0) { 1579 SOLISTEN_UNLOCK(head); 1580 return (error); 1581 } 1582 } 1583 if (head->so_error) { 1584 error = head->so_error; 1585 head->so_error = 0; 1586 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) 1587 error = EWOULDBLOCK; 1588 else 1589 error = 0; 1590 if (error) { 1591 SOLISTEN_UNLOCK(head); 1592 return (error); 1593 } 1594 so = TAILQ_FIRST(&head->sol_comp); 1595 SOCK_LOCK(so); 1596 KASSERT(so->so_qstate == SQ_COMP, 1597 ("%s: so %p not SQ_COMP", __func__, so)); 1598 head->sol_qlen--; 1599 so->so_qstate = SQ_NONE; 1600 so->so_listen = NULL; 1601 TAILQ_REMOVE(&head->sol_comp, so, so_list); 1602 if (flags & ACCEPT4_INHERIT) 1603 so->so_state |= (head->so_state & SS_NBIO); 1604 else 1605 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 1606 SOCK_UNLOCK(so); 1607 sorele_locked(head); 1608 1609 *ret = so; 1610 return (0); 1611 } 1612 1613 static struct so_splice * 1614 so_splice_alloc(off_t max) 1615 { 1616 struct so_splice *sp; 1617 1618 sp = uma_zalloc(splice_zone, M_WAITOK); 1619 sp->src = NULL; 1620 sp->dst = NULL; 1621 sp->max = max > 0 ? max : -1; 1622 do { 1623 sp->wq_index = atomic_fetchadd_32(&splice_index, 1) % 1624 (mp_maxid + 1); 1625 } while (CPU_ABSENT(sp->wq_index)); 1626 sp->state = SPLICE_IDLE; 1627 TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout, 1628 sp); 1629 return (sp); 1630 } 1631 1632 static void 1633 so_splice_free(struct so_splice *sp) 1634 { 1635 KASSERT(sp->state == SPLICE_CLOSED, 1636 ("so_splice_free: sp %p not closed", sp)); 1637 uma_zfree(splice_zone, sp); 1638 } 1639 1640 static void 1641 so_splice_timeout(void *arg, int pending __unused) 1642 { 1643 struct so_splice *sp; 1644 1645 sp = arg; 1646 (void)so_unsplice(sp->src, true); 1647 } 1648 1649 /* 1650 * Splice the output from so to the input of so2. 1651 */ 1652 static int 1653 so_splice(struct socket *so, struct socket *so2, struct splice *splice) 1654 { 1655 struct so_splice *sp; 1656 int error; 1657 1658 if (splice->sp_max < 0) 1659 return (EINVAL); 1660 /* Handle only TCP for now; TODO: other streaming protos */ 1661 if (so->so_proto->pr_protocol != IPPROTO_TCP || 1662 so2->so_proto->pr_protocol != IPPROTO_TCP) 1663 return (EPROTONOSUPPORT); 1664 if (so->so_vnet != so2->so_vnet) 1665 return (EINVAL); 1666 1667 /* so_splice_xfer() assumes that we're using these implementations. 
*/ 1668 KASSERT(so->so_proto->pr_sosend == sosend_generic, 1669 ("so_splice: sosend not sosend_generic")); 1670 KASSERT(so2->so_proto->pr_soreceive == soreceive_generic || 1671 so2->so_proto->pr_soreceive == soreceive_stream, 1672 ("so_splice: soreceive not soreceive_generic/stream")); 1673 1674 sp = so_splice_alloc(splice->sp_max); 1675 so->so_splice_sent = 0; 1676 sp->src = so; 1677 sp->dst = so2; 1678 1679 error = 0; 1680 SOCK_LOCK(so); 1681 if (SOLISTENING(so)) 1682 error = EINVAL; 1683 else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) 1684 error = ENOTCONN; 1685 else if (so->so_splice != NULL) 1686 error = EBUSY; 1687 if (error != 0) { 1688 SOCK_UNLOCK(so); 1689 uma_zfree(splice_zone, sp); 1690 return (error); 1691 } 1692 soref(so); 1693 so->so_splice = sp; 1694 SOCK_RECVBUF_LOCK(so); 1695 so->so_rcv.sb_flags |= SB_SPLICED; 1696 SOCK_RECVBUF_UNLOCK(so); 1697 SOCK_UNLOCK(so); 1698 1699 error = 0; 1700 SOCK_LOCK(so2); 1701 if (SOLISTENING(so2)) 1702 error = EINVAL; 1703 else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) 1704 error = ENOTCONN; 1705 else if (so2->so_splice_back != NULL) 1706 error = EBUSY; 1707 if (error != 0) { 1708 SOCK_UNLOCK(so2); 1709 SOCK_LOCK(so); 1710 so->so_splice = NULL; 1711 SOCK_RECVBUF_LOCK(so); 1712 so->so_rcv.sb_flags &= ~SB_SPLICED; 1713 SOCK_RECVBUF_UNLOCK(so); 1714 SOCK_UNLOCK(so); 1715 sorele(so); 1716 uma_zfree(splice_zone, sp); 1717 return (error); 1718 } 1719 soref(so2); 1720 so2->so_splice_back = sp; 1721 SOCK_SENDBUF_LOCK(so2); 1722 so2->so_snd.sb_flags |= SB_SPLICED; 1723 mtx_lock(&sp->mtx); 1724 SOCK_SENDBUF_UNLOCK(so2); 1725 SOCK_UNLOCK(so2); 1726 1727 if (splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) { 1728 taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout, 1729 tvtosbt(splice->sp_idle), 0, C_PREL(4)); 1730 } 1731 1732 /* 1733 * Transfer any data already present in the socket buffer. 1734 */ 1735 sp->state = SPLICE_QUEUED; 1736 so_splice_xfer(sp); 1737 return (0); 1738 } 1739 1740 static int 1741 so_unsplice(struct socket *so, bool timeout) 1742 { 1743 struct socket *so2; 1744 struct so_splice *sp; 1745 bool drain; 1746 1747 /* 1748 * First unset SB_SPLICED and hide the splice structure so that 1749 * wakeup routines will stop enqueuing work. This also ensures that 1750 * only a single thread will proceed with the unsplice. 1751 */ 1752 SOCK_LOCK(so); 1753 if (SOLISTENING(so)) { 1754 SOCK_UNLOCK(so); 1755 return (EINVAL); 1756 } 1757 SOCK_RECVBUF_LOCK(so); 1758 if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) { 1759 SOCK_RECVBUF_UNLOCK(so); 1760 SOCK_UNLOCK(so); 1761 return (ENOTCONN); 1762 } 1763 so->so_rcv.sb_flags &= ~SB_SPLICED; 1764 sp = so->so_splice; 1765 so->so_splice = NULL; 1766 SOCK_RECVBUF_UNLOCK(so); 1767 SOCK_UNLOCK(so); 1768 1769 so2 = sp->dst; 1770 SOCK_LOCK(so2); 1771 KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__)); 1772 SOCK_SENDBUF_LOCK(so2); 1773 KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0, 1774 ("%s: so2 is not spliced", __func__)); 1775 KASSERT(so2->so_splice_back == sp, 1776 ("%s: so_splice_back != sp", __func__)); 1777 so2->so_snd.sb_flags &= ~SB_SPLICED; 1778 so2->so_splice_back = NULL; 1779 SOCK_SENDBUF_UNLOCK(so2); 1780 SOCK_UNLOCK(so2); 1781 1782 /* 1783 * No new work is being enqueued. The worker thread might be 1784 * splicing data right now, in which case we want to wait for it to 1785 * finish before proceeding.
1786 */ 1787 mtx_lock(&sp->mtx); 1788 switch (sp->state) { 1789 case SPLICE_QUEUED: 1790 case SPLICE_RUNNING: 1791 sp->state = SPLICE_CLOSING; 1792 while (sp->state == SPLICE_CLOSING) 1793 msleep(sp, &sp->mtx, PSOCK, "unsplice", 0); 1794 break; 1795 case SPLICE_IDLE: 1796 case SPLICE_EXCEPTION: 1797 sp->state = SPLICE_CLOSED; 1798 break; 1799 default: 1800 __assert_unreachable(); 1801 } 1802 if (!timeout) { 1803 drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout, 1804 NULL) != 0; 1805 } else { 1806 drain = false; 1807 } 1808 mtx_unlock(&sp->mtx); 1809 if (drain) 1810 taskqueue_drain_timeout(taskqueue_thread, &sp->timeout); 1811 1812 /* 1813 * Now we hold the sole reference to the splice structure. 1814 * Clean up: signal userspace and release socket references. 1815 */ 1816 sorwakeup(so); 1817 CURVNET_SET(so->so_vnet); 1818 sorele(so); 1819 sowwakeup(so2); 1820 sorele(so2); 1821 CURVNET_RESTORE(); 1822 so_splice_free(sp); 1823 return (0); 1824 } 1825 1826 /* 1827 * Free socket upon release of the very last reference. 1828 */ 1829 static void 1830 sofree(struct socket *so) 1831 { 1832 struct protosw *pr = so->so_proto; 1833 1834 SOCK_LOCK_ASSERT(so); 1835 KASSERT(refcount_load(&so->so_count) == 0, 1836 ("%s: so %p has references", __func__, so)); 1837 KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, 1838 ("%s: so %p is on listen queue", __func__, so)); 1839 KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0, 1840 ("%s: so %p rcvbuf is spliced", __func__, so)); 1841 KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0, 1842 ("%s: so %p sndbuf is spliced", __func__, so)); 1843 KASSERT(so->so_splice == NULL && so->so_splice_back == NULL, 1844 ("%s: so %p has spliced data", __func__, so)); 1845 1846 SOCK_UNLOCK(so); 1847 1848 if (so->so_dtor != NULL) 1849 so->so_dtor(so); 1850 1851 VNET_SO_ASSERT(so); 1852 if (pr->pr_detach != NULL) 1853 pr->pr_detach(so); 1854 1855 /* 1856 * From this point on, we assume that no other references to this 1857 * socket exist anywhere else in the stack. Therefore, no locks need 1858 * to be acquired or held. 1859 */ 1860 if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { 1861 sbdestroy(so, SO_SND); 1862 sbdestroy(so, SO_RCV); 1863 } 1864 seldrain(&so->so_rdsel); 1865 seldrain(&so->so_wrsel); 1866 knlist_destroy(&so->so_rdsel.si_note); 1867 knlist_destroy(&so->so_wrsel.si_note); 1868 sodealloc(so); 1869 } 1870 1871 /* 1872 * Release a reference on a socket while holding the socket lock. 1873 * Unlocks the socket lock before returning. 1874 */ 1875 void 1876 sorele_locked(struct socket *so) 1877 { 1878 SOCK_LOCK_ASSERT(so); 1879 if (refcount_release(&so->so_count)) 1880 sofree(so); 1881 else 1882 SOCK_UNLOCK(so); 1883 } 1884 1885 /* 1886 * Close a socket on last file table reference removal. Initiate disconnect 1887 * if connected. Free socket when disconnect complete. 1888 * 1889 * This function will sorele() the socket. Note that soclose() may be called 1890 * prior to the ref count reaching zero. The actual socket structure will 1891 * not be freed until the ref count reaches zero. 
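 *
 * If SO_LINGER is set with a non-zero linger time, soclose() may sleep for
 * up to so_linger seconds waiting for the disconnect to complete (see the
 * tsleep() loop below); if the socket is non-blocking and already
 * disconnecting, it skips that wait.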
1892 */ 1893 int 1894 soclose(struct socket *so) 1895 { 1896 struct accept_queue lqueue; 1897 int error = 0; 1898 bool listening, last __diagused; 1899 1900 CURVNET_SET(so->so_vnet); 1901 funsetown(&so->so_sigio); 1902 if (so->so_state & SS_ISCONNECTED) { 1903 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1904 error = sodisconnect(so); 1905 if (error) { 1906 if (error == ENOTCONN) 1907 error = 0; 1908 goto drop; 1909 } 1910 } 1911 1912 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { 1913 if ((so->so_state & SS_ISDISCONNECTING) && 1914 (so->so_state & SS_NBIO)) 1915 goto drop; 1916 while (so->so_state & SS_ISCONNECTED) { 1917 error = tsleep(&so->so_timeo, 1918 PSOCK | PCATCH, "soclos", 1919 so->so_linger * hz); 1920 if (error) 1921 break; 1922 } 1923 } 1924 } 1925 1926 drop: 1927 if (so->so_proto->pr_close != NULL) 1928 so->so_proto->pr_close(so); 1929 1930 SOCK_LOCK(so); 1931 if ((listening = SOLISTENING(so))) { 1932 struct socket *sp; 1933 1934 TAILQ_INIT(&lqueue); 1935 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 1936 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 1937 1938 so->sol_qlen = so->sol_incqlen = 0; 1939 1940 TAILQ_FOREACH(sp, &lqueue, so_list) { 1941 SOCK_LOCK(sp); 1942 sp->so_qstate = SQ_NONE; 1943 sp->so_listen = NULL; 1944 SOCK_UNLOCK(sp); 1945 last = refcount_release(&so->so_count); 1946 KASSERT(!last, ("%s: released last reference for %p", 1947 __func__, so)); 1948 } 1949 } 1950 sorele_locked(so); 1951 if (listening) { 1952 struct socket *sp, *tsp; 1953 1954 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) 1955 soabort(sp); 1956 } 1957 CURVNET_RESTORE(); 1958 return (error); 1959 } 1960 1961 /* 1962 * soabort() is used to abruptly tear down a connection, such as when a 1963 * resource limit is reached (listen queue depth exceeded), or if a listen 1964 * socket is closed while there are sockets waiting to be accepted. 1965 * 1966 * This interface is tricky, because it is called on an unreferenced socket, 1967 * and must be called only by a thread that has actually removed the socket 1968 * from the listen queue it was on. Likely this thread holds the last 1969 * reference on the socket and soabort() will proceed with sofree(). But 1970 * it might be not the last, as the sockets on the listen queues are seen 1971 * from the protocol side. 1972 * 1973 * This interface will call into the protocol code, so must not be called 1974 * with any socket locks held. Protocols do call it while holding their own 1975 * recursible protocol mutexes, but this is something that should be subject 1976 * to review in the future. 1977 * 1978 * Usually socket should have a single reference left, but this is not a 1979 * requirement. In the past, when we have had named references for file 1980 * descriptor and protocol, we asserted that none of them are being held. 
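 *
 * In any case the effect of the function below is simple: invoke the
 * protocol's pr_abort() method, if one is provided, and then drop one
 * reference via sorele_locked(), which frees the socket through
 * sofree() if that reference was in fact the last one.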
1981 */ 1982 void 1983 soabort(struct socket *so) 1984 { 1985 1986 VNET_SO_ASSERT(so); 1987 1988 if (so->so_proto->pr_abort != NULL) 1989 so->so_proto->pr_abort(so); 1990 SOCK_LOCK(so); 1991 sorele_locked(so); 1992 } 1993 1994 int 1995 soaccept(struct socket *so, struct sockaddr *sa) 1996 { 1997 #ifdef INVARIANTS 1998 u_char len = sa->sa_len; 1999 #endif 2000 int error; 2001 2002 CURVNET_SET(so->so_vnet); 2003 error = so->so_proto->pr_accept(so, sa); 2004 KASSERT(sa->sa_len <= len, 2005 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2006 CURVNET_RESTORE(); 2007 return (error); 2008 } 2009 2010 int 2011 sopeeraddr(struct socket *so, struct sockaddr *sa) 2012 { 2013 #ifdef INVARIANTS 2014 u_char len = sa->sa_len; 2015 #endif 2016 int error; 2017 2018 CURVNET_SET(so->so_vnet); 2019 error = so->so_proto->pr_peeraddr(so, sa); 2020 KASSERT(sa->sa_len <= len, 2021 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2022 CURVNET_RESTORE(); 2023 2024 return (error); 2025 } 2026 2027 int 2028 sosockaddr(struct socket *so, struct sockaddr *sa) 2029 { 2030 #ifdef INVARIANTS 2031 u_char len = sa->sa_len; 2032 #endif 2033 int error; 2034 2035 CURVNET_SET(so->so_vnet); 2036 error = so->so_proto->pr_sockaddr(so, sa); 2037 KASSERT(sa->sa_len <= len, 2038 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2039 CURVNET_RESTORE(); 2040 2041 return (error); 2042 } 2043 2044 int 2045 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 2046 { 2047 2048 return (soconnectat(AT_FDCWD, so, nam, td)); 2049 } 2050 2051 int 2052 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 2053 { 2054 int error; 2055 2056 CURVNET_SET(so->so_vnet); 2057 2058 /* 2059 * If protocol is connection-based, can only connect once. 2060 * Otherwise, if connected, try to disconnect first. This allows 2061 * user to disconnect by connecting to, e.g., a null address. 2062 * 2063 * Note, this check is racy and may need to be re-evaluated at the 2064 * protocol layer. 2065 */ 2066 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 2067 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 2068 (error = sodisconnect(so)))) { 2069 error = EISCONN; 2070 } else { 2071 /* 2072 * Prevent accumulated error from previous connection from 2073 * biting us. 
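 * A previous, failed attempt may have left so_error set (for example
 * to ECONNREFUSED); if it were not cleared here, the stale value could
 * be reported against this new attempt.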
2074 */ 2075 so->so_error = 0; 2076 if (fd == AT_FDCWD) { 2077 error = so->so_proto->pr_connect(so, nam, td); 2078 } else { 2079 error = so->so_proto->pr_connectat(fd, so, nam, td); 2080 } 2081 } 2082 CURVNET_RESTORE(); 2083 2084 return (error); 2085 } 2086 2087 int 2088 soconnect2(struct socket *so1, struct socket *so2) 2089 { 2090 int error; 2091 2092 CURVNET_SET(so1->so_vnet); 2093 error = so1->so_proto->pr_connect2(so1, so2); 2094 CURVNET_RESTORE(); 2095 return (error); 2096 } 2097 2098 int 2099 sodisconnect(struct socket *so) 2100 { 2101 int error; 2102 2103 if ((so->so_state & SS_ISCONNECTED) == 0) 2104 return (ENOTCONN); 2105 if (so->so_state & SS_ISDISCONNECTING) 2106 return (EALREADY); 2107 VNET_SO_ASSERT(so); 2108 error = so->so_proto->pr_disconnect(so); 2109 return (error); 2110 } 2111 2112 int 2113 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 2114 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2115 { 2116 long space; 2117 ssize_t resid; 2118 int clen = 0, error, dontroute; 2119 2120 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 2121 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 2122 ("sosend_dgram: !PR_ATOMIC")); 2123 2124 if (uio != NULL) 2125 resid = uio->uio_resid; 2126 else 2127 resid = top->m_pkthdr.len; 2128 /* 2129 * In theory resid should be unsigned. However, space must be 2130 * signed, as it might be less than 0 if we over-committed, and we 2131 * must use a signed comparison of space and resid. On the other 2132 * hand, a negative resid causes us to loop sending 0-length 2133 * segments to the protocol. 2134 */ 2135 if (resid < 0) { 2136 error = EINVAL; 2137 goto out; 2138 } 2139 2140 dontroute = 2141 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 2142 if (td != NULL) 2143 td->td_ru.ru_msgsnd++; 2144 if (control != NULL) 2145 clen = control->m_len; 2146 2147 SOCKBUF_LOCK(&so->so_snd); 2148 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2149 SOCKBUF_UNLOCK(&so->so_snd); 2150 error = EPIPE; 2151 goto out; 2152 } 2153 if (so->so_error) { 2154 error = so->so_error; 2155 so->so_error = 0; 2156 SOCKBUF_UNLOCK(&so->so_snd); 2157 goto out; 2158 } 2159 if ((so->so_state & SS_ISCONNECTED) == 0) { 2160 /* 2161 * `sendto' and `sendmsg' is allowed on a connection-based 2162 * socket if it supports implied connect. Return ENOTCONN if 2163 * not connected and no address is supplied. 2164 */ 2165 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2166 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2167 if (!(resid == 0 && clen != 0)) { 2168 SOCKBUF_UNLOCK(&so->so_snd); 2169 error = ENOTCONN; 2170 goto out; 2171 } 2172 } else if (addr == NULL) { 2173 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2174 error = ENOTCONN; 2175 else 2176 error = EDESTADDRREQ; 2177 SOCKBUF_UNLOCK(&so->so_snd); 2178 goto out; 2179 } 2180 } 2181 2182 /* 2183 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 2184 * problem and need fixing. 2185 */ 2186 space = sbspace(&so->so_snd); 2187 if (flags & MSG_OOB) 2188 space += 1024; 2189 space -= clen; 2190 SOCKBUF_UNLOCK(&so->so_snd); 2191 if (resid > space) { 2192 error = EMSGSIZE; 2193 goto out; 2194 } 2195 if (uio == NULL) { 2196 resid = 0; 2197 if (flags & MSG_EOR) 2198 top->m_flags |= M_EOR; 2199 } else { 2200 /* 2201 * Copy the data from userland into a mbuf chain. 2202 * If no data is to be copied in, a single empty mbuf 2203 * is returned. 2204 */ 2205 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 2206 (M_PKTHDR | ((flags & MSG_EOR) ? 
M_EOR : 0))); 2207 if (top == NULL) { 2208 error = EFAULT; /* only possible error */ 2209 goto out; 2210 } 2211 space -= resid - uio->uio_resid; 2212 resid = uio->uio_resid; 2213 } 2214 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 2215 /* 2216 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 2217 * than with. 2218 */ 2219 if (dontroute) { 2220 SOCK_LOCK(so); 2221 so->so_options |= SO_DONTROUTE; 2222 SOCK_UNLOCK(so); 2223 } 2224 /* 2225 * XXX all the SBS_CANTSENDMORE checks previously done could be out 2226 * of date. We could have received a reset packet in an interrupt or 2227 * maybe we slept while doing page faults in uiomove() etc. We could 2228 * probably recheck again inside the locking protection here, but 2229 * there are probably other places that this also happens. We must 2230 * rethink this. 2231 */ 2232 VNET_SO_ASSERT(so); 2233 error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : 2234 /* 2235 * If the user set MSG_EOF, the protocol understands this flag and 2236 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 2237 */ 2238 ((flags & MSG_EOF) && 2239 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2240 (resid <= 0)) ? 2241 PRUS_EOF : 2242 /* If there is more to send set PRUS_MORETOCOME */ 2243 (flags & MSG_MORETOCOME) || 2244 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 2245 top, addr, control, td); 2246 if (dontroute) { 2247 SOCK_LOCK(so); 2248 so->so_options &= ~SO_DONTROUTE; 2249 SOCK_UNLOCK(so); 2250 } 2251 clen = 0; 2252 control = NULL; 2253 top = NULL; 2254 out: 2255 if (top != NULL) 2256 m_freem(top); 2257 if (control != NULL) 2258 m_freem(control); 2259 return (error); 2260 } 2261 2262 /* 2263 * Send on a socket. If send must go all at once and message is larger than 2264 * send buffering, then hard error. Lock against other senders. If must go 2265 * all at once and not enough room now, then inform user that this would 2266 * block and do nothing. Otherwise, if nonblocking, send as much as 2267 * possible. The data to be sent is described by "uio" if nonzero, otherwise 2268 * by the mbuf chain "top" (which must be null if uio is not). Data provided 2269 * in mbuf chain must be small enough to send all at once. 2270 * 2271 * Returns nonzero on error, timeout or signal; callers must check for short 2272 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 2273 * on return. 2274 */ 2275 static int 2276 sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, 2277 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2278 { 2279 long space; 2280 ssize_t resid; 2281 int clen = 0, error, dontroute; 2282 int atomic = sosendallatonce(so) || top; 2283 int pr_send_flag; 2284 #ifdef KERN_TLS 2285 struct ktls_session *tls; 2286 int tls_enq_cnt, tls_send_flag; 2287 uint8_t tls_rtype; 2288 2289 tls = NULL; 2290 tls_rtype = TLS_RLTYPE_APP; 2291 #endif 2292 2293 SOCK_IO_SEND_ASSERT_LOCKED(so); 2294 2295 if (uio != NULL) 2296 resid = uio->uio_resid; 2297 else if ((top->m_flags & M_PKTHDR) != 0) 2298 resid = top->m_pkthdr.len; 2299 else 2300 resid = m_length(top, NULL); 2301 /* 2302 * In theory resid should be unsigned. However, space must be 2303 * signed, as it might be less than 0 if we over-committed, and we 2304 * must use a signed comparison of space and resid. On the other 2305 * hand, a negative resid causes us to loop sending 0-length 2306 * segments to the protocol. 
2307 * 2308 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 2309 * type sockets since that's an error. 2310 */ 2311 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 2312 error = EINVAL; 2313 goto out; 2314 } 2315 2316 dontroute = 2317 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 2318 (so->so_proto->pr_flags & PR_ATOMIC); 2319 if (td != NULL) 2320 td->td_ru.ru_msgsnd++; 2321 if (control != NULL) 2322 clen = control->m_len; 2323 2324 #ifdef KERN_TLS 2325 tls_send_flag = 0; 2326 tls = ktls_hold(so->so_snd.sb_tls_info); 2327 if (tls != NULL) { 2328 if (tls->mode == TCP_TLS_MODE_SW) 2329 tls_send_flag = PRUS_NOTREADY; 2330 2331 if (control != NULL) { 2332 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 2333 2334 if (clen >= sizeof(*cm) && 2335 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 2336 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 2337 clen = 0; 2338 m_freem(control); 2339 control = NULL; 2340 atomic = 1; 2341 } 2342 } 2343 2344 if (resid == 0 && !ktls_permit_empty_frames(tls)) { 2345 error = EINVAL; 2346 goto out; 2347 } 2348 } 2349 #endif 2350 2351 restart: 2352 do { 2353 SOCKBUF_LOCK(&so->so_snd); 2354 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2355 SOCKBUF_UNLOCK(&so->so_snd); 2356 error = EPIPE; 2357 goto out; 2358 } 2359 if (so->so_error) { 2360 error = so->so_error; 2361 so->so_error = 0; 2362 SOCKBUF_UNLOCK(&so->so_snd); 2363 goto out; 2364 } 2365 if ((so->so_state & SS_ISCONNECTED) == 0) { 2366 /* 2367 * `sendto' and `sendmsg' is allowed on a connection- 2368 * based socket if it supports implied connect. 2369 * Return ENOTCONN if not connected and no address is 2370 * supplied. 2371 */ 2372 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2373 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2374 if (!(resid == 0 && clen != 0)) { 2375 SOCKBUF_UNLOCK(&so->so_snd); 2376 error = ENOTCONN; 2377 goto out; 2378 } 2379 } else if (addr == NULL) { 2380 SOCKBUF_UNLOCK(&so->so_snd); 2381 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2382 error = ENOTCONN; 2383 else 2384 error = EDESTADDRREQ; 2385 goto out; 2386 } 2387 } 2388 space = sbspace(&so->so_snd); 2389 if (flags & MSG_OOB) 2390 space += 1024; 2391 if ((atomic && resid > so->so_snd.sb_hiwat) || 2392 clen > so->so_snd.sb_hiwat) { 2393 SOCKBUF_UNLOCK(&so->so_snd); 2394 error = EMSGSIZE; 2395 goto out; 2396 } 2397 if (space < resid + clen && 2398 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 2399 if ((so->so_state & SS_NBIO) || 2400 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 2401 SOCKBUF_UNLOCK(&so->so_snd); 2402 error = EWOULDBLOCK; 2403 goto out; 2404 } 2405 error = sbwait(so, SO_SND); 2406 SOCKBUF_UNLOCK(&so->so_snd); 2407 if (error) 2408 goto out; 2409 goto restart; 2410 } 2411 SOCKBUF_UNLOCK(&so->so_snd); 2412 space -= clen; 2413 do { 2414 if (uio == NULL) { 2415 resid = 0; 2416 if (flags & MSG_EOR) 2417 top->m_flags |= M_EOR; 2418 #ifdef KERN_TLS 2419 if (tls != NULL) { 2420 ktls_frame(top, tls, &tls_enq_cnt, 2421 tls_rtype); 2422 tls_rtype = TLS_RLTYPE_APP; 2423 } 2424 #endif 2425 } else { 2426 /* 2427 * Copy the data from userland into a mbuf 2428 * chain. If resid is 0, which can happen 2429 * only if we have control to send, then 2430 * a single empty mbuf is returned. This 2431 * is a workaround to prevent protocol send 2432 * methods to panic. 2433 */ 2434 #ifdef KERN_TLS 2435 if (tls != NULL) { 2436 top = m_uiotombuf(uio, M_WAITOK, space, 2437 tls->params.max_frame_len, 2438 M_EXTPG | 2439 ((flags & MSG_EOR) ? 
M_EOR : 0)); 2440 if (top != NULL) { 2441 ktls_frame(top, tls, 2442 &tls_enq_cnt, tls_rtype); 2443 } 2444 tls_rtype = TLS_RLTYPE_APP; 2445 } else 2446 #endif 2447 top = m_uiotombuf(uio, M_WAITOK, space, 2448 (atomic ? max_hdr : 0), 2449 (atomic ? M_PKTHDR : 0) | 2450 ((flags & MSG_EOR) ? M_EOR : 0)); 2451 if (top == NULL) { 2452 error = EFAULT; /* only possible error */ 2453 goto out; 2454 } 2455 space -= resid - uio->uio_resid; 2456 resid = uio->uio_resid; 2457 } 2458 if (dontroute) { 2459 SOCK_LOCK(so); 2460 so->so_options |= SO_DONTROUTE; 2461 SOCK_UNLOCK(so); 2462 } 2463 /* 2464 * XXX all the SBS_CANTSENDMORE checks previously 2465 * done could be out of date. We could have received 2466 * a reset packet in an interrupt or maybe we slept 2467 * while doing page faults in uiomove() etc. We 2468 * could probably recheck again inside the locking 2469 * protection here, but there are probably other 2470 * places that this also happens. We must rethink 2471 * this. 2472 */ 2473 VNET_SO_ASSERT(so); 2474 2475 pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : 2476 /* 2477 * If the user set MSG_EOF, the protocol understands 2478 * this flag and nothing left to send then use 2479 * PRU_SEND_EOF instead of PRU_SEND. 2480 */ 2481 ((flags & MSG_EOF) && 2482 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2483 (resid <= 0)) ? 2484 PRUS_EOF : 2485 /* If there is more to send set PRUS_MORETOCOME. */ 2486 (flags & MSG_MORETOCOME) || 2487 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; 2488 2489 #ifdef KERN_TLS 2490 pr_send_flag |= tls_send_flag; 2491 #endif 2492 2493 error = so->so_proto->pr_send(so, pr_send_flag, top, 2494 addr, control, td); 2495 2496 if (dontroute) { 2497 SOCK_LOCK(so); 2498 so->so_options &= ~SO_DONTROUTE; 2499 SOCK_UNLOCK(so); 2500 } 2501 2502 #ifdef KERN_TLS 2503 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 2504 if (error != 0) { 2505 m_freem(top); 2506 top = NULL; 2507 } else { 2508 soref(so); 2509 ktls_enqueue(top, so, tls_enq_cnt); 2510 } 2511 } 2512 #endif 2513 clen = 0; 2514 control = NULL; 2515 top = NULL; 2516 if (error) 2517 goto out; 2518 } while (resid && space > 0); 2519 } while (resid); 2520 2521 out: 2522 #ifdef KERN_TLS 2523 if (tls != NULL) 2524 ktls_free(tls); 2525 #endif 2526 if (top != NULL) 2527 m_freem(top); 2528 if (control != NULL) 2529 m_freem(control); 2530 return (error); 2531 } 2532 2533 int 2534 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 2535 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2536 { 2537 int error; 2538 2539 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 2540 if (error) 2541 return (error); 2542 error = sosend_generic_locked(so, addr, uio, top, control, flags, td); 2543 SOCK_IO_SEND_UNLOCK(so); 2544 return (error); 2545 } 2546 2547 /* 2548 * Send to a socket from a kernel thread. 2549 * 2550 * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. 2551 * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs 2552 * to be set at all. This function should just boil down to a static inline 2553 * calling the protocol method. 2554 */ 2555 int 2556 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2557 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2558 { 2559 int error; 2560 2561 CURVNET_SET(so->so_vnet); 2562 error = so->so_proto->pr_sosend(so, addr, uio, 2563 top, control, flags, td); 2564 CURVNET_RESTORE(); 2565 return (error); 2566 } 2567 2568 /* 2569 * send(2), write(2) or aio_write(2) on a socket. 
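 *
 * Unlike sosend(), this wrapper also implements the error semantics
 * expected by userspace: transient errors (EINTR, ERESTART,
 * EWOULDBLOCK) are suppressed for stream protocols once some data has
 * been transferred (aio(4) requests handle this themselves), and EPIPE
 * raises SIGPIPE unless SO_NOSIGPIPE or MSG_NOSIGNAL is in effect.  A
 * non-NULL 'userproc' identifies an aio(4) request and redirects the
 * signal to that process instead of the sending thread.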
2570 */ 2571 int 2572 sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2573 struct mbuf *control, int flags, struct proc *userproc) 2574 { 2575 struct thread *td; 2576 ssize_t len; 2577 int error; 2578 2579 td = uio->uio_td; 2580 len = uio->uio_resid; 2581 CURVNET_SET(so->so_vnet); 2582 error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, 2583 td); 2584 CURVNET_RESTORE(); 2585 if (error != 0) { 2586 /* 2587 * Clear transient errors for stream protocols if they made 2588 * some progress. Make exclusion for aio(4) that would 2589 * schedule a new write in case of EWOULDBLOCK and clear 2590 * error itself. See soaio_process_job(). 2591 */ 2592 if (uio->uio_resid != len && 2593 (so->so_proto->pr_flags & PR_ATOMIC) == 0 && 2594 userproc == NULL && 2595 (error == ERESTART || error == EINTR || 2596 error == EWOULDBLOCK)) 2597 error = 0; 2598 /* Generation of SIGPIPE can be controlled per socket. */ 2599 if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && 2600 (flags & MSG_NOSIGNAL) == 0) { 2601 if (userproc != NULL) { 2602 /* aio(4) job */ 2603 PROC_LOCK(userproc); 2604 kern_psignal(userproc, SIGPIPE); 2605 PROC_UNLOCK(userproc); 2606 } else { 2607 PROC_LOCK(td->td_proc); 2608 tdsignal(td, SIGPIPE); 2609 PROC_UNLOCK(td->td_proc); 2610 } 2611 } 2612 } 2613 return (error); 2614 } 2615 2616 /* 2617 * The part of soreceive() that implements reading non-inline out-of-band 2618 * data from a socket. For more complete comments, see soreceive(), from 2619 * which this code originated. 2620 * 2621 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 2622 * unable to return an mbuf chain to the caller. 2623 */ 2624 static int 2625 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 2626 { 2627 struct protosw *pr = so->so_proto; 2628 struct mbuf *m; 2629 int error; 2630 2631 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 2632 VNET_SO_ASSERT(so); 2633 2634 m = m_get(M_WAITOK, MT_DATA); 2635 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 2636 if (error) 2637 goto bad; 2638 do { 2639 error = uiomove(mtod(m, void *), 2640 (int) min(uio->uio_resid, m->m_len), uio); 2641 m = m_free(m); 2642 } while (uio->uio_resid && error == 0 && m); 2643 bad: 2644 if (m != NULL) 2645 m_freem(m); 2646 return (error); 2647 } 2648 2649 /* 2650 * Following replacement or removal of the first mbuf on the first mbuf chain 2651 * of a socket buffer, push necessary state changes back into the socket 2652 * buffer so that other consumers see the values consistently. 'nextrecord' 2653 * is the callers locally stored value of the original value of 2654 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 2655 * NOTE: 'nextrecord' may be NULL. 2656 */ 2657 static __inline void 2658 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 2659 { 2660 2661 SOCKBUF_LOCK_ASSERT(sb); 2662 /* 2663 * First, update for the new value of nextrecord. If necessary, make 2664 * it the first record. 2665 */ 2666 if (sb->sb_mb != NULL) 2667 sb->sb_mb->m_nextpkt = nextrecord; 2668 else 2669 sb->sb_mb = nextrecord; 2670 2671 /* 2672 * Now update any dependent socket buffer fields to reflect the new 2673 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 2674 * addition of a second clause that takes care of the case where 2675 * sb_mb has been updated, but remains the last record. 
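 *
 * The expected caller pattern, as used by soreceive_generic_locked()
 * below when stripping a leading MT_SONAME mbuf, is roughly:
 *
 *	nextrecord = m->m_nextpkt;
 *	sbfree(&so->so_rcv, m);
 *	so->so_rcv.sb_mb = m_free(m);
 *	sockbuf_pushsync(&so->so_rcv, nextrecord);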
2676 */ 2677 if (sb->sb_mb == NULL) { 2678 sb->sb_mbtail = NULL; 2679 sb->sb_lastrecord = NULL; 2680 } else if (sb->sb_mb->m_nextpkt == NULL) 2681 sb->sb_lastrecord = sb->sb_mb; 2682 } 2683 2684 /* 2685 * Implement receive operations on a socket. We depend on the way that 2686 * records are added to the sockbuf by sbappend. In particular, each record 2687 * (mbufs linked through m_next) must begin with an address if the protocol 2688 * so specifies, followed by an optional mbuf or mbufs containing ancillary 2689 * data, and then zero or more mbufs of data. In order to allow parallelism 2690 * between network receive and copying to user space, as well as avoid 2691 * sleeping with a mutex held, we release the socket buffer mutex during the 2692 * user space copy. Although the sockbuf is locked, new data may still be 2693 * appended, and thus we must maintain consistency of the sockbuf during that 2694 * time. 2695 * 2696 * The caller may receive the data as a single mbuf chain by supplying an 2697 * mbuf **mp0 for use in returning the chain. The uio is then used only for 2698 * the count in uio_resid. 2699 */ 2700 static int 2701 soreceive_generic_locked(struct socket *so, struct sockaddr **psa, 2702 struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) 2703 { 2704 struct mbuf *m; 2705 int flags, error, offset; 2706 ssize_t len; 2707 struct protosw *pr = so->so_proto; 2708 struct mbuf *nextrecord; 2709 int moff, type = 0; 2710 ssize_t orig_resid = uio->uio_resid; 2711 bool report_real_len = false; 2712 2713 SOCK_IO_RECV_ASSERT_LOCKED(so); 2714 2715 error = 0; 2716 if (flagsp != NULL) { 2717 report_real_len = *flagsp & MSG_TRUNC; 2718 *flagsp &= ~MSG_TRUNC; 2719 flags = *flagsp &~ MSG_EOR; 2720 } else 2721 flags = 0; 2722 2723 restart: 2724 SOCKBUF_LOCK(&so->so_rcv); 2725 m = so->so_rcv.sb_mb; 2726 /* 2727 * If we have less data than requested, block awaiting more (subject 2728 * to any timeout) if: 2729 * 1. the current count is less than the low water mark, or 2730 * 2. 
MSG_DONTWAIT is not set 2731 */ 2732 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 2733 sbavail(&so->so_rcv) < uio->uio_resid) && 2734 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 2735 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 2736 KASSERT(m != NULL || !sbavail(&so->so_rcv), 2737 ("receive: m == %p sbavail == %u", 2738 m, sbavail(&so->so_rcv))); 2739 if (so->so_error || so->so_rerror) { 2740 if (m != NULL) 2741 goto dontblock; 2742 if (so->so_error) 2743 error = so->so_error; 2744 else 2745 error = so->so_rerror; 2746 if ((flags & MSG_PEEK) == 0) { 2747 if (so->so_error) 2748 so->so_error = 0; 2749 else 2750 so->so_rerror = 0; 2751 } 2752 SOCKBUF_UNLOCK(&so->so_rcv); 2753 goto release; 2754 } 2755 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2756 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2757 if (m != NULL) 2758 goto dontblock; 2759 #ifdef KERN_TLS 2760 else if (so->so_rcv.sb_tlsdcc == 0 && 2761 so->so_rcv.sb_tlscc == 0) { 2762 #else 2763 else { 2764 #endif 2765 SOCKBUF_UNLOCK(&so->so_rcv); 2766 goto release; 2767 } 2768 } 2769 for (; m != NULL; m = m->m_next) 2770 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2771 m = so->so_rcv.sb_mb; 2772 goto dontblock; 2773 } 2774 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2775 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2776 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2777 SOCKBUF_UNLOCK(&so->so_rcv); 2778 error = ENOTCONN; 2779 goto release; 2780 } 2781 if (uio->uio_resid == 0 && !report_real_len) { 2782 SOCKBUF_UNLOCK(&so->so_rcv); 2783 goto release; 2784 } 2785 if ((so->so_state & SS_NBIO) || 2786 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2787 SOCKBUF_UNLOCK(&so->so_rcv); 2788 error = EWOULDBLOCK; 2789 goto release; 2790 } 2791 SBLASTRECORDCHK(&so->so_rcv); 2792 SBLASTMBUFCHK(&so->so_rcv); 2793 error = sbwait(so, SO_RCV); 2794 SOCKBUF_UNLOCK(&so->so_rcv); 2795 if (error) 2796 goto release; 2797 goto restart; 2798 } 2799 dontblock: 2800 /* 2801 * From this point onward, we maintain 'nextrecord' as a cache of the 2802 * pointer to the next record in the socket buffer. We must keep the 2803 * various socket buffer pointers and local stack versions of the 2804 * pointers in sync, pushing out modifications before dropping the 2805 * socket buffer mutex, and re-reading them when picking it up. 2806 * 2807 * Otherwise, we will race with the network stack appending new data 2808 * or records onto the socket buffer by using inconsistent/stale 2809 * versions of the field, possibly resulting in socket buffer 2810 * corruption. 2811 * 2812 * By holding the high-level sblock(), we prevent simultaneous 2813 * readers from pulling off the front of the socket buffer. 2814 */ 2815 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2816 if (uio->uio_td) 2817 uio->uio_td->td_ru.ru_msgrcv++; 2818 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2819 SBLASTRECORDCHK(&so->so_rcv); 2820 SBLASTMBUFCHK(&so->so_rcv); 2821 nextrecord = m->m_nextpkt; 2822 if (pr->pr_flags & PR_ADDR) { 2823 KASSERT(m->m_type == MT_SONAME, 2824 ("m->m_type == %d", m->m_type)); 2825 orig_resid = 0; 2826 if (psa != NULL) 2827 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2828 M_NOWAIT); 2829 if (flags & MSG_PEEK) { 2830 m = m->m_next; 2831 } else { 2832 sbfree(&so->so_rcv, m); 2833 so->so_rcv.sb_mb = m_free(m); 2834 m = so->so_rcv.sb_mb; 2835 sockbuf_pushsync(&so->so_rcv, nextrecord); 2836 } 2837 } 2838 2839 /* 2840 * Process one or more MT_CONTROL mbufs present before any data mbufs 2841 * in the first mbuf chain on the socket buffer. 
If MSG_PEEK, we 2842 * just copy the data; if !MSG_PEEK, we call into the protocol to 2843 * perform externalization (or freeing if controlp == NULL). 2844 */ 2845 if (m != NULL && m->m_type == MT_CONTROL) { 2846 struct mbuf *cm = NULL, *cmn; 2847 struct mbuf **cme = &cm; 2848 #ifdef KERN_TLS 2849 struct cmsghdr *cmsg; 2850 struct tls_get_record tgr; 2851 2852 /* 2853 * For MSG_TLSAPPDATA, check for an alert record. 2854 * If found, return ENXIO without removing 2855 * it from the receive queue. This allows a subsequent 2856 * call without MSG_TLSAPPDATA to receive it. 2857 * Note that, for TLS, there should only be a single 2858 * control mbuf with the TLS_GET_RECORD message in it. 2859 */ 2860 if (flags & MSG_TLSAPPDATA) { 2861 cmsg = mtod(m, struct cmsghdr *); 2862 if (cmsg->cmsg_type == TLS_GET_RECORD && 2863 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2864 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2865 if (__predict_false(tgr.tls_type == 2866 TLS_RLTYPE_ALERT)) { 2867 SOCKBUF_UNLOCK(&so->so_rcv); 2868 error = ENXIO; 2869 goto release; 2870 } 2871 } 2872 } 2873 #endif 2874 2875 do { 2876 if (flags & MSG_PEEK) { 2877 if (controlp != NULL) { 2878 *controlp = m_copym(m, 0, m->m_len, 2879 M_NOWAIT); 2880 controlp = &(*controlp)->m_next; 2881 } 2882 m = m->m_next; 2883 } else { 2884 sbfree(&so->so_rcv, m); 2885 so->so_rcv.sb_mb = m->m_next; 2886 m->m_next = NULL; 2887 *cme = m; 2888 cme = &(*cme)->m_next; 2889 m = so->so_rcv.sb_mb; 2890 } 2891 } while (m != NULL && m->m_type == MT_CONTROL); 2892 if ((flags & MSG_PEEK) == 0) 2893 sockbuf_pushsync(&so->so_rcv, nextrecord); 2894 while (cm != NULL) { 2895 cmn = cm->m_next; 2896 cm->m_next = NULL; 2897 if (pr->pr_domain->dom_externalize != NULL) { 2898 SOCKBUF_UNLOCK(&so->so_rcv); 2899 VNET_SO_ASSERT(so); 2900 error = (*pr->pr_domain->dom_externalize) 2901 (cm, controlp, flags); 2902 SOCKBUF_LOCK(&so->so_rcv); 2903 } else if (controlp != NULL) 2904 *controlp = cm; 2905 else 2906 m_freem(cm); 2907 if (controlp != NULL) { 2908 while (*controlp != NULL) 2909 controlp = &(*controlp)->m_next; 2910 } 2911 cm = cmn; 2912 } 2913 if (m != NULL) 2914 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 2915 else 2916 nextrecord = so->so_rcv.sb_mb; 2917 orig_resid = 0; 2918 } 2919 if (m != NULL) { 2920 if ((flags & MSG_PEEK) == 0) { 2921 KASSERT(m->m_nextpkt == nextrecord, 2922 ("soreceive: post-control, nextrecord !sync")); 2923 if (nextrecord == NULL) { 2924 KASSERT(so->so_rcv.sb_mb == m, 2925 ("soreceive: post-control, sb_mb!=m")); 2926 KASSERT(so->so_rcv.sb_lastrecord == m, 2927 ("soreceive: post-control, lastrecord!=m")); 2928 } 2929 } 2930 type = m->m_type; 2931 if (type == MT_OOBDATA) 2932 flags |= MSG_OOB; 2933 } else { 2934 if ((flags & MSG_PEEK) == 0) { 2935 KASSERT(so->so_rcv.sb_mb == nextrecord, 2936 ("soreceive: sb_mb != nextrecord")); 2937 if (so->so_rcv.sb_mb == NULL) { 2938 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2939 ("soreceive: sb_lastercord != NULL")); 2940 } 2941 } 2942 } 2943 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2944 SBLASTRECORDCHK(&so->so_rcv); 2945 SBLASTMBUFCHK(&so->so_rcv); 2946 2947 /* 2948 * Now continue to read any data mbufs off of the head of the socket 2949 * buffer until the read request is satisfied. Note that 'type' is 2950 * used to store the type of any mbuf reads that have happened so far 2951 * such that soreceive() can stop reading if the type changes, which 2952 * causes soreceive() to return only one of regular data and inline 2953 * out-of-band data in a single socket receive operation. 
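 * For instance, once MT_DATA mbufs have been consumed, encountering an
 * MT_OOBDATA mbuf ends the loop (and vice versa), leaving the
 * remaining mbufs for a subsequent receive call.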
2954 */ 2955 moff = 0; 2956 offset = 0; 2957 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 2958 && error == 0) { 2959 /* 2960 * If the type of mbuf has changed since the last mbuf 2961 * examined ('type'), end the receive operation. 2962 */ 2963 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2964 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 2965 if (type != m->m_type) 2966 break; 2967 } else if (type == MT_OOBDATA) 2968 break; 2969 else 2970 KASSERT(m->m_type == MT_DATA, 2971 ("m->m_type == %d", m->m_type)); 2972 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 2973 len = uio->uio_resid; 2974 if (so->so_oobmark && len > so->so_oobmark - offset) 2975 len = so->so_oobmark - offset; 2976 if (len > m->m_len - moff) 2977 len = m->m_len - moff; 2978 /* 2979 * If mp is set, just pass back the mbufs. Otherwise copy 2980 * them out via the uio, then free. Sockbuf must be 2981 * consistent here (points to current mbuf, it points to next 2982 * record) when we drop priority; we must note any additions 2983 * to the sockbuf when we block interrupts again. 2984 */ 2985 if (mp == NULL) { 2986 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2987 SBLASTRECORDCHK(&so->so_rcv); 2988 SBLASTMBUFCHK(&so->so_rcv); 2989 SOCKBUF_UNLOCK(&so->so_rcv); 2990 if ((m->m_flags & M_EXTPG) != 0) 2991 error = m_unmapped_uiomove(m, moff, uio, 2992 (int)len); 2993 else 2994 error = uiomove(mtod(m, char *) + moff, 2995 (int)len, uio); 2996 SOCKBUF_LOCK(&so->so_rcv); 2997 if (error) { 2998 /* 2999 * The MT_SONAME mbuf has already been removed 3000 * from the record, so it is necessary to 3001 * remove the data mbufs, if any, to preserve 3002 * the invariant in the case of PR_ADDR that 3003 * requires MT_SONAME mbufs at the head of 3004 * each record. 3005 */ 3006 if (pr->pr_flags & PR_ATOMIC && 3007 ((flags & MSG_PEEK) == 0)) 3008 (void)sbdroprecord_locked(&so->so_rcv); 3009 SOCKBUF_UNLOCK(&so->so_rcv); 3010 goto release; 3011 } 3012 } else 3013 uio->uio_resid -= len; 3014 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3015 if (len == m->m_len - moff) { 3016 if (m->m_flags & M_EOR) 3017 flags |= MSG_EOR; 3018 if (flags & MSG_PEEK) { 3019 m = m->m_next; 3020 moff = 0; 3021 } else { 3022 nextrecord = m->m_nextpkt; 3023 sbfree(&so->so_rcv, m); 3024 if (mp != NULL) { 3025 m->m_nextpkt = NULL; 3026 *mp = m; 3027 mp = &m->m_next; 3028 so->so_rcv.sb_mb = m = m->m_next; 3029 *mp = NULL; 3030 } else { 3031 so->so_rcv.sb_mb = m_free(m); 3032 m = so->so_rcv.sb_mb; 3033 } 3034 sockbuf_pushsync(&so->so_rcv, nextrecord); 3035 SBLASTRECORDCHK(&so->so_rcv); 3036 SBLASTMBUFCHK(&so->so_rcv); 3037 } 3038 } else { 3039 if (flags & MSG_PEEK) 3040 moff += len; 3041 else { 3042 if (mp != NULL) { 3043 if (flags & MSG_DONTWAIT) { 3044 *mp = m_copym(m, 0, len, 3045 M_NOWAIT); 3046 if (*mp == NULL) { 3047 /* 3048 * m_copym() couldn't 3049 * allocate an mbuf. 3050 * Adjust uio_resid back 3051 * (it was adjusted 3052 * down by len bytes, 3053 * which we didn't end 3054 * up "copying" over). 
3055 */ 3056 uio->uio_resid += len; 3057 break; 3058 } 3059 } else { 3060 SOCKBUF_UNLOCK(&so->so_rcv); 3061 *mp = m_copym(m, 0, len, 3062 M_WAITOK); 3063 SOCKBUF_LOCK(&so->so_rcv); 3064 } 3065 } 3066 sbcut_locked(&so->so_rcv, len); 3067 } 3068 } 3069 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3070 if (so->so_oobmark) { 3071 if ((flags & MSG_PEEK) == 0) { 3072 so->so_oobmark -= len; 3073 if (so->so_oobmark == 0) { 3074 so->so_rcv.sb_state |= SBS_RCVATMARK; 3075 break; 3076 } 3077 } else { 3078 offset += len; 3079 if (offset == so->so_oobmark) 3080 break; 3081 } 3082 } 3083 if (flags & MSG_EOR) 3084 break; 3085 /* 3086 * If the MSG_WAITALL flag is set (for non-atomic socket), we 3087 * must not quit until "uio->uio_resid == 0" or an error 3088 * termination. If a signal/timeout occurs, return with a 3089 * short count but without error. Keep sockbuf locked 3090 * against other readers. 3091 */ 3092 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 3093 !sosendallatonce(so) && nextrecord == NULL) { 3094 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3095 if (so->so_error || so->so_rerror || 3096 so->so_rcv.sb_state & SBS_CANTRCVMORE) 3097 break; 3098 /* 3099 * Notify the protocol that some data has been 3100 * drained before blocking. 3101 */ 3102 if (pr->pr_flags & PR_WANTRCVD) { 3103 SOCKBUF_UNLOCK(&so->so_rcv); 3104 VNET_SO_ASSERT(so); 3105 pr->pr_rcvd(so, flags); 3106 SOCKBUF_LOCK(&so->so_rcv); 3107 if (__predict_false(so->so_rcv.sb_mb == NULL && 3108 (so->so_error || so->so_rerror || 3109 so->so_rcv.sb_state & SBS_CANTRCVMORE))) 3110 break; 3111 } 3112 SBLASTRECORDCHK(&so->so_rcv); 3113 SBLASTMBUFCHK(&so->so_rcv); 3114 /* 3115 * We could receive some data while was notifying 3116 * the protocol. Skip blocking in this case. 3117 */ 3118 if (so->so_rcv.sb_mb == NULL) { 3119 error = sbwait(so, SO_RCV); 3120 if (error) { 3121 SOCKBUF_UNLOCK(&so->so_rcv); 3122 goto release; 3123 } 3124 } 3125 m = so->so_rcv.sb_mb; 3126 if (m != NULL) 3127 nextrecord = m->m_nextpkt; 3128 } 3129 } 3130 3131 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3132 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 3133 if (report_real_len) 3134 uio->uio_resid -= m_length(m, NULL) - moff; 3135 flags |= MSG_TRUNC; 3136 if ((flags & MSG_PEEK) == 0) 3137 (void) sbdroprecord_locked(&so->so_rcv); 3138 } 3139 if ((flags & MSG_PEEK) == 0) { 3140 if (m == NULL) { 3141 /* 3142 * First part is an inline SB_EMPTY_FIXUP(). Second 3143 * part makes sure sb_lastrecord is up-to-date if 3144 * there is still data in the socket buffer. 3145 */ 3146 so->so_rcv.sb_mb = nextrecord; 3147 if (so->so_rcv.sb_mb == NULL) { 3148 so->so_rcv.sb_mbtail = NULL; 3149 so->so_rcv.sb_lastrecord = NULL; 3150 } else if (nextrecord->m_nextpkt == NULL) 3151 so->so_rcv.sb_lastrecord = nextrecord; 3152 } 3153 SBLASTRECORDCHK(&so->so_rcv); 3154 SBLASTMBUFCHK(&so->so_rcv); 3155 /* 3156 * If soreceive() is being done from the socket callback, 3157 * then don't need to generate ACK to peer to update window, 3158 * since ACK will be generated on return to TCP. 
3159 */ 3160 if (!(flags & MSG_SOCALLBCK) && 3161 (pr->pr_flags & PR_WANTRCVD)) { 3162 SOCKBUF_UNLOCK(&so->so_rcv); 3163 VNET_SO_ASSERT(so); 3164 pr->pr_rcvd(so, flags); 3165 SOCKBUF_LOCK(&so->so_rcv); 3166 } 3167 } 3168 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3169 if (orig_resid == uio->uio_resid && orig_resid && 3170 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 3171 SOCKBUF_UNLOCK(&so->so_rcv); 3172 goto restart; 3173 } 3174 SOCKBUF_UNLOCK(&so->so_rcv); 3175 3176 if (flagsp != NULL) 3177 *flagsp |= flags; 3178 release: 3179 return (error); 3180 } 3181 3182 int 3183 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 3184 struct mbuf **mp, struct mbuf **controlp, int *flagsp) 3185 { 3186 int error, flags; 3187 3188 if (psa != NULL) 3189 *psa = NULL; 3190 if (controlp != NULL) 3191 *controlp = NULL; 3192 if (flagsp != NULL) { 3193 flags = *flagsp; 3194 if ((flags & MSG_OOB) != 0) 3195 return (soreceive_rcvoob(so, uio, flags)); 3196 } else { 3197 flags = 0; 3198 } 3199 if (mp != NULL) 3200 *mp = NULL; 3201 3202 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 3203 if (error) 3204 return (error); 3205 error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp); 3206 SOCK_IO_RECV_UNLOCK(so); 3207 return (error); 3208 } 3209 3210 /* 3211 * Optimized version of soreceive() for stream (TCP) sockets. 3212 */ 3213 static int 3214 soreceive_stream_locked(struct socket *so, struct sockbuf *sb, 3215 struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, 3216 struct mbuf **controlp, int flags) 3217 { 3218 int len = 0, error = 0, oresid; 3219 struct mbuf *m, *n = NULL; 3220 3221 SOCK_IO_RECV_ASSERT_LOCKED(so); 3222 3223 /* Easy one, no space to copyout anything. */ 3224 if (uio->uio_resid == 0) 3225 return (EINVAL); 3226 oresid = uio->uio_resid; 3227 3228 SOCKBUF_LOCK(sb); 3229 /* We will never ever get anything unless we are or were connected. */ 3230 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 3231 error = ENOTCONN; 3232 goto out; 3233 } 3234 3235 restart: 3236 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3237 3238 /* Abort if socket has reported problems. */ 3239 if (so->so_error) { 3240 if (sbavail(sb) > 0) 3241 goto deliver; 3242 if (oresid > uio->uio_resid) 3243 goto out; 3244 error = so->so_error; 3245 if (!(flags & MSG_PEEK)) 3246 so->so_error = 0; 3247 goto out; 3248 } 3249 3250 /* Door is closed. Deliver what is left, if any. */ 3251 if (sb->sb_state & SBS_CANTRCVMORE) { 3252 if (sbavail(sb) > 0) 3253 goto deliver; 3254 else 3255 goto out; 3256 } 3257 3258 /* Socket buffer is empty and we shall not block. */ 3259 if (sbavail(sb) == 0 && 3260 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 3261 error = EAGAIN; 3262 goto out; 3263 } 3264 3265 /* Socket buffer got some data that we shall deliver now. */ 3266 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 3267 ((so->so_state & SS_NBIO) || 3268 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 3269 sbavail(sb) >= sb->sb_lowat || 3270 sbavail(sb) >= uio->uio_resid || 3271 sbavail(sb) >= sb->sb_hiwat) ) { 3272 goto deliver; 3273 } 3274 3275 /* On MSG_WAITALL we must wait until all data or error arrives. */ 3276 if ((flags & MSG_WAITALL) && 3277 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 3278 goto deliver; 3279 3280 /* 3281 * Wait and block until (more) data comes in. 3282 * NB: Drops the sockbuf lock during wait. 
3283 */ 3284 error = sbwait(so, SO_RCV); 3285 if (error) 3286 goto out; 3287 goto restart; 3288 3289 deliver: 3290 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3291 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 3292 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 3293 3294 /* Statistics. */ 3295 if (uio->uio_td) 3296 uio->uio_td->td_ru.ru_msgrcv++; 3297 3298 /* Fill uio until full or current end of socket buffer is reached. */ 3299 len = min(uio->uio_resid, sbavail(sb)); 3300 if (mp0 != NULL) { 3301 /* Dequeue as many mbufs as possible. */ 3302 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 3303 if (*mp0 == NULL) 3304 *mp0 = sb->sb_mb; 3305 else 3306 m_cat(*mp0, sb->sb_mb); 3307 for (m = sb->sb_mb; 3308 m != NULL && m->m_len <= len; 3309 m = m->m_next) { 3310 KASSERT(!(m->m_flags & M_NOTAVAIL), 3311 ("%s: m %p not available", __func__, m)); 3312 len -= m->m_len; 3313 uio->uio_resid -= m->m_len; 3314 sbfree(sb, m); 3315 n = m; 3316 } 3317 n->m_next = NULL; 3318 sb->sb_mb = m; 3319 sb->sb_lastrecord = sb->sb_mb; 3320 if (sb->sb_mb == NULL) 3321 SB_EMPTY_FIXUP(sb); 3322 } 3323 /* Copy the remainder. */ 3324 if (len > 0) { 3325 KASSERT(sb->sb_mb != NULL, 3326 ("%s: len > 0 && sb->sb_mb empty", __func__)); 3327 3328 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 3329 if (m == NULL) 3330 len = 0; /* Don't flush data from sockbuf. */ 3331 else 3332 uio->uio_resid -= len; 3333 if (*mp0 != NULL) 3334 m_cat(*mp0, m); 3335 else 3336 *mp0 = m; 3337 if (*mp0 == NULL) { 3338 error = ENOBUFS; 3339 goto out; 3340 } 3341 } 3342 } else { 3343 /* NB: Must unlock socket buffer as uiomove may sleep. */ 3344 SOCKBUF_UNLOCK(sb); 3345 error = m_mbuftouio(uio, sb->sb_mb, len); 3346 SOCKBUF_LOCK(sb); 3347 if (error) 3348 goto out; 3349 } 3350 SBLASTRECORDCHK(sb); 3351 SBLASTMBUFCHK(sb); 3352 3353 /* 3354 * Remove the delivered data from the socket buffer unless we 3355 * were only peeking. 3356 */ 3357 if (!(flags & MSG_PEEK)) { 3358 if (len > 0) 3359 sbdrop_locked(sb, len); 3360 3361 /* Notify protocol that we drained some data. */ 3362 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 3363 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 3364 !(flags & MSG_SOCALLBCK))) { 3365 SOCKBUF_UNLOCK(sb); 3366 VNET_SO_ASSERT(so); 3367 so->so_proto->pr_rcvd(so, flags); 3368 SOCKBUF_LOCK(sb); 3369 } 3370 } 3371 3372 /* 3373 * For MSG_WAITALL we may have to loop again and wait for 3374 * more data to come in. 3375 */ 3376 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 3377 goto restart; 3378 out: 3379 SBLASTRECORDCHK(sb); 3380 SBLASTMBUFCHK(sb); 3381 SOCKBUF_UNLOCK(sb); 3382 return (error); 3383 } 3384 3385 int 3386 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 3387 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3388 { 3389 struct sockbuf *sb; 3390 int error, flags; 3391 3392 sb = &so->so_rcv; 3393 3394 /* We only do stream sockets. */ 3395 if (so->so_type != SOCK_STREAM) 3396 return (EINVAL); 3397 if (psa != NULL) 3398 *psa = NULL; 3399 if (flagsp != NULL) 3400 flags = *flagsp & ~MSG_EOR; 3401 else 3402 flags = 0; 3403 if (controlp != NULL) 3404 *controlp = NULL; 3405 if (flags & MSG_OOB) 3406 return (soreceive_rcvoob(so, uio, flags)); 3407 if (mp0 != NULL) 3408 *mp0 = NULL; 3409 3410 #ifdef KERN_TLS 3411 /* 3412 * KTLS store TLS records as records with a control message to 3413 * describe the framing. 3414 * 3415 * We check once here before acquiring locks to optimize the 3416 * common case. 
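 *
 * The check is repeated below once SOCK_IO_RECV_LOCK() is held, since
 * TLS may have been enabled on the socket in the meantime; in that
 * case the lock is dropped and we likewise fall back to
 * soreceive_generic().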
3417 */ 3418 if (sb->sb_tls_info != NULL) 3419 return (soreceive_generic(so, psa, uio, mp0, controlp, 3420 flagsp)); 3421 #endif 3422 3423 /* 3424 * Prevent other threads from reading from the socket. This lock may be 3425 * dropped in order to sleep waiting for data to arrive. 3426 */ 3427 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 3428 if (error) 3429 return (error); 3430 #ifdef KERN_TLS 3431 if (__predict_false(sb->sb_tls_info != NULL)) { 3432 SOCK_IO_RECV_UNLOCK(so); 3433 return (soreceive_generic(so, psa, uio, mp0, controlp, 3434 flagsp)); 3435 } 3436 #endif 3437 error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags); 3438 SOCK_IO_RECV_UNLOCK(so); 3439 return (error); 3440 } 3441 3442 /* 3443 * Optimized version of soreceive() for simple datagram cases from userspace. 3444 * Unlike in the stream case, we're able to drop a datagram if copyout() 3445 * fails, and because we handle datagrams atomically, we don't need to use a 3446 * sleep lock to prevent I/O interlacing. 3447 */ 3448 int 3449 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 3450 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3451 { 3452 struct mbuf *m, *m2; 3453 int flags, error; 3454 ssize_t len; 3455 struct protosw *pr = so->so_proto; 3456 struct mbuf *nextrecord; 3457 3458 if (psa != NULL) 3459 *psa = NULL; 3460 if (controlp != NULL) 3461 *controlp = NULL; 3462 if (flagsp != NULL) 3463 flags = *flagsp &~ MSG_EOR; 3464 else 3465 flags = 0; 3466 3467 /* 3468 * For any complicated cases, fall back to the full 3469 * soreceive_generic(). 3470 */ 3471 if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) 3472 return (soreceive_generic(so, psa, uio, mp0, controlp, 3473 flagsp)); 3474 3475 /* 3476 * Enforce restrictions on use. 3477 */ 3478 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 3479 ("soreceive_dgram: wantrcvd")); 3480 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 3481 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 3482 ("soreceive_dgram: SBS_RCVATMARK")); 3483 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 3484 ("soreceive_dgram: P_CONNREQUIRED")); 3485 3486 /* 3487 * Loop blocking while waiting for a datagram. 
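 *
 * The loop sleeps in sbwait() until a record appears, returning early
 * with any pending socket error, with 0 once no more data can arrive
 * (or nothing was asked for), or with EWOULDBLOCK for a non-blocking
 * request.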
3488 */ 3489 SOCKBUF_LOCK(&so->so_rcv); 3490 while ((m = so->so_rcv.sb_mb) == NULL) { 3491 KASSERT(sbavail(&so->so_rcv) == 0, 3492 ("soreceive_dgram: sb_mb NULL but sbavail %u", 3493 sbavail(&so->so_rcv))); 3494 if (so->so_error) { 3495 error = so->so_error; 3496 so->so_error = 0; 3497 SOCKBUF_UNLOCK(&so->so_rcv); 3498 return (error); 3499 } 3500 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 3501 uio->uio_resid == 0) { 3502 SOCKBUF_UNLOCK(&so->so_rcv); 3503 return (0); 3504 } 3505 if ((so->so_state & SS_NBIO) || 3506 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 3507 SOCKBUF_UNLOCK(&so->so_rcv); 3508 return (EWOULDBLOCK); 3509 } 3510 SBLASTRECORDCHK(&so->so_rcv); 3511 SBLASTMBUFCHK(&so->so_rcv); 3512 error = sbwait(so, SO_RCV); 3513 if (error) { 3514 SOCKBUF_UNLOCK(&so->so_rcv); 3515 return (error); 3516 } 3517 } 3518 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3519 3520 if (uio->uio_td) 3521 uio->uio_td->td_ru.ru_msgrcv++; 3522 SBLASTRECORDCHK(&so->so_rcv); 3523 SBLASTMBUFCHK(&so->so_rcv); 3524 nextrecord = m->m_nextpkt; 3525 if (nextrecord == NULL) { 3526 KASSERT(so->so_rcv.sb_lastrecord == m, 3527 ("soreceive_dgram: lastrecord != m")); 3528 } 3529 3530 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 3531 ("soreceive_dgram: m_nextpkt != nextrecord")); 3532 3533 /* 3534 * Pull 'm' and its chain off the front of the packet queue. 3535 */ 3536 so->so_rcv.sb_mb = NULL; 3537 sockbuf_pushsync(&so->so_rcv, nextrecord); 3538 3539 /* 3540 * Walk 'm's chain and free that many bytes from the socket buffer. 3541 */ 3542 for (m2 = m; m2 != NULL; m2 = m2->m_next) 3543 sbfree(&so->so_rcv, m2); 3544 3545 /* 3546 * Do a few last checks before we let go of the lock. 3547 */ 3548 SBLASTRECORDCHK(&so->so_rcv); 3549 SBLASTMBUFCHK(&so->so_rcv); 3550 SOCKBUF_UNLOCK(&so->so_rcv); 3551 3552 if (pr->pr_flags & PR_ADDR) { 3553 KASSERT(m->m_type == MT_SONAME, 3554 ("m->m_type == %d", m->m_type)); 3555 if (psa != NULL) 3556 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 3557 M_WAITOK); 3558 m = m_free(m); 3559 } 3560 KASSERT(m, ("%s: no data or control after soname", __func__)); 3561 3562 /* 3563 * Packet to copyout() is now in 'm' and it is disconnected from the 3564 * queue. 3565 * 3566 * Process one or more MT_CONTROL mbufs present before any data mbufs 3567 * in the first mbuf chain on the socket buffer. We call into the 3568 * protocol to perform externalization (or freeing if controlp == 3569 * NULL). In some cases there can be only MT_CONTROL mbufs without 3570 * MT_DATA mbufs. 
3571 */ 3572 if (m->m_type == MT_CONTROL) { 3573 struct mbuf *cm = NULL, *cmn; 3574 struct mbuf **cme = &cm; 3575 3576 do { 3577 m2 = m->m_next; 3578 m->m_next = NULL; 3579 *cme = m; 3580 cme = &(*cme)->m_next; 3581 m = m2; 3582 } while (m != NULL && m->m_type == MT_CONTROL); 3583 while (cm != NULL) { 3584 cmn = cm->m_next; 3585 cm->m_next = NULL; 3586 if (pr->pr_domain->dom_externalize != NULL) { 3587 error = (*pr->pr_domain->dom_externalize) 3588 (cm, controlp, flags); 3589 } else if (controlp != NULL) 3590 *controlp = cm; 3591 else 3592 m_freem(cm); 3593 if (controlp != NULL) { 3594 while (*controlp != NULL) 3595 controlp = &(*controlp)->m_next; 3596 } 3597 cm = cmn; 3598 } 3599 } 3600 KASSERT(m == NULL || m->m_type == MT_DATA, 3601 ("soreceive_dgram: !data")); 3602 while (m != NULL && uio->uio_resid > 0) { 3603 len = uio->uio_resid; 3604 if (len > m->m_len) 3605 len = m->m_len; 3606 error = uiomove(mtod(m, char *), (int)len, uio); 3607 if (error) { 3608 m_freem(m); 3609 return (error); 3610 } 3611 if (len == m->m_len) 3612 m = m_free(m); 3613 else { 3614 m->m_data += len; 3615 m->m_len -= len; 3616 } 3617 } 3618 if (m != NULL) { 3619 flags |= MSG_TRUNC; 3620 m_freem(m); 3621 } 3622 if (flagsp != NULL) 3623 *flagsp |= flags; 3624 return (0); 3625 } 3626 3627 int 3628 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 3629 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3630 { 3631 int error; 3632 3633 CURVNET_SET(so->so_vnet); 3634 error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); 3635 CURVNET_RESTORE(); 3636 return (error); 3637 } 3638 3639 int 3640 soshutdown(struct socket *so, enum shutdown_how how) 3641 { 3642 int error; 3643 3644 CURVNET_SET(so->so_vnet); 3645 error = so->so_proto->pr_shutdown(so, how); 3646 CURVNET_RESTORE(); 3647 3648 return (error); 3649 } 3650 3651 /* 3652 * Used by several pr_shutdown implementations that use generic socket buffers. 3653 */ 3654 void 3655 sorflush(struct socket *so) 3656 { 3657 int error; 3658 3659 VNET_SO_ASSERT(so); 3660 3661 /* 3662 * Dislodge threads currently blocked in receive and wait to acquire 3663 * a lock against other simultaneous readers before clearing the 3664 * socket buffer. Don't let our acquire be interrupted by a signal 3665 * despite any existing socket disposition on interruptable waiting. 3666 * 3667 * The SOCK_IO_RECV_LOCK() is important here as there some pr_soreceive 3668 * methods that read the top of the socket buffer without acquisition 3669 * of the socket buffer mutex, assuming that top of the buffer 3670 * exclusively belongs to the read(2) syscall. This is handy when 3671 * performing MSG_PEEK. 3672 */ 3673 socantrcvmore(so); 3674 3675 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); 3676 if (error != 0) { 3677 KASSERT(SOLISTENING(so), 3678 ("%s: soiolock(%p) failed", __func__, so)); 3679 return; 3680 } 3681 3682 sbrelease(so, SO_RCV); 3683 SOCK_IO_RECV_UNLOCK(so); 3684 3685 } 3686 3687 #ifdef SOCKET_HHOOK 3688 /* 3689 * Wrapper for Socket established helper hook. 3690 * Parameters: socket, context of the hook point, hook id. 
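 *
 * For example, the default case of sosetopt() below calls
 * hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT) so that a helper hook
 * may consume socket options the kernel itself does not recognize.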
3691 */ 3692 static inline int 3693 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 3694 { 3695 struct socket_hhook_data hhook_data = { 3696 .so = so, 3697 .hctx = hctx, 3698 .m = NULL, 3699 .status = 0 3700 }; 3701 3702 CURVNET_SET(so->so_vnet); 3703 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 3704 CURVNET_RESTORE(); 3705 3706 /* Ugly but needed, since hhooks return void for now */ 3707 return (hhook_data.status); 3708 } 3709 #endif 3710 3711 /* 3712 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 3713 * additional variant to handle the case where the option value needs to be 3714 * some kind of integer, but not a specific size. In addition to their use 3715 * here, these functions are also called by the protocol-level pr_ctloutput() 3716 * routines. 3717 */ 3718 int 3719 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 3720 { 3721 size_t valsize; 3722 3723 /* 3724 * If the user gives us more than we wanted, we ignore it, but if we 3725 * don't get the minimum length the caller wants, we return EINVAL. 3726 * On success, sopt->sopt_valsize is set to however much we actually 3727 * retrieved. 3728 */ 3729 if ((valsize = sopt->sopt_valsize) < minlen) 3730 return EINVAL; 3731 if (valsize > len) 3732 sopt->sopt_valsize = valsize = len; 3733 3734 if (sopt->sopt_td != NULL) 3735 return (copyin(sopt->sopt_val, buf, valsize)); 3736 3737 bcopy(sopt->sopt_val, buf, valsize); 3738 return (0); 3739 } 3740 3741 /* 3742 * Kernel version of setsockopt(2). 3743 * 3744 * XXX: optlen is size_t, not socklen_t 3745 */ 3746 int 3747 so_setsockopt(struct socket *so, int level, int optname, void *optval, 3748 size_t optlen) 3749 { 3750 struct sockopt sopt; 3751 3752 sopt.sopt_level = level; 3753 sopt.sopt_name = optname; 3754 sopt.sopt_dir = SOPT_SET; 3755 sopt.sopt_val = optval; 3756 sopt.sopt_valsize = optlen; 3757 sopt.sopt_td = NULL; 3758 return (sosetopt(so, &sopt)); 3759 } 3760 3761 int 3762 sosetopt(struct socket *so, struct sockopt *sopt) 3763 { 3764 int error, optval; 3765 struct linger l; 3766 struct timeval tv; 3767 sbintime_t val, *valp; 3768 uint32_t val32; 3769 #ifdef MAC 3770 struct mac extmac; 3771 #endif 3772 3773 CURVNET_SET(so->so_vnet); 3774 error = 0; 3775 if (sopt->sopt_level != SOL_SOCKET) { 3776 if (so->so_proto->pr_ctloutput != NULL) 3777 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3778 else 3779 error = ENOPROTOOPT; 3780 } else { 3781 switch (sopt->sopt_name) { 3782 case SO_ACCEPTFILTER: 3783 error = accept_filt_setopt(so, sopt); 3784 if (error) 3785 goto bad; 3786 break; 3787 3788 case SO_LINGER: 3789 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 3790 if (error) 3791 goto bad; 3792 if (l.l_linger < 0 || 3793 l.l_linger > USHRT_MAX || 3794 l.l_linger > (INT_MAX / hz)) { 3795 error = EDOM; 3796 goto bad; 3797 } 3798 SOCK_LOCK(so); 3799 so->so_linger = l.l_linger; 3800 if (l.l_onoff) 3801 so->so_options |= SO_LINGER; 3802 else 3803 so->so_options &= ~SO_LINGER; 3804 SOCK_UNLOCK(so); 3805 break; 3806 3807 case SO_DEBUG: 3808 case SO_KEEPALIVE: 3809 case SO_DONTROUTE: 3810 case SO_USELOOPBACK: 3811 case SO_BROADCAST: 3812 case SO_REUSEADDR: 3813 case SO_REUSEPORT: 3814 case SO_REUSEPORT_LB: 3815 case SO_OOBINLINE: 3816 case SO_TIMESTAMP: 3817 case SO_BINTIME: 3818 case SO_NOSIGPIPE: 3819 case SO_NO_DDP: 3820 case SO_NO_OFFLOAD: 3821 case SO_RERROR: 3822 error = sooptcopyin(sopt, &optval, sizeof optval, 3823 sizeof optval); 3824 if (error) 3825 goto bad; 3826 SOCK_LOCK(so); 3827 if (optval) 3828 
so->so_options |= sopt->sopt_name; 3829 else 3830 so->so_options &= ~sopt->sopt_name; 3831 SOCK_UNLOCK(so); 3832 break; 3833 3834 case SO_SETFIB: 3835 error = sooptcopyin(sopt, &optval, sizeof optval, 3836 sizeof optval); 3837 if (error) 3838 goto bad; 3839 3840 if (optval < 0 || optval >= rt_numfibs) { 3841 error = EINVAL; 3842 goto bad; 3843 } 3844 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 3845 (so->so_proto->pr_domain->dom_family == PF_INET6) || 3846 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 3847 so->so_fibnum = optval; 3848 else 3849 so->so_fibnum = 0; 3850 break; 3851 3852 case SO_USER_COOKIE: 3853 error = sooptcopyin(sopt, &val32, sizeof val32, 3854 sizeof val32); 3855 if (error) 3856 goto bad; 3857 so->so_user_cookie = val32; 3858 break; 3859 3860 case SO_SNDBUF: 3861 case SO_RCVBUF: 3862 case SO_SNDLOWAT: 3863 case SO_RCVLOWAT: 3864 error = so->so_proto->pr_setsbopt(so, sopt); 3865 if (error) 3866 goto bad; 3867 break; 3868 3869 case SO_SNDTIMEO: 3870 case SO_RCVTIMEO: 3871 #ifdef COMPAT_FREEBSD32 3872 if (SV_CURPROC_FLAG(SV_ILP32)) { 3873 struct timeval32 tv32; 3874 3875 error = sooptcopyin(sopt, &tv32, sizeof tv32, 3876 sizeof tv32); 3877 CP(tv32, tv, tv_sec); 3878 CP(tv32, tv, tv_usec); 3879 } else 3880 #endif 3881 error = sooptcopyin(sopt, &tv, sizeof tv, 3882 sizeof tv); 3883 if (error) 3884 goto bad; 3885 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 3886 tv.tv_usec >= 1000000) { 3887 error = EDOM; 3888 goto bad; 3889 } 3890 if (tv.tv_sec > INT32_MAX) 3891 val = SBT_MAX; 3892 else 3893 val = tvtosbt(tv); 3894 SOCK_LOCK(so); 3895 valp = sopt->sopt_name == SO_SNDTIMEO ? 3896 (SOLISTENING(so) ? &so->sol_sbsnd_timeo : 3897 &so->so_snd.sb_timeo) : 3898 (SOLISTENING(so) ? &so->sol_sbrcv_timeo : 3899 &so->so_rcv.sb_timeo); 3900 *valp = val; 3901 SOCK_UNLOCK(so); 3902 break; 3903 3904 case SO_LABEL: 3905 #ifdef MAC 3906 error = sooptcopyin(sopt, &extmac, sizeof extmac, 3907 sizeof extmac); 3908 if (error) 3909 goto bad; 3910 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 3911 so, &extmac); 3912 #else 3913 error = EOPNOTSUPP; 3914 #endif 3915 break; 3916 3917 case SO_TS_CLOCK: 3918 error = sooptcopyin(sopt, &optval, sizeof optval, 3919 sizeof optval); 3920 if (error) 3921 goto bad; 3922 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 3923 error = EINVAL; 3924 goto bad; 3925 } 3926 so->so_ts_clock = optval; 3927 break; 3928 3929 case SO_MAX_PACING_RATE: 3930 error = sooptcopyin(sopt, &val32, sizeof(val32), 3931 sizeof(val32)); 3932 if (error) 3933 goto bad; 3934 so->so_max_pacing_rate = val32; 3935 break; 3936 3937 case SO_SPLICE: { 3938 struct splice splice; 3939 3940 #ifdef COMPAT_FREEBSD32 3941 if (SV_CURPROC_FLAG(SV_ILP32)) { 3942 struct splice32 splice32; 3943 3944 error = sooptcopyin(sopt, &splice32, 3945 sizeof(splice32), sizeof(splice32)); 3946 if (error == 0) { 3947 splice.sp_fd = splice32.sp_fd; 3948 splice.sp_max = splice32.sp_max; 3949 CP(splice32.sp_idle, splice.sp_idle, 3950 tv_sec); 3951 CP(splice32.sp_idle, splice.sp_idle, 3952 tv_usec); 3953 } 3954 } else 3955 #endif 3956 { 3957 error = sooptcopyin(sopt, &splice, 3958 sizeof(splice), sizeof(splice)); 3959 } 3960 if (error) 3961 goto bad; 3962 #ifdef KTRACE 3963 if (KTRPOINT(curthread, KTR_STRUCT)) 3964 ktrsplice(&splice); 3965 #endif 3966 3967 error = splice_init(); 3968 if (error != 0) 3969 goto bad; 3970 3971 if (splice.sp_fd >= 0) { 3972 struct file *fp; 3973 struct socket *so2; 3974 3975 if (!cap_rights_contains(sopt->sopt_rights, 3976 &cap_recv_rights)) { 3977 error = ENOTCAPABLE; 3978 goto 
bad; 3979 } 3980 error = getsock(sopt->sopt_td, splice.sp_fd, 3981 &cap_send_rights, &fp); 3982 if (error != 0) 3983 goto bad; 3984 so2 = fp->f_data; 3985 3986 error = so_splice(so, so2, &splice); 3987 fdrop(fp, sopt->sopt_td); 3988 } else { 3989 error = so_unsplice(so, false); 3990 } 3991 break; 3992 } 3993 default: 3994 #ifdef SOCKET_HHOOK 3995 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3996 error = hhook_run_socket(so, sopt, 3997 HHOOK_SOCKET_OPT); 3998 else 3999 #endif 4000 error = ENOPROTOOPT; 4001 break; 4002 } 4003 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 4004 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 4005 } 4006 bad: 4007 CURVNET_RESTORE(); 4008 return (error); 4009 } 4010 4011 /* 4012 * Helper routine for getsockopt. 4013 */ 4014 int 4015 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 4016 { 4017 int error; 4018 size_t valsize; 4019 4020 error = 0; 4021 4022 /* 4023 * Documented get behavior is that we always return a value, possibly 4024 * truncated to fit in the user's buffer. Traditional behavior is 4025 * that we always tell the user precisely how much we copied, rather 4026 * than something useful like the total amount we had available for 4027 * her. Note that this interface is not idempotent; the entire 4028 * answer must be generated ahead of time. 4029 */ 4030 valsize = min(len, sopt->sopt_valsize); 4031 sopt->sopt_valsize = valsize; 4032 if (sopt->sopt_val != NULL) { 4033 if (sopt->sopt_td != NULL) 4034 error = copyout(buf, sopt->sopt_val, valsize); 4035 else 4036 bcopy(buf, sopt->sopt_val, valsize); 4037 } 4038 return (error); 4039 } 4040 4041 int 4042 sogetopt(struct socket *so, struct sockopt *sopt) 4043 { 4044 int error, optval; 4045 struct linger l; 4046 struct timeval tv; 4047 #ifdef MAC 4048 struct mac extmac; 4049 #endif 4050 4051 CURVNET_SET(so->so_vnet); 4052 error = 0; 4053 if (sopt->sopt_level != SOL_SOCKET) { 4054 if (so->so_proto->pr_ctloutput != NULL) 4055 error = (*so->so_proto->pr_ctloutput)(so, sopt); 4056 else 4057 error = ENOPROTOOPT; 4058 CURVNET_RESTORE(); 4059 return (error); 4060 } else { 4061 switch (sopt->sopt_name) { 4062 case SO_ACCEPTFILTER: 4063 error = accept_filt_getopt(so, sopt); 4064 break; 4065 4066 case SO_LINGER: 4067 SOCK_LOCK(so); 4068 l.l_onoff = so->so_options & SO_LINGER; 4069 l.l_linger = so->so_linger; 4070 SOCK_UNLOCK(so); 4071 error = sooptcopyout(sopt, &l, sizeof l); 4072 break; 4073 4074 case SO_USELOOPBACK: 4075 case SO_DONTROUTE: 4076 case SO_DEBUG: 4077 case SO_KEEPALIVE: 4078 case SO_REUSEADDR: 4079 case SO_REUSEPORT: 4080 case SO_REUSEPORT_LB: 4081 case SO_BROADCAST: 4082 case SO_OOBINLINE: 4083 case SO_ACCEPTCONN: 4084 case SO_TIMESTAMP: 4085 case SO_BINTIME: 4086 case SO_NOSIGPIPE: 4087 case SO_NO_DDP: 4088 case SO_NO_OFFLOAD: 4089 case SO_RERROR: 4090 optval = so->so_options & sopt->sopt_name; 4091 integer: 4092 error = sooptcopyout(sopt, &optval, sizeof optval); 4093 break; 4094 4095 case SO_DOMAIN: 4096 optval = so->so_proto->pr_domain->dom_family; 4097 goto integer; 4098 4099 case SO_TYPE: 4100 optval = so->so_type; 4101 goto integer; 4102 4103 case SO_PROTOCOL: 4104 optval = so->so_proto->pr_protocol; 4105 goto integer; 4106 4107 case SO_ERROR: 4108 SOCK_LOCK(so); 4109 if (so->so_error) { 4110 optval = so->so_error; 4111 so->so_error = 0; 4112 } else { 4113 optval = so->so_rerror; 4114 so->so_rerror = 0; 4115 } 4116 SOCK_UNLOCK(so); 4117 goto integer; 4118 4119 case SO_SNDBUF: 4120 SOCK_LOCK(so); 4121 optval = SOLISTENING(so) ? 
so->sol_sbsnd_hiwat : 4122 so->so_snd.sb_hiwat; 4123 SOCK_UNLOCK(so); 4124 goto integer; 4125 4126 case SO_RCVBUF: 4127 SOCK_LOCK(so); 4128 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 4129 so->so_rcv.sb_hiwat; 4130 SOCK_UNLOCK(so); 4131 goto integer; 4132 4133 case SO_SNDLOWAT: 4134 SOCK_LOCK(so); 4135 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 4136 so->so_snd.sb_lowat; 4137 SOCK_UNLOCK(so); 4138 goto integer; 4139 4140 case SO_RCVLOWAT: 4141 SOCK_LOCK(so); 4142 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 4143 so->so_rcv.sb_lowat; 4144 SOCK_UNLOCK(so); 4145 goto integer; 4146 4147 case SO_SNDTIMEO: 4148 case SO_RCVTIMEO: 4149 SOCK_LOCK(so); 4150 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 4151 (SOLISTENING(so) ? so->sol_sbsnd_timeo : 4152 so->so_snd.sb_timeo) : 4153 (SOLISTENING(so) ? so->sol_sbrcv_timeo : 4154 so->so_rcv.sb_timeo)); 4155 SOCK_UNLOCK(so); 4156 #ifdef COMPAT_FREEBSD32 4157 if (SV_CURPROC_FLAG(SV_ILP32)) { 4158 struct timeval32 tv32; 4159 4160 CP(tv, tv32, tv_sec); 4161 CP(tv, tv32, tv_usec); 4162 error = sooptcopyout(sopt, &tv32, sizeof tv32); 4163 } else 4164 #endif 4165 error = sooptcopyout(sopt, &tv, sizeof tv); 4166 break; 4167 4168 case SO_LABEL: 4169 #ifdef MAC 4170 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 4171 sizeof(extmac)); 4172 if (error) 4173 goto bad; 4174 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 4175 so, &extmac); 4176 if (error) 4177 goto bad; 4178 /* Don't copy out extmac, it is unchanged. */ 4179 #else 4180 error = EOPNOTSUPP; 4181 #endif 4182 break; 4183 4184 case SO_PEERLABEL: 4185 #ifdef MAC 4186 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 4187 sizeof(extmac)); 4188 if (error) 4189 goto bad; 4190 error = mac_getsockopt_peerlabel( 4191 sopt->sopt_td->td_ucred, so, &extmac); 4192 if (error) 4193 goto bad; 4194 /* Don't copy out extmac, it is unchanged. */ 4195 #else 4196 error = EOPNOTSUPP; 4197 #endif 4198 break; 4199 4200 case SO_LISTENQLIMIT: 4201 SOCK_LOCK(so); 4202 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 4203 SOCK_UNLOCK(so); 4204 goto integer; 4205 4206 case SO_LISTENQLEN: 4207 SOCK_LOCK(so); 4208 optval = SOLISTENING(so) ? so->sol_qlen : 0; 4209 SOCK_UNLOCK(so); 4210 goto integer; 4211 4212 case SO_LISTENINCQLEN: 4213 SOCK_LOCK(so); 4214 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 4215 SOCK_UNLOCK(so); 4216 goto integer; 4217 4218 case SO_TS_CLOCK: 4219 optval = so->so_ts_clock; 4220 goto integer; 4221 4222 case SO_MAX_PACING_RATE: 4223 optval = so->so_max_pacing_rate; 4224 goto integer; 4225 4226 case SO_SPLICE: { 4227 off_t n; 4228 4229 /* 4230 * Acquire the I/O lock to serialize with 4231 * so_splice_xfer(). This is not required for 4232 * correctness, but makes testing simpler: once a byte 4233 * has been transmitted to the sink and observed (e.g., 4234 * by reading from the socket to which the sink is 4235 * connected), a subsequent getsockopt(SO_SPLICE) will 4236 * return an up-to-date value. 
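 *
 * As an illustrative sketch (hypothetical descriptors s and s2, both
 * assumed to be connected sockets, remaining splice fields left at
 * zero), a userland consumer might splice s into s2 and later query
 * the byte count reported here:
 *
 *	struct splice sp = { .sp_fd = s2 };
 *	off_t n;
 *	socklen_t len = sizeof(n);
 *
 *	if (setsockopt(s, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == 0 &&
 *	    getsockopt(s, SOL_SOCKET, SO_SPLICE, &n, &len) == 0)
 *		printf("%jd bytes spliced so far\n", (intmax_t)n);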
4237 */ 4238 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT); 4239 if (error != 0) 4240 goto bad; 4241 SOCK_LOCK(so); 4242 if (SOLISTENING(so)) { 4243 n = 0; 4244 } else { 4245 n = so->so_splice_sent; 4246 } 4247 SOCK_UNLOCK(so); 4248 SOCK_IO_RECV_UNLOCK(so); 4249 error = sooptcopyout(sopt, &n, sizeof(n)); 4250 break; 4251 } 4252 4253 default: 4254 #ifdef SOCKET_HHOOK 4255 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 4256 error = hhook_run_socket(so, sopt, 4257 HHOOK_SOCKET_OPT); 4258 else 4259 #endif 4260 error = ENOPROTOOPT; 4261 break; 4262 } 4263 } 4264 bad: 4265 CURVNET_RESTORE(); 4266 return (error); 4267 } 4268 4269 int 4270 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 4271 { 4272 struct mbuf *m, *m_prev; 4273 int sopt_size = sopt->sopt_valsize; 4274 4275 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 4276 if (m == NULL) 4277 return (ENOBUFS); 4278 if (sopt_size > MLEN) { 4279 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 4280 if ((m->m_flags & M_EXT) == 0) { 4281 m_free(m); 4282 return (ENOBUFS); 4283 } 4284 m->m_len = min(MCLBYTES, sopt_size); 4285 } else { 4286 m->m_len = min(MLEN, sopt_size); 4287 } 4288 sopt_size -= m->m_len; 4289 *mp = m; 4290 m_prev = m; 4291 4292 while (sopt_size) { 4293 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 4294 if (m == NULL) { 4295 m_freem(*mp); 4296 return (ENOBUFS); 4297 } 4298 if (sopt_size > MLEN) { 4299 MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : 4300 M_NOWAIT); 4301 if ((m->m_flags & M_EXT) == 0) { 4302 m_freem(m); 4303 m_freem(*mp); 4304 return (ENOBUFS); 4305 } 4306 m->m_len = min(MCLBYTES, sopt_size); 4307 } else { 4308 m->m_len = min(MLEN, sopt_size); 4309 } 4310 sopt_size -= m->m_len; 4311 m_prev->m_next = m; 4312 m_prev = m; 4313 } 4314 return (0); 4315 } 4316 4317 int 4318 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 4319 { 4320 struct mbuf *m0 = m; 4321 4322 if (sopt->sopt_val == NULL) 4323 return (0); 4324 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 4325 if (sopt->sopt_td != NULL) { 4326 int error; 4327 4328 error = copyin(sopt->sopt_val, mtod(m, char *), 4329 m->m_len); 4330 if (error != 0) { 4331 m_freem(m0); 4332 return (error); 4333 } 4334 } else 4335 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 4336 sopt->sopt_valsize -= m->m_len; 4337 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 4338 m = m->m_next; 4339 } 4340 if (m != NULL) /* the chain should have been allocated with enough space at ip6_sooptmcopyin() */ 4341 panic("ip6_sooptmcopyin"); 4342 return (0); 4343 } 4344 4345 int 4346 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 4347 { 4348 struct mbuf *m0 = m; 4349 size_t valsize = 0; 4350 4351 if (sopt->sopt_val == NULL) 4352 return (0); 4353 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 4354 if (sopt->sopt_td != NULL) { 4355 int error; 4356 4357 error = copyout(mtod(m, char *), sopt->sopt_val, 4358 m->m_len); 4359 if (error != 0) { 4360 m_freem(m0); 4361 return (error); 4362 } 4363 } else 4364 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 4365 sopt->sopt_valsize -= m->m_len; 4366 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 4367 valsize += m->m_len; 4368 m = m->m_next; 4369 } 4370 if (m != NULL) { 4371 /* user-land should have supplied a large enough option buffer */ 4372 m_freem(m0); 4373 return (EINVAL); 4374 } 4375 sopt->sopt_valsize = valsize; 4376 return (0); 4377 } 4378 4379 /* 4380 * sohasoutofband(): protocol notifies socket layer of the arrival of new 4381 * out-of-band data, which will then notify socket consumers.
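 *
 * Consumers are notified both by SIGURG, delivered to the owner
 * recorded in so_sigio (typically set with fcntl(F_SETOWN)), and by a
 * select/poll wakeup on the receive side.  As an illustrative
 * userland sketch (hypothetical descriptor s), a consumer might wait
 * for urgent data with poll(2) and then pull it out of band:
 *
 *	struct pollfd pfd = { .fd = s, .events = POLLPRI };
 *	char c;
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI) != 0)
 *		(void)recv(s, &c, 1, MSG_OOB);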
4382 */ 4383 void 4384 sohasoutofband(struct socket *so) 4385 { 4386 4387 if (so->so_sigio != NULL) 4388 pgsigio(&so->so_sigio, SIGURG, 0); 4389 selwakeuppri(&so->so_rdsel, PSOCK); 4390 } 4391 4392 int 4393 sopoll(struct socket *so, int events, struct ucred *active_cred, 4394 struct thread *td) 4395 { 4396 4397 /* 4398 * We do not need to set or assert curvnet as long as everyone uses 4399 * sopoll_generic(). 4400 */ 4401 return (so->so_proto->pr_sopoll(so, events, active_cred, td)); 4402 } 4403 4404 int 4405 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 4406 struct thread *td) 4407 { 4408 int revents; 4409 4410 SOCK_LOCK(so); 4411 if (SOLISTENING(so)) { 4412 if (!(events & (POLLIN | POLLRDNORM))) 4413 revents = 0; 4414 else if (!TAILQ_EMPTY(&so->sol_comp)) 4415 revents = events & (POLLIN | POLLRDNORM); 4416 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 4417 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 4418 else { 4419 selrecord(td, &so->so_rdsel); 4420 revents = 0; 4421 } 4422 } else { 4423 revents = 0; 4424 SOCK_SENDBUF_LOCK(so); 4425 SOCK_RECVBUF_LOCK(so); 4426 if (events & (POLLIN | POLLRDNORM)) 4427 if (soreadabledata(so) && !isspliced(so)) 4428 revents |= events & (POLLIN | POLLRDNORM); 4429 if (events & (POLLOUT | POLLWRNORM)) 4430 if (sowriteable(so) && !issplicedback(so)) 4431 revents |= events & (POLLOUT | POLLWRNORM); 4432 if (events & (POLLPRI | POLLRDBAND)) 4433 if (so->so_oobmark || 4434 (so->so_rcv.sb_state & SBS_RCVATMARK)) 4435 revents |= events & (POLLPRI | POLLRDBAND); 4436 if ((events & POLLINIGNEOF) == 0) { 4437 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 4438 revents |= events & (POLLIN | POLLRDNORM); 4439 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 4440 revents |= POLLHUP; 4441 } 4442 } 4443 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4444 revents |= events & POLLRDHUP; 4445 if (revents == 0) { 4446 if (events & 4447 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { 4448 selrecord(td, &so->so_rdsel); 4449 so->so_rcv.sb_flags |= SB_SEL; 4450 } 4451 if (events & (POLLOUT | POLLWRNORM)) { 4452 selrecord(td, &so->so_wrsel); 4453 so->so_snd.sb_flags |= SB_SEL; 4454 } 4455 } 4456 SOCK_RECVBUF_UNLOCK(so); 4457 SOCK_SENDBUF_UNLOCK(so); 4458 } 4459 SOCK_UNLOCK(so); 4460 return (revents); 4461 } 4462 4463 int 4464 soo_kqfilter(struct file *fp, struct knote *kn) 4465 { 4466 struct socket *so = kn->kn_fp->f_data; 4467 struct sockbuf *sb; 4468 sb_which which; 4469 struct knlist *knl; 4470 4471 switch (kn->kn_filter) { 4472 case EVFILT_READ: 4473 kn->kn_fop = &soread_filtops; 4474 knl = &so->so_rdsel.si_note; 4475 sb = &so->so_rcv; 4476 which = SO_RCV; 4477 break; 4478 case EVFILT_WRITE: 4479 kn->kn_fop = &sowrite_filtops; 4480 knl = &so->so_wrsel.si_note; 4481 sb = &so->so_snd; 4482 which = SO_SND; 4483 break; 4484 case EVFILT_EMPTY: 4485 kn->kn_fop = &soempty_filtops; 4486 knl = &so->so_wrsel.si_note; 4487 sb = &so->so_snd; 4488 which = SO_SND; 4489 break; 4490 default: 4491 return (EINVAL); 4492 } 4493 4494 SOCK_LOCK(so); 4495 if (SOLISTENING(so)) { 4496 knlist_add(knl, kn, 1); 4497 } else { 4498 SOCK_BUF_LOCK(so, which); 4499 knlist_add(knl, kn, 1); 4500 sb->sb_flags |= SB_KNOTE; 4501 SOCK_BUF_UNLOCK(so, which); 4502 } 4503 SOCK_UNLOCK(so); 4504 return (0); 4505 } 4506 4507 static void 4508 filt_sordetach(struct knote *kn) 4509 { 4510 struct socket *so = kn->kn_fp->f_data; 4511 4512 so_rdknl_lock(so); 4513 knlist_remove(&so->so_rdsel.si_note, kn, 1); 4514 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 4515 
so->so_rcv.sb_flags &= ~SB_KNOTE; 4516 so_rdknl_unlock(so); 4517 } 4518 4519 /*ARGSUSED*/ 4520 static int 4521 filt_soread(struct knote *kn, long hint) 4522 { 4523 struct socket *so; 4524 4525 so = kn->kn_fp->f_data; 4526 4527 if (SOLISTENING(so)) { 4528 SOCK_LOCK_ASSERT(so); 4529 kn->kn_data = so->sol_qlen; 4530 if (so->so_error) { 4531 kn->kn_flags |= EV_EOF; 4532 kn->kn_fflags = so->so_error; 4533 return (1); 4534 } 4535 return (!TAILQ_EMPTY(&so->sol_comp)); 4536 } 4537 4538 if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) 4539 return (0); 4540 4541 SOCK_RECVBUF_LOCK_ASSERT(so); 4542 4543 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 4544 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 4545 kn->kn_flags |= EV_EOF; 4546 kn->kn_fflags = so->so_error; 4547 return (1); 4548 } else if (so->so_error || so->so_rerror) 4549 return (1); 4550 4551 if (kn->kn_sfflags & NOTE_LOWAT) { 4552 if (kn->kn_data >= kn->kn_sdata) 4553 return (1); 4554 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 4555 return (1); 4556 4557 #ifdef SOCKET_HHOOK 4558 /* This hook returning non-zero indicates an event, not error */ 4559 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 4560 #else 4561 return (0); 4562 #endif 4563 } 4564 4565 static void 4566 filt_sowdetach(struct knote *kn) 4567 { 4568 struct socket *so = kn->kn_fp->f_data; 4569 4570 so_wrknl_lock(so); 4571 knlist_remove(&so->so_wrsel.si_note, kn, 1); 4572 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 4573 so->so_snd.sb_flags &= ~SB_KNOTE; 4574 so_wrknl_unlock(so); 4575 } 4576 4577 /*ARGSUSED*/ 4578 static int 4579 filt_sowrite(struct knote *kn, long hint) 4580 { 4581 struct socket *so; 4582 4583 so = kn->kn_fp->f_data; 4584 4585 if (SOLISTENING(so)) 4586 return (0); 4587 4588 SOCK_SENDBUF_LOCK_ASSERT(so); 4589 kn->kn_data = sbspace(&so->so_snd); 4590 4591 #ifdef SOCKET_HHOOK 4592 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 4593 #endif 4594 4595 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 4596 kn->kn_flags |= EV_EOF; 4597 kn->kn_fflags = so->so_error; 4598 return (1); 4599 } else if (so->so_error) /* temporary udp error */ 4600 return (1); 4601 else if (((so->so_state & SS_ISCONNECTED) == 0) && 4602 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 4603 return (0); 4604 else if (kn->kn_sfflags & NOTE_LOWAT) 4605 return (kn->kn_data >= kn->kn_sdata); 4606 else 4607 return (kn->kn_data >= so->so_snd.sb_lowat); 4608 } 4609 4610 static int 4611 filt_soempty(struct knote *kn, long hint) 4612 { 4613 struct socket *so; 4614 4615 so = kn->kn_fp->f_data; 4616 4617 if (SOLISTENING(so)) 4618 return (1); 4619 4620 SOCK_SENDBUF_LOCK_ASSERT(so); 4621 kn->kn_data = sbused(&so->so_snd); 4622 4623 if (kn->kn_data == 0) 4624 return (1); 4625 else 4626 return (0); 4627 } 4628 4629 int 4630 socheckuid(struct socket *so, uid_t uid) 4631 { 4632 4633 if (so == NULL) 4634 return (EPERM); 4635 if (so->so_cred->cr_uid != uid) 4636 return (EPERM); 4637 return (0); 4638 } 4639 4640 /* 4641 * These functions are used by protocols to notify the socket layer (and its 4642 * consumers) of state changes in the sockets driven by protocol-side events. 4643 */ 4644 4645 /* 4646 * Procedures to manipulate state flags of socket and do appropriate wakeups. 4647 * 4648 * Normal sequence from the active (originating) side is that 4649 * soisconnecting() is called during processing of connect() call, resulting 4650 * in an eventual call to soisconnected() if/when the connection is 4651 * established. 
When the connection is torn down, soisdisconnecting() is 4652 * called during processing of a disconnect() call, and soisdisconnected() is 4653 * called when the connection to the peer is totally severed. The semantics 4654 * of these routines are such that connectionless protocols can call 4655 * soisconnected() and soisdisconnected() only, bypassing the in-progress 4656 * calls when setting up a ``connection'' takes no time. 4657 * 4658 * From the passive side, a socket is created with two queues of sockets: 4659 * so_incomp for connections in progress and so_comp for connections already 4660 * made and awaiting user acceptance. As a protocol is preparing incoming 4661 * connections, it creates a socket structure queued on so_incomp by calling 4662 * sonewconn(). When the connection is established, soisconnected() is 4663 * called, and transfers the socket structure to so_comp, making it available 4664 * to accept(). 4665 * 4666 * If a socket is closed with sockets on either so_incomp or so_comp, these 4667 * sockets are dropped. 4668 * 4669 * If higher-level protocols are implemented in the kernel, the wakeups done 4670 * here will sometimes cause software-interrupt process scheduling. 4671 */ 4672 void 4673 soisconnecting(struct socket *so) 4674 { 4675 4676 SOCK_LOCK(so); 4677 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 4678 so->so_state |= SS_ISCONNECTING; 4679 SOCK_UNLOCK(so); 4680 } 4681 4682 void 4683 soisconnected(struct socket *so) 4684 { 4685 bool last __diagused; 4686 4687 SOCK_LOCK(so); 4688 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); 4689 so->so_state |= SS_ISCONNECTED; 4690 4691 if (so->so_qstate == SQ_INCOMP) { 4692 struct socket *head = so->so_listen; 4693 int ret; 4694 4695 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 4696 /* 4697 * When promoting a socket from the incomplete queue to the 4698 * complete queue, we need to acquire the locks in reverse 4699 * order. We first trylock, and if that does not succeed, we 4700 * go the hard way: leave a reference, relock in the proper 4701 * order, and recheck consistency. 4702 */ 4703 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 4704 soref(head); 4705 SOCK_UNLOCK(so); 4706 SOLISTEN_LOCK(head); 4707 SOCK_LOCK(so); 4708 if (__predict_false(head != so->so_listen)) { 4709 /* 4710 * The socket went off the listen queue, most 4711 * likely because we lost a race with close(2) 4712 * on the listening socket, and it is about to be aborted by soabort().
4713 */ 4714 SOCK_UNLOCK(so); 4715 sorele_locked(head); 4716 return; 4717 } 4718 last = refcount_release(&head->so_count); 4719 KASSERT(!last, ("%s: released last reference for %p", 4720 __func__, head)); 4721 } 4722 again: 4723 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 4724 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 4725 head->sol_incqlen--; 4726 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 4727 head->sol_qlen++; 4728 so->so_qstate = SQ_COMP; 4729 SOCK_UNLOCK(so); 4730 solisten_wakeup(head); /* unlocks */ 4731 } else { 4732 SOCK_RECVBUF_LOCK(so); 4733 soupcall_set(so, SO_RCV, 4734 head->sol_accept_filter->accf_callback, 4735 head->sol_accept_filter_arg); 4736 so->so_options &= ~SO_ACCEPTFILTER; 4737 ret = head->sol_accept_filter->accf_callback(so, 4738 head->sol_accept_filter_arg, M_NOWAIT); 4739 if (ret == SU_ISCONNECTED) { 4740 soupcall_clear(so, SO_RCV); 4741 SOCK_RECVBUF_UNLOCK(so); 4742 goto again; 4743 } 4744 SOCK_RECVBUF_UNLOCK(so); 4745 SOCK_UNLOCK(so); 4746 SOLISTEN_UNLOCK(head); 4747 } 4748 return; 4749 } 4750 SOCK_UNLOCK(so); 4751 wakeup(&so->so_timeo); 4752 sorwakeup(so); 4753 sowwakeup(so); 4754 } 4755 4756 void 4757 soisdisconnecting(struct socket *so) 4758 { 4759 4760 SOCK_LOCK(so); 4761 so->so_state &= ~SS_ISCONNECTING; 4762 so->so_state |= SS_ISDISCONNECTING; 4763 4764 if (!SOLISTENING(so)) { 4765 SOCK_RECVBUF_LOCK(so); 4766 socantrcvmore_locked(so); 4767 SOCK_SENDBUF_LOCK(so); 4768 socantsendmore_locked(so); 4769 } 4770 SOCK_UNLOCK(so); 4771 wakeup(&so->so_timeo); 4772 } 4773 4774 void 4775 soisdisconnected(struct socket *so) 4776 { 4777 4778 SOCK_LOCK(so); 4779 4780 /* 4781 * There is at least one reader of so_state that does not 4782 * acquire socket lock, namely soreceive_generic(). Ensure 4783 * that it never sees all flags that track connection status 4784 * cleared, by ordering the update with a barrier semantic of 4785 * our release thread fence. 4786 */ 4787 so->so_state |= SS_ISDISCONNECTED; 4788 atomic_thread_fence_rel(); 4789 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 4790 4791 if (!SOLISTENING(so)) { 4792 SOCK_UNLOCK(so); 4793 SOCK_RECVBUF_LOCK(so); 4794 socantrcvmore_locked(so); 4795 SOCK_SENDBUF_LOCK(so); 4796 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 4797 socantsendmore_locked(so); 4798 } else 4799 SOCK_UNLOCK(so); 4800 wakeup(&so->so_timeo); 4801 } 4802 4803 int 4804 soiolock(struct socket *so, struct sx *sx, int flags) 4805 { 4806 int error; 4807 4808 KASSERT((flags & SBL_VALID) == flags, 4809 ("soiolock: invalid flags %#x", flags)); 4810 4811 if ((flags & SBL_WAIT) != 0) { 4812 if ((flags & SBL_NOINTR) != 0) { 4813 sx_xlock(sx); 4814 } else { 4815 error = sx_xlock_sig(sx); 4816 if (error != 0) 4817 return (error); 4818 } 4819 } else if (!sx_try_xlock(sx)) { 4820 return (EWOULDBLOCK); 4821 } 4822 4823 if (__predict_false(SOLISTENING(so))) { 4824 sx_xunlock(sx); 4825 return (ENOTCONN); 4826 } 4827 return (0); 4828 } 4829 4830 void 4831 soiounlock(struct sx *sx) 4832 { 4833 sx_xunlock(sx); 4834 } 4835 4836 /* 4837 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 4838 */ 4839 struct sockaddr * 4840 sodupsockaddr(const struct sockaddr *sa, int mflags) 4841 { 4842 struct sockaddr *sa2; 4843 4844 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 4845 if (sa2) 4846 bcopy(sa, sa2, sa->sa_len); 4847 return sa2; 4848 } 4849 4850 /* 4851 * Register per-socket destructor. 
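 *
 * Illustrative sketch (hypothetical destructor my_dtor, assumed to
 * match so_dtor_t): an in-kernel consumer registers the destructor
 * while holding the socket lock, as asserted below:
 *
 *	SOCK_LOCK(so);
 *	sodtor_set(so, my_dtor);
 *	SOCK_UNLOCK(so);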
4852 */ 4853 void 4854 sodtor_set(struct socket *so, so_dtor_t *func) 4855 { 4856 4857 SOCK_LOCK_ASSERT(so); 4858 so->so_dtor = func; 4859 } 4860 4861 /* 4862 * Register per-socket buffer upcalls. 4863 */ 4864 void 4865 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) 4866 { 4867 struct sockbuf *sb; 4868 4869 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4870 4871 switch (which) { 4872 case SO_RCV: 4873 sb = &so->so_rcv; 4874 break; 4875 case SO_SND: 4876 sb = &so->so_snd; 4877 break; 4878 } 4879 SOCK_BUF_LOCK_ASSERT(so, which); 4880 sb->sb_upcall = func; 4881 sb->sb_upcallarg = arg; 4882 sb->sb_flags |= SB_UPCALL; 4883 } 4884 4885 void 4886 soupcall_clear(struct socket *so, sb_which which) 4887 { 4888 struct sockbuf *sb; 4889 4890 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4891 4892 switch (which) { 4893 case SO_RCV: 4894 sb = &so->so_rcv; 4895 break; 4896 case SO_SND: 4897 sb = &so->so_snd; 4898 break; 4899 } 4900 SOCK_BUF_LOCK_ASSERT(so, which); 4901 KASSERT(sb->sb_upcall != NULL, 4902 ("%s: so %p no upcall to clear", __func__, so)); 4903 sb->sb_upcall = NULL; 4904 sb->sb_upcallarg = NULL; 4905 sb->sb_flags &= ~SB_UPCALL; 4906 } 4907 4908 void 4909 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) 4910 { 4911 4912 SOLISTEN_LOCK_ASSERT(so); 4913 so->sol_upcall = func; 4914 so->sol_upcallarg = arg; 4915 } 4916 4917 static void 4918 so_rdknl_lock(void *arg) 4919 { 4920 struct socket *so = arg; 4921 4922 retry: 4923 if (SOLISTENING(so)) { 4924 SOLISTEN_LOCK(so); 4925 } else { 4926 SOCK_RECVBUF_LOCK(so); 4927 if (__predict_false(SOLISTENING(so))) { 4928 SOCK_RECVBUF_UNLOCK(so); 4929 goto retry; 4930 } 4931 } 4932 } 4933 4934 static void 4935 so_rdknl_unlock(void *arg) 4936 { 4937 struct socket *so = arg; 4938 4939 if (SOLISTENING(so)) 4940 SOLISTEN_UNLOCK(so); 4941 else 4942 SOCK_RECVBUF_UNLOCK(so); 4943 } 4944 4945 static void 4946 so_rdknl_assert_lock(void *arg, int what) 4947 { 4948 struct socket *so = arg; 4949 4950 if (what == LA_LOCKED) { 4951 if (SOLISTENING(so)) 4952 SOLISTEN_LOCK_ASSERT(so); 4953 else 4954 SOCK_RECVBUF_LOCK_ASSERT(so); 4955 } else { 4956 if (SOLISTENING(so)) 4957 SOLISTEN_UNLOCK_ASSERT(so); 4958 else 4959 SOCK_RECVBUF_UNLOCK_ASSERT(so); 4960 } 4961 } 4962 4963 static void 4964 so_wrknl_lock(void *arg) 4965 { 4966 struct socket *so = arg; 4967 4968 retry: 4969 if (SOLISTENING(so)) { 4970 SOLISTEN_LOCK(so); 4971 } else { 4972 SOCK_SENDBUF_LOCK(so); 4973 if (__predict_false(SOLISTENING(so))) { 4974 SOCK_SENDBUF_UNLOCK(so); 4975 goto retry; 4976 } 4977 } 4978 } 4979 4980 static void 4981 so_wrknl_unlock(void *arg) 4982 { 4983 struct socket *so = arg; 4984 4985 if (SOLISTENING(so)) 4986 SOLISTEN_UNLOCK(so); 4987 else 4988 SOCK_SENDBUF_UNLOCK(so); 4989 } 4990 4991 static void 4992 so_wrknl_assert_lock(void *arg, int what) 4993 { 4994 struct socket *so = arg; 4995 4996 if (what == LA_LOCKED) { 4997 if (SOLISTENING(so)) 4998 SOLISTEN_LOCK_ASSERT(so); 4999 else 5000 SOCK_SENDBUF_LOCK_ASSERT(so); 5001 } else { 5002 if (SOLISTENING(so)) 5003 SOLISTEN_UNLOCK_ASSERT(so); 5004 else 5005 SOCK_SENDBUF_UNLOCK_ASSERT(so); 5006 } 5007 } 5008 5009 /* 5010 * Create an external-format (``xsocket'') structure using the information in 5011 * the kernel-format socket structure pointed to by so. 
This is done to 5012 * reduce the spew of irrelevant information over this interface, to isolate 5013 * user code from changes in the kernel structure, and potentially to provide 5014 * information-hiding if we decide that some of this information should be 5015 * hidden from users. 5016 */ 5017 void 5018 sotoxsocket(struct socket *so, struct xsocket *xso) 5019 { 5020 5021 bzero(xso, sizeof(*xso)); 5022 xso->xso_len = sizeof *xso; 5023 xso->xso_so = (uintptr_t)so; 5024 xso->so_type = so->so_type; 5025 xso->so_options = so->so_options; 5026 xso->so_linger = so->so_linger; 5027 xso->so_state = so->so_state; 5028 xso->so_pcb = (uintptr_t)so->so_pcb; 5029 xso->xso_protocol = so->so_proto->pr_protocol; 5030 xso->xso_family = so->so_proto->pr_domain->dom_family; 5031 xso->so_timeo = so->so_timeo; 5032 xso->so_error = so->so_error; 5033 xso->so_uid = so->so_cred->cr_uid; 5034 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 5035 SOCK_LOCK(so); 5036 xso->so_fibnum = so->so_fibnum; 5037 if (SOLISTENING(so)) { 5038 xso->so_qlen = so->sol_qlen; 5039 xso->so_incqlen = so->sol_incqlen; 5040 xso->so_qlimit = so->sol_qlimit; 5041 xso->so_oobmark = 0; 5042 } else { 5043 xso->so_state |= so->so_qstate; 5044 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; 5045 xso->so_oobmark = so->so_oobmark; 5046 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 5047 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 5048 if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) 5049 xso->so_splice_so = (uintptr_t)so->so_splice->dst; 5050 } 5051 SOCK_UNLOCK(so); 5052 } 5053 5054 int 5055 so_options_get(const struct socket *so) 5056 { 5057 5058 return (so->so_options); 5059 } 5060 5061 void 5062 so_options_set(struct socket *so, int val) 5063 { 5064 5065 so->so_options = val; 5066 } 5067 5068 int 5069 so_error_get(const struct socket *so) 5070 { 5071 5072 return (so->so_error); 5073 } 5074 5075 void 5076 so_error_set(struct socket *so, int val) 5077 { 5078 5079 so->so_error = val; 5080 } 5081
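
/*
 * Illustrative userland sketch (hypothetical names s and kq): adding a
 * read filter for a socket descriptor with kqueue(2) reaches
 * soo_kqfilter() above with EVFILT_READ, after which readability is
 * evaluated by filt_soread():
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		warn("kevent");
 */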