/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pr_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pr_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pr_attach() has
 * been successfully called.  If pr_attach() returned an error,
 * pr_detach() will not be called.  Socket layer private.
 *
 * pr_abort() and pr_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pr_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.  pr_fdclose() is called when userspace invokes close(2) on a socket
 * file descriptor.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface through which the
 * socket layer attempts to free a socket when a reference is removed.  This
 * is a socket layer private interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs the general rule is that callers do not set
 * curvnet.  Exceptions to this rule include soabort(), sodisconnect(),
 * sofree(), sorele(), sonewconn() and sorflush(), which are usually called
 * from a pre-set VNET context.  sopoll_generic() currently does not need a
 * VNET context to be set.
 */
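/*
 * Editorial example (not part of the original file): a kernel consumer
 * following the life cycle above brackets its use of a socket with
 * socreate() and soclose(); the TCP parameters here are illustrative only:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... sobind()/soconnect()/sosend()/soreceive() as required ...
 *	error = soclose(so);
 */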
#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ktrace.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/hhook.h>
#include <sys/kernel.h>
#include <sys/khelp.h>
#include <sys/kthread.h>
#include <sys/ktls.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <sys/jail.h>
#include <sys/syslog.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>

#include <net/vnet.h>

#include <security/mac/mac_framework.h>
#include <security/mac/mac_internal.h>

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_generic_locked(struct socket *so,
		    struct sockaddr **psa, struct uio *uio, struct mbuf **mp,
		    struct mbuf **controlp, int *flagsp);
static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);
static int	soreceive_stream_locked(struct socket *so, struct sockbuf *sb,
		    struct sockaddr **psa, struct uio *uio, struct mbuf **mp,
		    struct mbuf **controlp, int flags);
static int	sosend_generic_locked(struct socket *so, struct sockaddr *addr,
		    struct uio *uio, struct mbuf *top, struct mbuf *control,
		    int flags, struct thread *td);
static void	so_rdknl_lock(void *);
static void	so_rdknl_unlock(void *);
static void	so_rdknl_assert_lock(void *, int);
static void	so_wrknl_lock(void *);
static void	so_wrknl_unlock(void *);
static void	so_wrknl_assert_lock(void *, int);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_soempty(struct knote *kn, long hint);

static const struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_copy = knote_triv_copy,
};
static const struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_copy = knote_triv_copy,
};
static const struct filterops soempty_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_soempty,
	.f_copy = knote_triv_copy,
};

so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

#ifdef SOCKET_HHOOK
VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
#define	V_socket_hhh	VNET(socket_hhh)
static inline int hhook_run_socket(struct socket *, void *, int32_t);
#endif

#ifdef COMPAT_FREEBSD32
#ifdef __amd64__
/* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. */
#define	__splice32_packed	__packed
#else
#define	__splice32_packed
#endif
struct splice32 {
	int32_t	sp_fd;
	int64_t sp_max;
	struct timeval32 sp_idle;
} __splice32_packed;
#undef __splice32_packed
#endif

/*
 * Limit on the number of connections in the listen queue waiting
 * for accept(2).
 * NB: The original sysctl somaxconn is still available but hidden
 * to prevent confusion about the actual purpose of this number.
 */
VNET_DEFINE_STATIC(u_int, somaxconn) = SOMAXCONN;
#define	V_somaxconn	VNET(somaxconn)

static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
	int error;
	u_int val;

	val = V_somaxconn;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	/*
	 * The purpose of the UINT_MAX / 3 limit is to ensure that the
	 * formula
	 *	3 * sol_qlimit / 2
	 * below will not overflow.
	 */

	if (val < 1 || val > UINT_MAX / 3)
		return (EINVAL);

	V_somaxconn = val;
	return (0);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0,
    sizeof(u_int), sysctl_somaxconn, "IU",
    "Maximum listen socket pending connection accept queue size");
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE | CTLFLAG_VNET,
    0, sizeof(u_int), sysctl_somaxconn, "IU",
    "Maximum listen socket pending connection accept queue size (compat)");
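/*
 * Editorial worked example: solisten_clone() treats a listen queue as
 * overflowing once sol_qlen exceeds 3 * sol_qlimit / 2.  With the default
 * kern.ipc.soacceptqueue of 128 (SOMAXCONN), a listener using the full
 * backlog has sol_qlimit = 128 and overflows once more than 192 connections
 * are pending acceptance.  The UINT_MAX / 3 bound above keeps that
 * multiplication from overflowing.
 */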
static u_int numopensockets;
static int
sysctl_numopensockets(SYSCTL_HANDLER_ARGS)
{
	u_int val;

#ifdef VIMAGE
	if (!IS_DEFAULT_VNET(curvnet))
		val = curvnet->vnet_sockcnt;
	else
#endif
		val = numopensockets;
	return (sysctl_handle_int(oidp, &val, 0, req));
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, numopensockets,
    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0,
    sizeof(u_int), sysctl_numopensockets, "IU", "Number of open sockets");

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPC");

/*
 * Initialize the socket subsystem and set up the socket
 * memory allocator.
 */
static uma_zone_t socket_zone;
int	maxsockets;

static void
socket_zone_change(void *tag)
{

	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
}

static int splice_init_state;
static struct sx splice_init_lock;
SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init");

static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0,
    "Settings relating to the SO_SPLICE socket option");

static bool splice_receive_stream = true;
SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN,
    &splice_receive_stream, 0,
    "Use soreceive_stream() for stream splices");

static uma_zone_t splice_zone;
static struct proc *splice_proc;
struct splice_wq {
	struct mtx	mtx;
	STAILQ_HEAD(, so_splice) head;
	bool		running;
} __aligned(CACHE_LINE_SIZE);
static struct splice_wq *splice_wq;
static uint32_t splice_index = 0;

static void so_splice_timeout(void *arg, int pending);
static void so_splice_xfer(struct so_splice *s);
static int so_unsplice(struct socket *so, bool timeout);

static void
splice_work_thread(void *ctx)
{
	struct splice_wq *wq = ctx;
	struct so_splice *s, *s_temp;
	STAILQ_HEAD(, so_splice) local_head;
	int cpu;

	cpu = wq - splice_wq;
	if (bootverbose)
		printf("starting so_splice worker thread for CPU %d\n", cpu);

	for (;;) {
		mtx_lock(&wq->mtx);
		while (STAILQ_EMPTY(&wq->head)) {
			wq->running = false;
			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
			wq->running = true;
		}
		STAILQ_INIT(&local_head);
		STAILQ_CONCAT(&local_head, &wq->head);
		STAILQ_INIT(&wq->head);
		mtx_unlock(&wq->mtx);
		STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) {
			mtx_lock(&s->mtx);
			CURVNET_SET(s->src->so_vnet);
			so_splice_xfer(s);
			CURVNET_RESTORE();
		}
	}
}

static void
so_splice_dispatch_async(struct so_splice *sp)
{
	struct splice_wq *wq;
	bool running;

	wq = &splice_wq[sp->wq_index];
	mtx_lock(&wq->mtx);
	STAILQ_INSERT_TAIL(&wq->head, sp, next);
	running = wq->running;
	mtx_unlock(&wq->mtx);
	if (!running)
		wakeup(wq);
}

void
so_splice_dispatch(struct so_splice *sp)
{
	mtx_assert(&sp->mtx, MA_OWNED);

	if (sp->state != SPLICE_IDLE) {
		mtx_unlock(&sp->mtx);
	} else {
		sp->state = SPLICE_QUEUED;
		mtx_unlock(&sp->mtx);
		so_splice_dispatch_async(sp);
	}
}
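/*
 * Editorial sketch (an assumption about callers, not taken from this file):
 * wakeup paths that notice a spliced socket buffer are expected to hand work
 * to the worker threads via so_splice_dispatch(), entering with the splice
 * mutex held; the function always drops it:
 *
 *	mtx_lock(&sp->mtx);
 *	so_splice_dispatch(sp);		releases sp->mtx
 */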
static int
splice_zinit(void *mem, int size __unused, int flags __unused)
{
	struct so_splice *s;

	s = (struct so_splice *)mem;
	mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF);
	return (0);
}

static void
splice_zfini(void *mem, int size)
{
	struct so_splice *s;

	s = (struct so_splice *)mem;
	mtx_destroy(&s->mtx);
}

static int
splice_init(void)
{
	struct thread *td;
	int error, i, state;

	state = atomic_load_acq_int(&splice_init_state);
	if (__predict_true(state > 0))
		return (0);
	if (state < 0)
		return (ENXIO);
	sx_xlock(&splice_init_lock);
	if (splice_init_state != 0) {
		sx_xunlock(&splice_init_lock);
		return (0);
	}

	splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL,
	    NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0);

	splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP,
	    M_WAITOK | M_ZERO);

	/*
	 * Initialize the workqueues to run the splice work.  We create a
	 * work queue for each CPU.
	 */
	CPU_FOREACH(i) {
		STAILQ_INIT(&splice_wq[i].head);
		mtx_init(&splice_wq[i].mtx, "splice work queue", NULL,
		    MTX_DEF);
	}

	/* Start kthreads for each workqueue. */
	error = 0;
	CPU_FOREACH(i) {
		error = kproc_kthread_add(splice_work_thread, &splice_wq[i],
		    &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i);
		if (error) {
			printf("Can't add so_splice thread %d error %d\n",
			    i, error);
			break;
		}

		/*
		 * It's possible to create loops with SO_SPLICE; ensure that
		 * worker threads aren't able to starve the system too easily.
		 */
		thread_lock(td);
		sched_prio(td, PUSER);
		thread_unlock(td);
	}

	splice_init_state = error != 0 ? -1 : 1;
	sx_xunlock(&splice_init_lock);

	return (error);
}

/*
 * Lock a pair of sockets' I/O locks for splicing.  Avoid blocking while
 * holding one lock in order to avoid potential deadlocks in case there is
 * some other code path which acquires more than one I/O lock at a time.
 */
static void
splice_lock_pair(struct socket *so_src, struct socket *so_dst)
{
	int error;

	for (;;) {
		error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR);
		KASSERT(error == 0,
		    ("%s: failed to lock send I/O lock: %d", __func__, error));
		error = SOCK_IO_RECV_LOCK(so_src, 0);
		KASSERT(error == 0 || error == EWOULDBLOCK,
		    ("%s: failed to lock recv I/O lock: %d", __func__, error));
		if (error == 0)
			break;
		SOCK_IO_SEND_UNLOCK(so_dst);

		error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR);
		KASSERT(error == 0,
		    ("%s: failed to lock recv I/O lock: %d", __func__, error));
		error = SOCK_IO_SEND_LOCK(so_dst, 0);
		KASSERT(error == 0 || error == EWOULDBLOCK,
		    ("%s: failed to lock send I/O lock: %d", __func__, error));
		if (error == 0)
			break;
		SOCK_IO_RECV_UNLOCK(so_src);
	}
}

static void
splice_unlock_pair(struct socket *so_src, struct socket *so_dst)
{
	SOCK_IO_RECV_UNLOCK(so_src);
	SOCK_IO_SEND_UNLOCK(so_dst);
}
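/*
 * Editorial usage note: so_splice_xfer() below brackets the actual data
 * movement with this helper pair, roughly:
 *
 *	splice_lock_pair(so_src, so_dst);
 *	error = so_splice_xfer_data(so_src, so_dst, max, &len);
 *	...
 *	splice_unlock_pair(so_src, so_dst);
 */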
/*
 * Move data from the source to the sink.  Assumes that both of the relevant
 * socket I/O locks are held.
 */
static int
so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max,
    ssize_t *lenp)
{
	struct uio uio;
	struct mbuf *m;
	struct sockbuf *sb_src, *sb_dst;
	ssize_t len;
	long space;
	int error, flags;

	SOCK_IO_RECV_ASSERT_LOCKED(so_src);
	SOCK_IO_SEND_ASSERT_LOCKED(so_dst);

	error = 0;
	m = NULL;
	memset(&uio, 0, sizeof(uio));

	sb_src = &so_src->so_rcv;
	sb_dst = &so_dst->so_snd;

	space = sbspace(sb_dst);
	if (space < 0)
		space = 0;
	len = MIN(max, MIN(space, sbavail(sb_src)));
	if (len == 0) {
		SOCK_RECVBUF_LOCK(so_src);
		if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0)
			error = EPIPE;
		SOCK_RECVBUF_UNLOCK(so_src);
	} else {
		flags = MSG_DONTWAIT;
		uio.uio_resid = len;
		if (splice_receive_stream && sb_src->sb_tls_info == NULL) {
			error = soreceive_stream_locked(so_src, sb_src, NULL,
			    &uio, &m, NULL, flags);
		} else {
			error = soreceive_generic_locked(so_src, NULL,
			    &uio, &m, NULL, &flags);
		}
		if (error != 0 && m != NULL) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m != NULL) {
		len -= uio.uio_resid;
		error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL,
		    MSG_DONTWAIT, curthread);
	} else if (error == 0) {
		len = 0;
		SOCK_SENDBUF_LOCK(so_dst);
		if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0)
			error = EPIPE;
		SOCK_SENDBUF_UNLOCK(so_dst);
	}
	if (error == 0)
		*lenp = len;
	return (error);
}
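/*
 * Editorial worked example: the transfer length above is bounded by the
 * remaining splice byte budget, the sink's free send space, and the data
 * buffered at the source.  If the splice may still move 4k, the sink has
 * 8k of send space free, and the source holds 32k, then
 * len = MIN(4k, MIN(8k, 32k)) = 4k.
 */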
We 665 * may end up queuing needless work, but that's ok, and 666 * if we race with a thread inserting more data into the 667 * buffer and observe sbavail() == 0, the splice mutex 668 * ensures that splice_push() will queue more work for 669 * us. 670 */ 671 if (sbavail(&so_src->so_rcv) > 0 && 672 sbspace(&so_dst->so_snd) > 0) { 673 sp->state = SPLICE_QUEUED; 674 mtx_unlock(&sp->mtx); 675 so_splice_dispatch_async(sp); 676 } else { 677 sp->state = SPLICE_IDLE; 678 mtx_unlock(&sp->mtx); 679 } 680 } 681 break; 682 default: 683 __assert_unreachable(); 684 } 685 } 686 687 static void 688 socket_init(void *tag) 689 { 690 691 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 692 NULL, NULL, UMA_ALIGN_PTR, 0); 693 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 694 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 695 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 696 EVENTHANDLER_PRI_FIRST); 697 } 698 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 699 700 #ifdef SOCKET_HHOOK 701 static void 702 socket_hhook_register(int subtype) 703 { 704 705 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 706 &V_socket_hhh[subtype], 707 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 708 printf("%s: WARNING: unable to register hook\n", __func__); 709 } 710 711 static void 712 socket_hhook_deregister(int subtype) 713 { 714 715 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 716 printf("%s: WARNING: unable to deregister hook\n", __func__); 717 } 718 719 static void 720 socket_vnet_init(const void *unused __unused) 721 { 722 int i; 723 724 /* We expect a contiguous range */ 725 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 726 socket_hhook_register(i); 727 } 728 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 729 socket_vnet_init, NULL); 730 731 static void 732 socket_vnet_uninit(const void *unused __unused) 733 { 734 int i; 735 736 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 737 socket_hhook_deregister(i); 738 } 739 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 740 socket_vnet_uninit, NULL); 741 #endif /* SOCKET_HHOOK */ 742 743 /* 744 * Initialise maxsockets. This SYSINIT must be run after 745 * tunable_mbinit(). 746 */ 747 static void 748 init_maxsockets(void *ignored) 749 { 750 751 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 752 maxsockets = imax(maxsockets, maxfiles); 753 } 754 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 755 756 /* 757 * Sysctl to get and set the maximum global sockets limit. Notify protocols 758 * of the change so that they can update their dependent limits as required. 759 */ 760 static int 761 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 762 { 763 int error, newmaxsockets; 764 765 newmaxsockets = maxsockets; 766 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 767 if (error == 0 && req->newptr && newmaxsockets != maxsockets) { 768 if (newmaxsockets > maxsockets && 769 newmaxsockets <= maxfiles) { 770 maxsockets = newmaxsockets; 771 EVENTHANDLER_INVOKE(maxsockets_change); 772 } else 773 error = EINVAL; 774 } 775 return (error); 776 } 777 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, 778 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 779 &maxsockets, 0, sysctl_maxsockets, "IU", 780 "Maximum number of sockets available"); 781 782 /* 783 * Socket operation routines. 
/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}

	/*
	 * The socket locking protocol allows locking 2 sockets at a time,
	 * however, the first one must be a listening socket.  WITNESS lacks
	 * a feature to change the class of an existing lock, so we use DUPOK.
	 */
	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
	so->so_rcv.sb_sel = &so->so_rdsel;
	so->so_snd.sb_sel = &so->so_wrsel;
	sx_init(&so->so_snd_sx, "so_snd_sx");
	sx_init(&so->so_rcv_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_snd.sb_aiojobq);
	TAILQ_INIT(&so->so_rcv.sb_aiojobq);
	TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
	TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
#ifdef VIMAGE
	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet = vnet;
#endif
#ifdef SOCKET_HHOOK
	/* We shouldn't need the so_global_mtx */
	if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
		/* Do we need more comprehensive error returns? */
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	vnet->vnet_sockcnt++;
#endif
	mtx_unlock(&so_global_mtx);

	return (so);
}
/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
#ifdef MAC
	mac_socket_destroy(so);
#endif
#ifdef SOCKET_HHOOK
	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
#endif

	khelp_destroy_osd(&so->osd);
	if (SOLISTENING(so)) {
		if (so->sol_accept_filter != NULL)
			accept_filt_setopt(so, NULL);
	} else {
		if (so->so_rcv.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
		if (so->so_snd.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
		sx_destroy(&so->so_snd_sx);
		sx_destroy(&so->so_rcv_sx);
	}
	crfree(so->so_cred);
	mtx_destroy(&so->so_lock);
	uma_zfree(socket_zone, so);
}

/*
 * Shim to accommodate protocols that already do their own socket buffer
 * management (marked with PR_SOCKBUF) with protocols that do not yet.
 *
 * Attach via socket(2) is different from attach via accept(2).  In the case
 * of a normal socket(2) syscall it is pr_attach that calls soreserve(), even
 * for protocols that don't yet do PR_SOCKBUF.  In the case of an accepted
 * connection it is our shim that calls soreserve(), and the hiwat values are
 * taken from the parent socket.  SCTP's sopeeloff() hands us a non-listening
 * parent socket.
 *
 * This whole shim should go away when all major protocols fully manage their
 * socket buffers.
 */
static int
soattach(struct socket *so, int proto, struct thread *td, struct socket *head)
{
	int error;

	VNET_ASSERT(curvnet == so->so_vnet,
	    ("%s: %p != %p", __func__, curvnet, so->so_vnet));

	if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
		mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
		mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
		so->so_snd.sb_mtx = &so->so_snd_mtx;
		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
	}
	if (head == NULL || (error = soreserve(so,
	    SOLISTENING(head) ? head->sol_sbsnd_hiwat : head->so_snd.sb_hiwat,
	    SOLISTENING(head) ? head->sol_sbrcv_hiwat : head->so_rcv.sb_hiwat))
	    == 0)
		error = so->so_proto->pr_attach(so, proto, td);
	if (error != 0 && (so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
		mtx_destroy(&so->so_snd_mtx);
		mtx_destroy(&so->so_rcv_mtx);
	}

	return (error);
}
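/*
 * Editorial sketch of the socket(2) attach path described above, for a
 * hypothetical protocol "foo" that does not set PR_SOCKBUF; such a
 * pr_attach implementation reserves buffer space itself:
 *
 *	static int
 *	foo_attach(struct socket *so, int proto, struct thread *td)
 *	{
 *		int error;
 *
 *		error = soreserve(so, foo_sendspace, foo_recvspace);
 *		if (error != 0)
 *			return (error);
 *		return (foo_pcballoc(so));
 *	}
 *
 * foo_sendspace, foo_recvspace and foo_pcballoc() are placeholders.
 */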
/*
 * socreate() returns a socket with a ref count of 1 and a file descriptor
 * reference.  The socket should be closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	prp = pffindproto(dom, type, proto);
	if (prp == NULL) {
		/* No support for domain. */
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		/* No support for socket type. */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}

	MPASS(prp->pr_attach);

	if ((prp->pr_flags & PR_CAPATTACH) == 0) {
		if (CAP_TRACING(td))
			ktrcapfail(CAPFAIL_PROTO, &proto);
		if (IN_CAPABILITY_MODE(td))
			return (ECAPMODE);
	}

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	CURVNET_SET(so->so_vnet);
	error = soattach(so, proto, td, NULL);
	CURVNET_RESTORE();
	if (error) {
		sodealloc(so);
		return (error);
	}
	soref(so);
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

static int sooverprio = LOG_DEBUG;
SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW,
    &sooverprio, 0,
    "Log priority for listen socket overflows: 0..7 or -1 to disable");

static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");

/*
 * When an attempt at a new connection is noted on a socket which supports
 * accept(2), the protocol has two options:
 * 1) Call the legacy sonewconn() function, which calls the protocol attach
 *    method, the same one used for socket(2).
 * 2) Call solisten_clone(), do the attach that is specific to a cloned
 *    connection, and then call solisten_enqueue().
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
solisten_clone(struct socket *head)
{
	struct sbuf descrsb;
	struct socket *so;
	int len, overcount;
	u_int qlen;
	const char localprefix[] = "local:";
	char descrbuf[SUNPATHLEN + sizeof(localprefix)];
#if defined(INET6)
	char addrbuf[INET6_ADDRSTRLEN];
#elif defined(INET)
	char addrbuf[INET_ADDRSTRLEN];
#endif
	bool dolog, over;

	SOLISTEN_LOCK(head);
	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over) {
#else
	if (over) {
#endif
		head->sol_overcount++;
		dolog = (sooverprio >= 0) &&
		    !!ratecheck(&head->sol_lastover, &overinterval);

		/*
		 * If we're going to log, copy the overflow count and queue
		 * length from the listen socket before dropping the lock.
		 * Also, reset the overflow count.
		 */
		if (dolog) {
			overcount = head->sol_overcount;
			head->sol_overcount = 0;
			qlen = head->sol_qlen;
		}
		SOLISTEN_UNLOCK(head);

		if (dolog) {
			/*
			 * Try to print something descriptive about the
			 * socket for the error message.
			 */
			sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
			    SBUF_FIXEDLEN);
			switch (head->so_proto->pr_domain->dom_family) {
#if defined(INET) || defined(INET6)
#ifdef INET
			case AF_INET:
#endif
#ifdef INET6
			case AF_INET6:
				if (head->so_proto->pr_domain->dom_family ==
				    AF_INET6 ||
				    (sotoinpcb(head)->inp_inc.inc_flags &
				    INC_ISIPV6)) {
					ip6_sprintf(addrbuf,
					    &sotoinpcb(head)->inp_inc.inc6_laddr);
					sbuf_printf(&descrsb, "[%s]", addrbuf);
				} else
#endif
				{
#ifdef INET
					inet_ntoa_r(
					    sotoinpcb(head)->inp_inc.inc_laddr,
					    addrbuf);
					sbuf_cat(&descrsb, addrbuf);
#endif
				}
				sbuf_printf(&descrsb, ":%hu (proto %u)",
				    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
				    head->so_proto->pr_protocol);
				break;
#endif /* INET || INET6 */
			case AF_UNIX:
				sbuf_cat(&descrsb, localprefix);
				if (sotounpcb(head)->unp_addr != NULL)
					len =
					    sotounpcb(head)->unp_addr->sun_len -
					    offsetof(struct sockaddr_un,
					    sun_path);
				else
					len = 0;
				if (len > 0)
					sbuf_bcat(&descrsb,
					    sotounpcb(head)->unp_addr->sun_path,
					    len);
				else
					sbuf_cat(&descrsb, "(unknown)");
				break;
			}

			/*
			 * If we can't print something more specific, at least
			 * print the domain name.
			 */
			if (sbuf_finish(&descrsb) != 0 ||
			    sbuf_len(&descrsb) <= 0) {
				sbuf_clear(&descrsb);
				sbuf_cat(&descrsb,
				    head->so_proto->pr_domain->dom_name ?:
				    "unknown");
				sbuf_finish(&descrsb);
			}
			KASSERT(sbuf_len(&descrsb) > 0,
			    ("%s: sbuf creation failed", __func__));
			/*
			 * Preserve the historic listen queue overflow log
			 * message, that starts with "sonewconn:".  It has
			 * been known to sysadmins for years and also test
			 * sys/kern/sonewconn_overflow checks for it.
			 */
			if (head->so_cred == 0) {
				log(LOG_PRI(sooverprio),
				    "sonewconn: pcb %p (%s): "
				    "Listen queue overflow: %i already in "
				    "queue awaiting acceptance (%d "
				    "occurrences)\n", head->so_pcb,
				    sbuf_data(&descrsb),
				    qlen, overcount);
			} else {
				log(LOG_PRI(sooverprio),
				    "sonewconn: pcb %p (%s): "
				    "Listen queue overflow: "
				    "%i already in queue awaiting acceptance "
				    "(%d occurrences), euid %d, rgid %d, jail %s\n",
				    head->so_pcb, sbuf_data(&descrsb), qlen,
				    overcount, head->so_cred->cr_uid,
				    head->so_cred->cr_rgid,
				    head->so_cred->cr_prison ?
				    head->so_cred->cr_prison->pr_name :
				    "not_jailed");
			}
			sbuf_delete(&descrsb);

			overcount = 0;
		}

		return (NULL);
	}
	SOLISTEN_UNLOCK(head);
	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
	    __func__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_listen = head;
	so->so_type = head->so_type;
	/*
	 * POSIX is ambiguous on what options an accept(2)ed socket should
	 * inherit from the listener.  The words "create a new socket" may be
	 * interpreted as not inheriting anything.
	 * Best programming practice for application developers is to not
	 * rely on such inheritance.
	 * FreeBSD has historically inherited all so_options excluding
	 * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options,
	 * including those completely irrelevant to a newborn socket.  For
	 * compatibility with older versions we will inherit a list of
	 * meaningful options.
	 * The crucial bit to inherit is SO_ACCEPTFILTER.  We need it present
	 * in the child socket for soisconnected() to promote the socket from
	 * the incomplete queue to the complete one.  It will be cleared
	 * before the child becomes available to accept(2).
	 */
	so->so_options = head->so_options & (SO_ACCEPTFILTER | SO_KEEPALIVE |
	    SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE);
	so->so_linger = head->so_linger;
	so->so_state = head->so_state;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef SOCKET_HHOOK
	if (V_socket_hhh[HHOOK_SOCKET_NEWCONN]->hhh_nhooks > 0) {
		if (hhook_run_socket(so, head, HHOOK_SOCKET_NEWCONN)) {
			sodealloc(so);
			log(LOG_DEBUG, "%s: hhook run failed\n", __func__);
			return (NULL);
		}
	}
#endif
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
	so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags = head->sol_sbsnd_flags &
	    (SB_AUTOSIZE | SB_AUTOLOWAT);

	return (so);
}

/* Connstatus may be 0 or SS_ISCONNECTED. */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;

	if ((so = solisten_clone(head)) == NULL)
		return (NULL);

	if (soattach(so, 0, NULL, head) != 0) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}

	(void)solisten_enqueue(so, connstatus);

	return (so);
}

/*
 * Enqueue the socket cloned by solisten_clone() on the listen queue of the
 * listener it has been cloned from.
 *
 * Return 'true' if the socket landed on the complete queue, otherwise
 * 'false'.
 */
bool
solisten_enqueue(struct socket *so, int connstatus)
{
	struct socket *head = so->so_listen;

	MPASS(refcount_load(&so->so_count) == 0);
	refcount_init(&so->so_count, 1);

	SOLISTEN_LOCK(head);
	if (head->sol_accept_filter != NULL)
		connstatus = 0;
	so->so_state |= connstatus;
	soref(head);	/* A socket on (in)complete queue refs head. */
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
		so->so_qstate = SQ_COMP;
		head->sol_qlen++;
		solisten_wakeup(head);	/* unlocks */
		return (true);
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->sol_incqlen > head->sol_qlimit) {
			struct socket *sp;

			sp = TAILQ_FIRST(&head->sol_incomp);
			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
			head->sol_incqlen--;
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			sorele_locked(head);	/* does SOLISTEN_UNLOCK, head stays */
			soabort(sp);
			SOLISTEN_LOCK(head);
		}
		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
		so->so_qstate = SQ_INCOMP;
		head->sol_incqlen++;
		SOLISTEN_UNLOCK(head);
		return (false);
	}
}

#if defined(SCTP) || defined(SCTP_SUPPORT)
/*
 * Socket part of sctp_peeloff().  Create a new socket for an
 * association.  The new socket is returned with a reference.
 *
 * XXXGL: reduce copy-paste with solisten_clone().
 */
struct socket *
sopeeloff(struct socket *head, struct protosw *so_proto)
{
	struct socket *so;

	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	KASSERT(head->so_type == SOCK_SEQPACKET,
	    ("%s: unexpected so_type: %d", __func__, head->so_type));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_type = SOCK_STREAM;
	so->so_options = head->so_options;
	so->so_linger = head->so_linger;
	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	if (soattach(so, 0, NULL, head)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;

	soref(so);

	return (so);
}
#endif	/* SCTP */

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_bind(so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

int
sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_bindat(fd, so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Callbacks are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_listen(so, backlog, td);
	CURVNET_RESTORE();
	return (error);
}
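/*
 * Editorial sketch (hypothetical protocol "foo", simplified) of the
 * callback contract described above: pr_listen checks and then commits the
 * socket-layer state change while holding the socket lock:
 *
 *	static int
 *	foo_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0) {
 *			error = foo_prepare_to_listen(so, td);
 *			if (error == 0)
 *				solisten_proto(so, backlog);
 *			else
 *				solisten_proto_abort(so);
 *		}
 *		SOCK_UNLOCK(so);
 *		return (error);
 *	}
 *
 * foo_prepare_to_listen() stands in for protocol-specific work.
 */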
/*
 * Prepare for a call to solisten_proto().  Acquire all socket buffer locks
 * in order to interlock with socket I/O.
 */
int
solisten_proto_check(struct socket *so)
{
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) != 0)
		return (EINVAL);

	/*
	 * Sleeping is not permitted here, so simply fail if userspace is
	 * attempting to transmit or receive on the socket.  This kind of
	 * transient failure is not ideal, but it should occur only if
	 * userspace is misusing the socket interfaces.
	 */
	if (!sx_try_xlock(&so->so_snd_sx))
		return (EAGAIN);
	if (!sx_try_xlock(&so->so_rcv_sx)) {
		sx_xunlock(&so->so_snd_sx);
		return (EAGAIN);
	}
	mtx_lock(&so->so_snd_mtx);
	mtx_lock(&so->so_rcv_mtx);

	/* Interlock with soo_aio_queue() and KTLS. */
	if (!SOLISTENING(so)) {
		bool ktls;

#ifdef KERN_TLS
		ktls = so->so_snd.sb_tls_info != NULL ||
		    so->so_rcv.sb_tls_info != NULL;
#else
		ktls = false;
#endif
		if (ktls ||
		    (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 ||
		    (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) {
			solisten_proto_abort(so);
			return (EINVAL);
		}
	}

	return (0);
}

/*
 * Undo the setup done by solisten_proto_check().
 */
void
solisten_proto_abort(struct socket *so)
{
	mtx_unlock(&so->so_snd_mtx);
	mtx_unlock(&so->so_rcv_mtx);
	sx_xunlock(&so->so_snd_sx);
	sx_xunlock(&so->so_rcv_sx);
}

void
solisten_proto(struct socket *so, int backlog)
{
	int sbrcv_lowat, sbsnd_lowat;
	u_int sbrcv_hiwat, sbsnd_hiwat;
	short sbrcv_flags, sbsnd_flags;
	sbintime_t sbrcv_timeo, sbsnd_timeo;

	SOCK_LOCK_ASSERT(so);
	KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) == 0,
	    ("%s: bad socket state %p", __func__, so));

	if (SOLISTENING(so))
		goto listening;

	/*
	 * Change this socket to listening state.
	 */
	sbrcv_lowat = so->so_rcv.sb_lowat;
	sbsnd_lowat = so->so_snd.sb_lowat;
	sbrcv_hiwat = so->so_rcv.sb_hiwat;
	sbsnd_hiwat = so->so_snd.sb_hiwat;
	sbrcv_flags = so->so_rcv.sb_flags;
	sbsnd_flags = so->so_snd.sb_flags;
	sbrcv_timeo = so->so_rcv.sb_timeo;
	sbsnd_timeo = so->so_snd.sb_timeo;

#ifdef MAC
	mac_socketpeer_label_free(so->so_peerlabel);
#endif

	if (!(so->so_proto->pr_flags & PR_SOCKBUF)) {
		sbdestroy(so, SO_SND);
		sbdestroy(so, SO_RCV);
	}

#ifdef INVARIANTS
	bzero(&so->so_rcv,
	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
#endif

	so->sol_sbrcv_lowat = sbrcv_lowat;
	so->sol_sbsnd_lowat = sbsnd_lowat;
	so->sol_sbrcv_hiwat = sbrcv_hiwat;
	so->sol_sbsnd_hiwat = sbsnd_hiwat;
	so->sol_sbrcv_flags = sbrcv_flags;
	so->sol_sbsnd_flags = sbsnd_flags;
	so->sol_sbrcv_timeo = sbrcv_timeo;
	so->sol_sbsnd_timeo = sbsnd_timeo;

	so->sol_qlen = so->sol_incqlen = 0;
	TAILQ_INIT(&so->sol_incomp);
	TAILQ_INIT(&so->sol_comp);

	so->sol_accept_filter = NULL;
	so->sol_accept_filter_arg = NULL;
	so->sol_accept_filter_str = NULL;

	so->sol_upcall = NULL;
	so->sol_upcallarg = NULL;

	so->so_options |= SO_ACCEPTCONN;

listening:
	if (backlog < 0 || backlog > V_somaxconn)
		backlog = V_somaxconn;
	so->sol_qlimit = backlog;

	mtx_unlock(&so->so_snd_mtx);
	mtx_unlock(&so->so_rcv_mtx);
	sx_xunlock(&so->so_snd_sx);
	sx_xunlock(&so->so_rcv_sx);
}

/*
 * Wake up listeners/subsystems once we have a complete connection.
 * Enters with lock, returns unlocked.
 */
void
solisten_wakeup(struct socket *sol)
{

	if (sol->sol_upcall != NULL)
		(void)sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
	else {
		selwakeuppri(&sol->so_rdsel, PSOCK);
		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
	}
	SOLISTEN_UNLOCK(sol);
	wakeup_one(&sol->sol_comp);
	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
		pgsigio(&sol->so_sigio, SIGIO, 0);
}

/*
 * Return a single connection off a listening socket queue.  The main
 * consumer of the function is kern_accept4().  Some modules that do their
 * own accept management also use the function.  The socket reference held
 * by the listen queue is handed to the caller.
 *
 * The listening socket must be locked on entry and is returned unlocked on
 * return.
 * The flags argument is a set of accept4(2) flags and ACCEPT4_INHERIT.
 */
int
solisten_dequeue(struct socket *head, struct socket **ret, int flags)
{
	struct socket *so;
	int error;

	SOLISTEN_LOCK_ASSERT(head);

	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
	    head->so_error == 0) {
		error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			SOLISTEN_UNLOCK(head);
			return (error);
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
		error = EWOULDBLOCK;
	else
		error = 0;
	if (error) {
		SOLISTEN_UNLOCK(head);
		return (error);
	}
	so = TAILQ_FIRST(&head->sol_comp);
	SOCK_LOCK(so);
	KASSERT(so->so_qstate == SQ_COMP,
	    ("%s: so %p not SQ_COMP", __func__, so));
	head->sol_qlen--;
	so->so_qstate = SQ_NONE;
	so->so_listen = NULL;
	TAILQ_REMOVE(&head->sol_comp, so, so_list);
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	SOCK_UNLOCK(so);
	sorele_locked(head);

	*ret = so;
	return (0);
}
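/*
 * Editorial sketch, modeled on the kern_accept4() usage described above:
 *
 *	SOLISTEN_LOCK(head);
 *	error = solisten_dequeue(head, &so, flags);
 *	if (error != 0)
 *		return (error);
 *	error = soaccept(so, sa);
 *
 * On success the caller owns the reference previously held by the listen
 * queue.
 */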
static struct so_splice *
so_splice_alloc(off_t max)
{
	struct so_splice *sp;

	sp = uma_zalloc(splice_zone, M_WAITOK);
	sp->src = NULL;
	sp->dst = NULL;
	sp->max = max > 0 ? max : -1;
	do {
		sp->wq_index = atomic_fetchadd_32(&splice_index, 1) %
		    (mp_maxid + 1);
	} while (CPU_ABSENT(sp->wq_index));
	sp->state = SPLICE_INIT;
	TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout,
	    sp);
	return (sp);
}

static void
so_splice_free(struct so_splice *sp)
{
	KASSERT(sp->state == SPLICE_CLOSED,
	    ("so_splice_free: sp %p not closed", sp));
	uma_zfree(splice_zone, sp);
}

static void
so_splice_timeout(void *arg, int pending __unused)
{
	struct so_splice *sp;

	sp = arg;
	(void)so_unsplice(sp->src, true);
}

/*
 * Splice the output from so to the input of so2.
 */
static int
so_splice(struct socket *so, struct socket *so2, struct splice *splice)
{
	struct so_splice *sp;
	int error;

	if (splice->sp_max < 0)
		return (EINVAL);
	/* Handle only TCP for now; TODO: other streaming protos */
	if (so->so_proto->pr_protocol != IPPROTO_TCP ||
	    so2->so_proto->pr_protocol != IPPROTO_TCP)
		return (EPROTONOSUPPORT);
	if (so->so_vnet != so2->so_vnet)
		return (EINVAL);

	/* so_splice_xfer() assumes that we're using these implementations. */
	KASSERT(so->so_proto->pr_sosend == sosend_generic,
	    ("so_splice: sosend not sosend_generic"));
	KASSERT(so2->so_proto->pr_soreceive == soreceive_generic ||
	    so2->so_proto->pr_soreceive == soreceive_stream,
	    ("so_splice: soreceive not soreceive_generic/stream"));

	sp = so_splice_alloc(splice->sp_max);
	so->so_splice_sent = 0;
	sp->src = so;
	sp->dst = so2;

	error = 0;
	SOCK_LOCK(so);
	if (SOLISTENING(so))
		error = EINVAL;
	else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0)
		error = ENOTCONN;
	else if (so->so_splice != NULL)
		error = EBUSY;
	if (error != 0) {
		SOCK_UNLOCK(so);
		uma_zfree(splice_zone, sp);
		return (error);
	}
	SOCK_RECVBUF_LOCK(so);
	if (so->so_rcv.sb_tls_info != NULL) {
		SOCK_RECVBUF_UNLOCK(so);
		SOCK_UNLOCK(so);
		uma_zfree(splice_zone, sp);
		return (EINVAL);
	}
	so->so_rcv.sb_flags |= SB_SPLICED;
	so->so_splice = sp;
	soref(so);
	SOCK_RECVBUF_UNLOCK(so);
	SOCK_UNLOCK(so);

	error = 0;
	SOCK_LOCK(so2);
	if (SOLISTENING(so2))
		error = EINVAL;
	else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0)
		error = ENOTCONN;
	else if (so2->so_splice_back != NULL)
		error = EBUSY;
	if (error != 0) {
		SOCK_UNLOCK(so2);
		mtx_lock(&sp->mtx);
		sp->dst = NULL;
		sp->state = SPLICE_EXCEPTION;
		mtx_unlock(&sp->mtx);
		so_unsplice(so, false);
		return (error);
	}
	SOCK_SENDBUF_LOCK(so2);
	if (so2->so_snd.sb_tls_info != NULL) {
		SOCK_SENDBUF_UNLOCK(so2);
		SOCK_UNLOCK(so2);
		mtx_lock(&sp->mtx);
		sp->dst = NULL;
		sp->state = SPLICE_EXCEPTION;
		mtx_unlock(&sp->mtx);
		so_unsplice(so, false);
		return (EINVAL);
	}
	so2->so_snd.sb_flags |= SB_SPLICED;
	so2->so_splice_back = sp;
	soref(so2);
	mtx_lock(&sp->mtx);
	SOCK_SENDBUF_UNLOCK(so2);
	SOCK_UNLOCK(so2);

	if (splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) {
		taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout,
		    tvtosbt(splice->sp_idle), 0, C_PREL(4));
	}

	/*
	 * Transfer any data already present in the socket buffer.
	 */
	KASSERT(sp->state == SPLICE_INIT,
	    ("so_splice: splice %p state %d", sp, sp->state));
	sp->state = SPLICE_QUEUED;
	so_splice_xfer(sp);
	return (0);
}
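/*
 * Editorial userland sketch (assumes the SO_SPLICE socket option reaches
 * this function via sosetopt(); not part of this file).  Splicing fd onto
 * fd2 with no byte limit and no idle timeout might look like:
 *
 *	struct splice sp = {
 *		.sp_fd = fd2,
 *		.sp_max = 0,
 *		.sp_idle = { 0, 0 },
 *	};
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp)) == -1)
 *		err(1, "SO_SPLICE");
 */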
static int
so_unsplice(struct socket *so, bool timeout)
{
	struct socket *so2;
	struct so_splice *sp;
	bool drain, so2rele;

	/*
	 * First unset SB_SPLICED and hide the splice structure so that
	 * wakeup routines will stop enqueuing work.  This also ensures that
	 * only a single thread will proceed with the unsplice.
	 */
	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		return (EINVAL);
	}
	SOCK_RECVBUF_LOCK(so);
	if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) {
		SOCK_RECVBUF_UNLOCK(so);
		SOCK_UNLOCK(so);
		return (ENOTCONN);
	}
	sp = so->so_splice;
	mtx_lock(&sp->mtx);
	if (sp->state == SPLICE_INIT) {
		/*
		 * A splice is in the middle of being set up.
		 */
		mtx_unlock(&sp->mtx);
		SOCK_RECVBUF_UNLOCK(so);
		SOCK_UNLOCK(so);
		return (ENOTCONN);
	}
	mtx_unlock(&sp->mtx);
	so->so_rcv.sb_flags &= ~SB_SPLICED;
	so->so_splice = NULL;
	SOCK_RECVBUF_UNLOCK(so);
	SOCK_UNLOCK(so);

	so2 = sp->dst;
	if (so2 != NULL) {
		SOCK_LOCK(so2);
		KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__));
		SOCK_SENDBUF_LOCK(so2);
		KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0,
		    ("%s: so2 is not spliced", __func__));
		KASSERT(so2->so_splice_back == sp,
		    ("%s: so_splice_back != sp", __func__));
		so2->so_snd.sb_flags &= ~SB_SPLICED;
		so2rele = so2->so_splice_back != NULL;
		so2->so_splice_back = NULL;
		SOCK_SENDBUF_UNLOCK(so2);
		SOCK_UNLOCK(so2);
	}

	/*
	 * No new work is being enqueued.  The worker thread might be
	 * splicing data right now, in which case we want to wait for it to
	 * finish before proceeding.
	 */
	mtx_lock(&sp->mtx);
	switch (sp->state) {
	case SPLICE_QUEUED:
	case SPLICE_RUNNING:
		sp->state = SPLICE_CLOSING;
		while (sp->state == SPLICE_CLOSING)
			msleep(sp, &sp->mtx, PSOCK, "unsplice", 0);
		break;
	case SPLICE_INIT:
	case SPLICE_IDLE:
	case SPLICE_EXCEPTION:
		sp->state = SPLICE_CLOSED;
		break;
	default:
		__assert_unreachable();
	}
	if (!timeout) {
		drain = taskqueue_cancel_timeout(taskqueue_thread,
		    &sp->timeout, NULL) != 0;
	} else {
		drain = false;
	}
	mtx_unlock(&sp->mtx);
	if (drain)
		taskqueue_drain_timeout(taskqueue_thread, &sp->timeout);

	/*
	 * Now we hold the sole reference to the splice structure.
	 * Clean up: signal userspace and release socket references.
	 */
	sorwakeup(so);
	CURVNET_SET(so->so_vnet);
	sorele(so);
	if (so2 != NULL) {
		sowwakeup(so2);
		if (so2rele)
			sorele(so2);
	}
	CURVNET_RESTORE();
	so_splice_free(sp);
	return (0);
}

/*
 * Free the socket upon release of the very last reference.
 */
static void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;

	SOCK_LOCK_ASSERT(so);
	KASSERT(refcount_load(&so->so_count) == 0,
	    ("%s: so %p has references", __func__, so));
	KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
	    ("%s: so %p is on listen queue", __func__, so));
	KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0,
	    ("%s: so %p rcvbuf is spliced", __func__, so));
	KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0,
	    ("%s: so %p sndbuf is spliced", __func__, so));
	KASSERT(so->so_splice == NULL && so->so_splice_back == NULL,
	    ("%s: so %p has spliced data", __func__, so));

	SOCK_UNLOCK(so);

	if (so->so_dtor != NULL)
		so->so_dtor(so);

	VNET_SO_ASSERT(so);
	if (pr->pr_detach != NULL)
		pr->pr_detach(so);

	if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) {
		/*
		 * From this point on, we assume that no other references to
		 * this socket exist anywhere else in the stack.  Therefore,
		 * no locks need to be acquired or held.
1905 */ 1906 #ifdef INVARIANTS 1907 SOCK_SENDBUF_LOCK(so); 1908 SOCK_RECVBUF_LOCK(so); 1909 #endif 1910 sbdestroy(so, SO_SND); 1911 sbdestroy(so, SO_RCV); 1912 #ifdef INVARIANTS 1913 SOCK_SENDBUF_UNLOCK(so); 1914 SOCK_RECVBUF_UNLOCK(so); 1915 #endif 1916 mtx_destroy(&so->so_snd_mtx); 1917 mtx_destroy(&so->so_rcv_mtx); 1918 } 1919 seldrain(&so->so_rdsel); 1920 seldrain(&so->so_wrsel); 1921 knlist_destroy(&so->so_rdsel.si_note); 1922 knlist_destroy(&so->so_wrsel.si_note); 1923 sodealloc(so); 1924 } 1925 1926 /* 1927 * Release a reference on a socket while holding the socket lock. 1928 * Unlocks the socket lock before returning. 1929 */ 1930 void 1931 sorele_locked(struct socket *so) 1932 { 1933 SOCK_LOCK_ASSERT(so); 1934 if (refcount_release(&so->so_count)) 1935 sofree(so); 1936 else 1937 SOCK_UNLOCK(so); 1938 } 1939 1940 /* 1941 * Close a socket on last file table reference removal. Initiate disconnect 1942 * if connected. Free socket when disconnect complete. 1943 * 1944 * This function will sorele() the socket. Note that soclose() may be called 1945 * prior to the ref count reaching zero. The actual socket structure will 1946 * not be freed until the ref count reaches zero. 1947 */ 1948 int 1949 soclose(struct socket *so) 1950 { 1951 struct accept_queue lqueue; 1952 int error = 0; 1953 bool listening, last __diagused; 1954 1955 CURVNET_SET(so->so_vnet); 1956 funsetown(&so->so_sigio); 1957 if (so->so_state & SS_ISCONNECTED) { 1958 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1959 error = sodisconnect(so); 1960 if (error) { 1961 if (error == ENOTCONN) 1962 error = 0; 1963 goto drop; 1964 } 1965 } 1966 1967 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { 1968 if ((so->so_state & SS_ISDISCONNECTING) && 1969 (so->so_state & SS_NBIO)) 1970 goto drop; 1971 while (so->so_state & SS_ISCONNECTED) { 1972 error = tsleep(&so->so_timeo, 1973 PSOCK | PCATCH, "soclos", 1974 so->so_linger * hz); 1975 if (error) 1976 break; 1977 } 1978 } 1979 } 1980 1981 drop: 1982 if (so->so_proto->pr_close != NULL) 1983 so->so_proto->pr_close(so); 1984 1985 SOCK_LOCK(so); 1986 if ((listening = SOLISTENING(so))) { 1987 struct socket *sp; 1988 1989 TAILQ_INIT(&lqueue); 1990 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 1991 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 1992 1993 so->sol_qlen = so->sol_incqlen = 0; 1994 1995 TAILQ_FOREACH(sp, &lqueue, so_list) { 1996 SOCK_LOCK(sp); 1997 sp->so_qstate = SQ_NONE; 1998 sp->so_listen = NULL; 1999 SOCK_UNLOCK(sp); 2000 last = refcount_release(&so->so_count); 2001 KASSERT(!last, ("%s: released last reference for %p", 2002 __func__, so)); 2003 } 2004 } 2005 sorele_locked(so); 2006 if (listening) { 2007 struct socket *sp, *tsp; 2008 2009 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) 2010 soabort(sp); 2011 } 2012 CURVNET_RESTORE(); 2013 return (error); 2014 } 2015 2016 /* 2017 * soabort() is used to abruptly tear down a connection, such as when a 2018 * resource limit is reached (listen queue depth exceeded), or if a listen 2019 * socket is closed while there are sockets waiting to be accepted. 2020 * 2021 * This interface is tricky, because it is called on an unreferenced socket, 2022 * and must be called only by a thread that has actually removed the socket 2023 * from the listen queue it was on. Likely this thread holds the last 2024 * reference on the socket and soabort() will proceed with sofree(). But 2025 * it might be not the last, as the sockets on the listen queues are seen 2026 * from the protocol side. 
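 *
 * A minimal sketch of the expected calling sequence for a socket 'sp'
 * that the caller has already unlinked from its listen queue, mirroring
 * the teardown in soclose() above (illustrative only; queue reference
 * accounting omitted):
 *
 *	SOCK_LOCK(sp);
 *	sp->so_qstate = SQ_NONE;
 *	sp->so_listen = NULL;
 *	SOCK_UNLOCK(sp);
 *	soabort(sp);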
2027 * 2028 * This interface will call into the protocol code, so must not be called 2029 * with any socket locks held. Protocols do call it while holding their own 2030 * recursible protocol mutexes, but this is something that should be subject 2031 * to review in the future. 2032 * 2033 * Usually socket should have a single reference left, but this is not a 2034 * requirement. In the past, when we have had named references for file 2035 * descriptor and protocol, we asserted that none of them are being held. 2036 */ 2037 void 2038 soabort(struct socket *so) 2039 { 2040 2041 VNET_SO_ASSERT(so); 2042 2043 if (so->so_proto->pr_abort != NULL) 2044 so->so_proto->pr_abort(so); 2045 SOCK_LOCK(so); 2046 sorele_locked(so); 2047 } 2048 2049 int 2050 soaccept(struct socket *so, struct sockaddr *sa) 2051 { 2052 #ifdef INVARIANTS 2053 u_char len = sa->sa_len; 2054 #endif 2055 int error; 2056 2057 CURVNET_SET(so->so_vnet); 2058 error = so->so_proto->pr_accept(so, sa); 2059 KASSERT(sa->sa_len <= len, 2060 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2061 CURVNET_RESTORE(); 2062 return (error); 2063 } 2064 2065 int 2066 sopeeraddr(struct socket *so, struct sockaddr *sa) 2067 { 2068 #ifdef INVARIANTS 2069 u_char len = sa->sa_len; 2070 #endif 2071 int error; 2072 2073 CURVNET_ASSERT_SET(); 2074 2075 error = so->so_proto->pr_peeraddr(so, sa); 2076 KASSERT(sa->sa_len <= len, 2077 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2078 2079 return (error); 2080 } 2081 2082 int 2083 sosockaddr(struct socket *so, struct sockaddr *sa) 2084 { 2085 #ifdef INVARIANTS 2086 u_char len = sa->sa_len; 2087 #endif 2088 int error; 2089 2090 CURVNET_SET(so->so_vnet); 2091 error = so->so_proto->pr_sockaddr(so, sa); 2092 KASSERT(sa->sa_len <= len, 2093 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2094 CURVNET_RESTORE(); 2095 2096 return (error); 2097 } 2098 2099 int 2100 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 2101 { 2102 2103 return (soconnectat(AT_FDCWD, so, nam, td)); 2104 } 2105 2106 int 2107 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 2108 { 2109 int error; 2110 2111 CURVNET_SET(so->so_vnet); 2112 2113 /* 2114 * If protocol is connection-based, can only connect once. 2115 * Otherwise, if connected, try to disconnect first. This allows 2116 * user to disconnect by connecting to, e.g., a null address. 2117 * 2118 * Note, this check is racy and may need to be re-evaluated at the 2119 * protocol layer. 2120 */ 2121 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 2122 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 2123 (error = sodisconnect(so)))) { 2124 error = EISCONN; 2125 } else { 2126 /* 2127 * Prevent accumulated error from previous connection from 2128 * biting us. 
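		 * For example, a datagram socket that absorbed an
		 * asynchronous ECONNREFUSED from a previous destination
		 * would otherwise fail this new connection attempt with the
		 * stale error.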
2129 */ 2130 so->so_error = 0; 2131 if (fd == AT_FDCWD) { 2132 error = so->so_proto->pr_connect(so, nam, td); 2133 } else { 2134 error = so->so_proto->pr_connectat(fd, so, nam, td); 2135 } 2136 } 2137 CURVNET_RESTORE(); 2138 2139 return (error); 2140 } 2141 2142 int 2143 soconnect2(struct socket *so1, struct socket *so2) 2144 { 2145 int error; 2146 2147 CURVNET_SET(so1->so_vnet); 2148 error = so1->so_proto->pr_connect2(so1, so2); 2149 CURVNET_RESTORE(); 2150 return (error); 2151 } 2152 2153 int 2154 sodisconnect(struct socket *so) 2155 { 2156 int error; 2157 2158 if ((so->so_state & SS_ISCONNECTED) == 0) 2159 return (ENOTCONN); 2160 if (so->so_state & SS_ISDISCONNECTING) 2161 return (EALREADY); 2162 VNET_SO_ASSERT(so); 2163 error = so->so_proto->pr_disconnect(so); 2164 return (error); 2165 } 2166 2167 int 2168 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 2169 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2170 { 2171 long space; 2172 ssize_t resid; 2173 int clen = 0, error, dontroute; 2174 2175 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 2176 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 2177 ("sosend_dgram: !PR_ATOMIC")); 2178 2179 if (uio != NULL) 2180 resid = uio->uio_resid; 2181 else 2182 resid = top->m_pkthdr.len; 2183 /* 2184 * In theory resid should be unsigned. However, space must be 2185 * signed, as it might be less than 0 if we over-committed, and we 2186 * must use a signed comparison of space and resid. On the other 2187 * hand, a negative resid causes us to loop sending 0-length 2188 * segments to the protocol. 2189 */ 2190 if (resid < 0) { 2191 error = EINVAL; 2192 goto out; 2193 } 2194 2195 dontroute = 2196 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 2197 if (td != NULL) 2198 td->td_ru.ru_msgsnd++; 2199 if (control != NULL) 2200 clen = control->m_len; 2201 2202 SOCKBUF_LOCK(&so->so_snd); 2203 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2204 SOCKBUF_UNLOCK(&so->so_snd); 2205 error = EPIPE; 2206 goto out; 2207 } 2208 if (so->so_error) { 2209 error = so->so_error; 2210 so->so_error = 0; 2211 SOCKBUF_UNLOCK(&so->so_snd); 2212 goto out; 2213 } 2214 if ((so->so_state & SS_ISCONNECTED) == 0) { 2215 /* 2216 * `sendto' and `sendmsg' is allowed on a connection-based 2217 * socket if it supports implied connect. Return ENOTCONN if 2218 * not connected and no address is supplied. 2219 */ 2220 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2221 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2222 if (!(resid == 0 && clen != 0)) { 2223 SOCKBUF_UNLOCK(&so->so_snd); 2224 error = ENOTCONN; 2225 goto out; 2226 } 2227 } else if (addr == NULL) { 2228 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2229 error = ENOTCONN; 2230 else 2231 error = EDESTADDRREQ; 2232 SOCKBUF_UNLOCK(&so->so_snd); 2233 goto out; 2234 } 2235 } 2236 2237 /* 2238 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 2239 * problem and need fixing. 2240 */ 2241 space = sbspace(&so->so_snd); 2242 if (flags & MSG_OOB) 2243 space += 1024; 2244 space -= clen; 2245 SOCKBUF_UNLOCK(&so->so_snd); 2246 if (resid > space) { 2247 error = EMSGSIZE; 2248 goto out; 2249 } 2250 if (uio == NULL) { 2251 resid = 0; 2252 if (flags & MSG_EOR) 2253 top->m_flags |= M_EOR; 2254 } else { 2255 /* 2256 * Copy the data from userland into a mbuf chain. 2257 * If no data is to be copied in, a single empty mbuf 2258 * is returned. 2259 */ 2260 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 2261 (M_PKTHDR | ((flags & MSG_EOR) ? 
M_EOR : 0))); 2262 if (top == NULL) { 2263 error = EFAULT; /* only possible error */ 2264 goto out; 2265 } 2266 space -= resid - uio->uio_resid; 2267 resid = uio->uio_resid; 2268 } 2269 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 2270 /* 2271 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 2272 * than with. 2273 */ 2274 if (dontroute) { 2275 SOCK_LOCK(so); 2276 so->so_options |= SO_DONTROUTE; 2277 SOCK_UNLOCK(so); 2278 } 2279 /* 2280 * XXX all the SBS_CANTSENDMORE checks previously done could be out 2281 * of date. We could have received a reset packet in an interrupt or 2282 * maybe we slept while doing page faults in uiomove() etc. We could 2283 * probably recheck again inside the locking protection here, but 2284 * there are probably other places that this also happens. We must 2285 * rethink this. 2286 */ 2287 VNET_SO_ASSERT(so); 2288 error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : 2289 /* 2290 * If the user set MSG_EOF, the protocol understands this flag and 2291 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 2292 */ 2293 ((flags & MSG_EOF) && 2294 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2295 (resid <= 0)) ? 2296 PRUS_EOF : 2297 /* If there is more to send set PRUS_MORETOCOME */ 2298 (flags & MSG_MORETOCOME) || 2299 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 2300 top, addr, control, td); 2301 if (dontroute) { 2302 SOCK_LOCK(so); 2303 so->so_options &= ~SO_DONTROUTE; 2304 SOCK_UNLOCK(so); 2305 } 2306 clen = 0; 2307 control = NULL; 2308 top = NULL; 2309 out: 2310 if (top != NULL) 2311 m_freem(top); 2312 if (control != NULL) 2313 m_freem(control); 2314 return (error); 2315 } 2316 2317 /* 2318 * Send on a socket. If send must go all at once and message is larger than 2319 * send buffering, then hard error. Lock against other senders. If must go 2320 * all at once and not enough room now, then inform user that this would 2321 * block and do nothing. Otherwise, if nonblocking, send as much as 2322 * possible. The data to be sent is described by "uio" if nonzero, otherwise 2323 * by the mbuf chain "top" (which must be null if uio is not). Data provided 2324 * in mbuf chain must be small enough to send all at once. 2325 * 2326 * Returns nonzero on error, timeout or signal; callers must check for short 2327 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 2328 * on return. 2329 */ 2330 static int 2331 sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, 2332 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2333 { 2334 long space; 2335 ssize_t resid; 2336 int clen = 0, error, dontroute; 2337 int atomic = sosendallatonce(so) || top; 2338 int pr_send_flag; 2339 #ifdef KERN_TLS 2340 struct ktls_session *tls; 2341 int tls_enq_cnt, tls_send_flag; 2342 uint8_t tls_rtype; 2343 2344 tls = NULL; 2345 tls_rtype = TLS_RLTYPE_APP; 2346 #endif 2347 2348 SOCK_IO_SEND_ASSERT_LOCKED(so); 2349 2350 if (uio != NULL) 2351 resid = uio->uio_resid; 2352 else if ((top->m_flags & M_PKTHDR) != 0) 2353 resid = top->m_pkthdr.len; 2354 else 2355 resid = m_length(top, NULL); 2356 /* 2357 * In theory resid should be unsigned. However, space must be 2358 * signed, as it might be less than 0 if we over-committed, and we 2359 * must use a signed comparison of space and resid. On the other 2360 * hand, a negative resid causes us to loop sending 0-length 2361 * segments to the protocol. 
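	 * For example, if sb_hiwat is 32k and an over-commit has left more
	 * than that queued, sbspace() yields a negative value; an unsigned
	 * comparison against resid would wrongly conclude that space is
	 * available.  (Illustrative numbers only.)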
2362 * 2363 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 2364 * type sockets since that's an error. 2365 */ 2366 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 2367 error = EINVAL; 2368 goto out; 2369 } 2370 2371 dontroute = 2372 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 2373 (so->so_proto->pr_flags & PR_ATOMIC); 2374 if (td != NULL) 2375 td->td_ru.ru_msgsnd++; 2376 if (control != NULL) 2377 clen = control->m_len; 2378 2379 #ifdef KERN_TLS 2380 tls_send_flag = 0; 2381 tls = ktls_hold(so->so_snd.sb_tls_info); 2382 if (tls != NULL) { 2383 if (tls->mode == TCP_TLS_MODE_SW) 2384 tls_send_flag = PRUS_NOTREADY; 2385 2386 if (control != NULL) { 2387 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 2388 2389 if (clen >= sizeof(*cm) && 2390 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 2391 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 2392 clen = 0; 2393 m_freem(control); 2394 control = NULL; 2395 atomic = 1; 2396 } 2397 } 2398 2399 if (resid == 0 && !ktls_permit_empty_frames(tls)) { 2400 error = EINVAL; 2401 goto out; 2402 } 2403 } 2404 #endif 2405 2406 restart: 2407 do { 2408 SOCKBUF_LOCK(&so->so_snd); 2409 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2410 SOCKBUF_UNLOCK(&so->so_snd); 2411 error = EPIPE; 2412 goto out; 2413 } 2414 if (so->so_error) { 2415 error = so->so_error; 2416 so->so_error = 0; 2417 SOCKBUF_UNLOCK(&so->so_snd); 2418 goto out; 2419 } 2420 if ((so->so_state & SS_ISCONNECTED) == 0) { 2421 /* 2422 * `sendto' and `sendmsg' is allowed on a connection- 2423 * based socket if it supports implied connect. 2424 * Return ENOTCONN if not connected and no address is 2425 * supplied. 2426 */ 2427 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2428 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2429 if (!(resid == 0 && clen != 0)) { 2430 SOCKBUF_UNLOCK(&so->so_snd); 2431 error = ENOTCONN; 2432 goto out; 2433 } 2434 } else if (addr == NULL) { 2435 SOCKBUF_UNLOCK(&so->so_snd); 2436 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2437 error = ENOTCONN; 2438 else 2439 error = EDESTADDRREQ; 2440 goto out; 2441 } 2442 } 2443 space = sbspace(&so->so_snd); 2444 if (flags & MSG_OOB) 2445 space += 1024; 2446 if ((atomic && resid > so->so_snd.sb_hiwat) || 2447 clen > so->so_snd.sb_hiwat) { 2448 SOCKBUF_UNLOCK(&so->so_snd); 2449 error = EMSGSIZE; 2450 goto out; 2451 } 2452 if (space < resid + clen && 2453 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 2454 if ((so->so_state & SS_NBIO) || 2455 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 2456 SOCKBUF_UNLOCK(&so->so_snd); 2457 error = EWOULDBLOCK; 2458 goto out; 2459 } 2460 error = sbwait(so, SO_SND); 2461 SOCKBUF_UNLOCK(&so->so_snd); 2462 if (error) 2463 goto out; 2464 goto restart; 2465 } 2466 SOCKBUF_UNLOCK(&so->so_snd); 2467 space -= clen; 2468 do { 2469 if (uio == NULL) { 2470 resid = 0; 2471 if (flags & MSG_EOR) 2472 top->m_flags |= M_EOR; 2473 #ifdef KERN_TLS 2474 if (tls != NULL) { 2475 ktls_frame(top, tls, &tls_enq_cnt, 2476 tls_rtype); 2477 tls_rtype = TLS_RLTYPE_APP; 2478 } 2479 #endif 2480 } else { 2481 /* 2482 * Copy the data from userland into a mbuf 2483 * chain. If resid is 0, which can happen 2484 * only if we have control to send, then 2485 * a single empty mbuf is returned. This 2486 * is a workaround to prevent protocol send 2487 * methods to panic. 2488 */ 2489 #ifdef KERN_TLS 2490 if (tls != NULL) { 2491 top = m_uiotombuf(uio, M_WAITOK, space, 2492 tls->params.max_frame_len, 2493 M_EXTPG | 2494 ((flags & MSG_EOR) ? 
M_EOR : 0)); 2495 if (top != NULL) { 2496 ktls_frame(top, tls, 2497 &tls_enq_cnt, tls_rtype); 2498 } 2499 tls_rtype = TLS_RLTYPE_APP; 2500 } else 2501 #endif 2502 top = m_uiotombuf(uio, M_WAITOK, space, 2503 (atomic ? max_hdr : 0), 2504 (atomic ? M_PKTHDR : 0) | 2505 ((flags & MSG_EOR) ? M_EOR : 0)); 2506 if (top == NULL) { 2507 error = EFAULT; /* only possible error */ 2508 goto out; 2509 } 2510 space -= resid - uio->uio_resid; 2511 resid = uio->uio_resid; 2512 } 2513 if (dontroute) { 2514 SOCK_LOCK(so); 2515 so->so_options |= SO_DONTROUTE; 2516 SOCK_UNLOCK(so); 2517 } 2518 /* 2519 * XXX all the SBS_CANTSENDMORE checks previously 2520 * done could be out of date. We could have received 2521 * a reset packet in an interrupt or maybe we slept 2522 * while doing page faults in uiomove() etc. We 2523 * could probably recheck again inside the locking 2524 * protection here, but there are probably other 2525 * places that this also happens. We must rethink 2526 * this. 2527 */ 2528 VNET_SO_ASSERT(so); 2529 2530 pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : 2531 /* 2532 * If the user set MSG_EOF, the protocol understands 2533 * this flag and nothing left to send then use 2534 * PRU_SEND_EOF instead of PRU_SEND. 2535 */ 2536 ((flags & MSG_EOF) && 2537 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2538 (resid <= 0)) ? 2539 PRUS_EOF : 2540 /* If there is more to send set PRUS_MORETOCOME. */ 2541 (flags & MSG_MORETOCOME) || 2542 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; 2543 2544 #ifdef KERN_TLS 2545 pr_send_flag |= tls_send_flag; 2546 #endif 2547 2548 error = so->so_proto->pr_send(so, pr_send_flag, top, 2549 addr, control, td); 2550 2551 if (dontroute) { 2552 SOCK_LOCK(so); 2553 so->so_options &= ~SO_DONTROUTE; 2554 SOCK_UNLOCK(so); 2555 } 2556 2557 #ifdef KERN_TLS 2558 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 2559 if (error != 0) { 2560 m_freem(top); 2561 top = NULL; 2562 } else { 2563 soref(so); 2564 ktls_enqueue(top, so, tls_enq_cnt); 2565 } 2566 } 2567 #endif 2568 clen = 0; 2569 control = NULL; 2570 top = NULL; 2571 if (error) 2572 goto out; 2573 } while (resid && space > 0); 2574 } while (resid); 2575 2576 out: 2577 #ifdef KERN_TLS 2578 if (tls != NULL) 2579 ktls_free(tls); 2580 #endif 2581 if (top != NULL) 2582 m_freem(top); 2583 if (control != NULL) 2584 m_freem(control); 2585 return (error); 2586 } 2587 2588 int 2589 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 2590 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2591 { 2592 int error; 2593 2594 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 2595 if (error) 2596 return (error); 2597 error = sosend_generic_locked(so, addr, uio, top, control, flags, td); 2598 SOCK_IO_SEND_UNLOCK(so); 2599 return (error); 2600 } 2601 2602 /* 2603 * Send to a socket from a kernel thread. 2604 * 2605 * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. 2606 * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs 2607 * to be set at all. This function should just boil down to a static inline 2608 * calling the protocol method. 2609 */ 2610 int 2611 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2612 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2613 { 2614 int error; 2615 2616 CURVNET_SET(so->so_vnet); 2617 error = so->so_proto->pr_sosend(so, addr, uio, 2618 top, control, flags, td); 2619 CURVNET_RESTORE(); 2620 return (error); 2621 } 2622 2623 /* 2624 * send(2), write(2) or aio_write(2) on a socket. 
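 *
 * Unlike sosend(), this wrapper implements the userspace-visible error
 * semantics: transient errors are suppressed for stream protocols that
 * made partial progress (except for aio(4) jobs, which handle
 * EWOULDBLOCK themselves), and SIGPIPE delivery honors SO_NOSIGPIPE and
 * MSG_NOSIGNAL.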
2625 */ 2626 int 2627 sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2628 struct mbuf *control, int flags, struct proc *userproc) 2629 { 2630 struct thread *td; 2631 ssize_t len; 2632 int error; 2633 2634 td = uio->uio_td; 2635 len = uio->uio_resid; 2636 CURVNET_SET(so->so_vnet); 2637 error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, 2638 td); 2639 CURVNET_RESTORE(); 2640 if (error != 0) { 2641 /* 2642 * Clear transient errors for stream protocols if they made 2643 * some progress. Make exclusion for aio(4) that would 2644 * schedule a new write in case of EWOULDBLOCK and clear 2645 * error itself. See soaio_process_job(). 2646 */ 2647 if (uio->uio_resid != len && 2648 (so->so_proto->pr_flags & PR_ATOMIC) == 0 && 2649 userproc == NULL && 2650 (error == ERESTART || error == EINTR || 2651 error == EWOULDBLOCK)) 2652 error = 0; 2653 /* Generation of SIGPIPE can be controlled per socket. */ 2654 if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && 2655 (flags & MSG_NOSIGNAL) == 0) { 2656 if (userproc != NULL) { 2657 /* aio(4) job */ 2658 PROC_LOCK(userproc); 2659 kern_psignal(userproc, SIGPIPE); 2660 PROC_UNLOCK(userproc); 2661 } else { 2662 PROC_LOCK(td->td_proc); 2663 tdsignal(td, SIGPIPE); 2664 PROC_UNLOCK(td->td_proc); 2665 } 2666 } 2667 } 2668 return (error); 2669 } 2670 2671 /* 2672 * The part of soreceive() that implements reading non-inline out-of-band 2673 * data from a socket. For more complete comments, see soreceive(), from 2674 * which this code originated. 2675 * 2676 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 2677 * unable to return an mbuf chain to the caller. 2678 */ 2679 static int 2680 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 2681 { 2682 struct protosw *pr = so->so_proto; 2683 struct mbuf *m; 2684 int error; 2685 2686 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 2687 VNET_SO_ASSERT(so); 2688 2689 m = m_get(M_WAITOK, MT_DATA); 2690 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 2691 if (error) 2692 goto bad; 2693 do { 2694 error = uiomove(mtod(m, void *), 2695 (int) min(uio->uio_resid, m->m_len), uio); 2696 m = m_free(m); 2697 } while (uio->uio_resid && error == 0 && m); 2698 bad: 2699 if (m != NULL) 2700 m_freem(m); 2701 return (error); 2702 } 2703 2704 /* 2705 * Following replacement or removal of the first mbuf on the first mbuf chain 2706 * of a socket buffer, push necessary state changes back into the socket 2707 * buffer so that other consumers see the values consistently. 'nextrecord' 2708 * is the callers locally stored value of the original value of 2709 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 2710 * NOTE: 'nextrecord' may be NULL. 2711 */ 2712 static __inline void 2713 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 2714 { 2715 2716 SOCKBUF_LOCK_ASSERT(sb); 2717 /* 2718 * First, update for the new value of nextrecord. If necessary, make 2719 * it the first record. 2720 */ 2721 if (sb->sb_mb != NULL) 2722 sb->sb_mb->m_nextpkt = nextrecord; 2723 else 2724 sb->sb_mb = nextrecord; 2725 2726 /* 2727 * Now update any dependent socket buffer fields to reflect the new 2728 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 2729 * addition of a second clause that takes care of the case where 2730 * sb_mb has been updated, but remains the last record. 
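	 *
	 * Illustration: with records A -> B queued and A's first mbuf
	 * consumed, the caller saved nextrecord = B before the removal; the
	 * assignment above relinks the remainder of A to B (or makes B the
	 * first record if A emptied), and the fixup below refreshes
	 * sb_mbtail and sb_lastrecord to match.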
2731 */ 2732 if (sb->sb_mb == NULL) { 2733 sb->sb_mbtail = NULL; 2734 sb->sb_lastrecord = NULL; 2735 } else if (sb->sb_mb->m_nextpkt == NULL) 2736 sb->sb_lastrecord = sb->sb_mb; 2737 } 2738 2739 /* 2740 * Implement receive operations on a socket. We depend on the way that 2741 * records are added to the sockbuf by sbappend. In particular, each record 2742 * (mbufs linked through m_next) must begin with an address if the protocol 2743 * so specifies, followed by an optional mbuf or mbufs containing ancillary 2744 * data, and then zero or more mbufs of data. In order to allow parallelism 2745 * between network receive and copying to user space, as well as avoid 2746 * sleeping with a mutex held, we release the socket buffer mutex during the 2747 * user space copy. Although the sockbuf is locked, new data may still be 2748 * appended, and thus we must maintain consistency of the sockbuf during that 2749 * time. 2750 * 2751 * The caller may receive the data as a single mbuf chain by supplying an 2752 * mbuf **mp for use in returning the chain. The uio is then used only for 2753 * the count in uio_resid. 2754 */ 2755 static int 2756 soreceive_generic_locked(struct socket *so, struct sockaddr **psa, 2757 struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) 2758 { 2759 struct mbuf *m; 2760 int flags, error, offset; 2761 ssize_t len; 2762 struct protosw *pr = so->so_proto; 2763 struct mbuf *nextrecord; 2764 int moff, type = 0; 2765 ssize_t orig_resid = uio->uio_resid; 2766 bool report_real_len = false; 2767 2768 SOCK_IO_RECV_ASSERT_LOCKED(so); 2769 2770 error = 0; 2771 if (flagsp != NULL) { 2772 report_real_len = *flagsp & MSG_TRUNC; 2773 *flagsp &= ~MSG_TRUNC; 2774 flags = *flagsp &~ MSG_EOR; 2775 } else 2776 flags = 0; 2777 2778 restart: 2779 SOCKBUF_LOCK(&so->so_rcv); 2780 m = so->so_rcv.sb_mb; 2781 /* 2782 * If we have less data than requested, block awaiting more (subject 2783 * to any timeout) if: 2784 * 1. the current count is less than the low water mark, or 2785 * 2. 
MSG_DONTWAIT is not set 2786 */ 2787 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 2788 sbavail(&so->so_rcv) < uio->uio_resid) && 2789 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 2790 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 2791 KASSERT(m != NULL || !sbavail(&so->so_rcv), 2792 ("receive: m == %p sbavail == %u", 2793 m, sbavail(&so->so_rcv))); 2794 if (so->so_error || so->so_rerror) { 2795 if (m != NULL) 2796 goto dontblock; 2797 if (so->so_error) 2798 error = so->so_error; 2799 else 2800 error = so->so_rerror; 2801 if ((flags & MSG_PEEK) == 0) { 2802 if (so->so_error) 2803 so->so_error = 0; 2804 else 2805 so->so_rerror = 0; 2806 } 2807 SOCKBUF_UNLOCK(&so->so_rcv); 2808 goto release; 2809 } 2810 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2811 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2812 if (m != NULL) 2813 goto dontblock; 2814 #ifdef KERN_TLS 2815 else if (so->so_rcv.sb_tlsdcc == 0 && 2816 so->so_rcv.sb_tlscc == 0) { 2817 #else 2818 else { 2819 #endif 2820 SOCKBUF_UNLOCK(&so->so_rcv); 2821 goto release; 2822 } 2823 } 2824 for (; m != NULL; m = m->m_next) 2825 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2826 m = so->so_rcv.sb_mb; 2827 goto dontblock; 2828 } 2829 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2830 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2831 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2832 SOCKBUF_UNLOCK(&so->so_rcv); 2833 error = ENOTCONN; 2834 goto release; 2835 } 2836 if (uio->uio_resid == 0 && !report_real_len) { 2837 SOCKBUF_UNLOCK(&so->so_rcv); 2838 goto release; 2839 } 2840 if ((so->so_state & SS_NBIO) || 2841 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2842 SOCKBUF_UNLOCK(&so->so_rcv); 2843 error = EWOULDBLOCK; 2844 goto release; 2845 } 2846 SBLASTRECORDCHK(&so->so_rcv); 2847 SBLASTMBUFCHK(&so->so_rcv); 2848 error = sbwait(so, SO_RCV); 2849 SOCKBUF_UNLOCK(&so->so_rcv); 2850 if (error) 2851 goto release; 2852 goto restart; 2853 } 2854 dontblock: 2855 /* 2856 * From this point onward, we maintain 'nextrecord' as a cache of the 2857 * pointer to the next record in the socket buffer. We must keep the 2858 * various socket buffer pointers and local stack versions of the 2859 * pointers in sync, pushing out modifications before dropping the 2860 * socket buffer mutex, and re-reading them when picking it up. 2861 * 2862 * Otherwise, we will race with the network stack appending new data 2863 * or records onto the socket buffer by using inconsistent/stale 2864 * versions of the field, possibly resulting in socket buffer 2865 * corruption. 2866 * 2867 * By holding the high-level sblock(), we prevent simultaneous 2868 * readers from pulling off the front of the socket buffer. 2869 */ 2870 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2871 if (uio->uio_td) 2872 uio->uio_td->td_ru.ru_msgrcv++; 2873 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2874 SBLASTRECORDCHK(&so->so_rcv); 2875 SBLASTMBUFCHK(&so->so_rcv); 2876 nextrecord = m->m_nextpkt; 2877 if (pr->pr_flags & PR_ADDR) { 2878 KASSERT(m->m_type == MT_SONAME, 2879 ("m->m_type == %d", m->m_type)); 2880 orig_resid = 0; 2881 if (psa != NULL) 2882 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2883 M_NOWAIT); 2884 if (flags & MSG_PEEK) { 2885 m = m->m_next; 2886 } else { 2887 sbfree(&so->so_rcv, m); 2888 so->so_rcv.sb_mb = m_free(m); 2889 m = so->so_rcv.sb_mb; 2890 sockbuf_pushsync(&so->so_rcv, nextrecord); 2891 } 2892 } 2893 2894 /* 2895 * Process one or more MT_CONTROL mbufs present before any data mbufs 2896 * in the first mbuf chain on the socket buffer. 
If MSG_PEEK, we 2897 * just copy the data; if !MSG_PEEK, we call into the protocol to 2898 * perform externalization (or freeing if controlp == NULL). 2899 */ 2900 if (m != NULL && m->m_type == MT_CONTROL) { 2901 struct mbuf *cm = NULL, *cmn; 2902 struct mbuf **cme = &cm; 2903 #ifdef KERN_TLS 2904 struct cmsghdr *cmsg; 2905 struct tls_get_record tgr; 2906 2907 /* 2908 * For MSG_TLSAPPDATA, check for an alert record. 2909 * If found, return ENXIO without removing 2910 * it from the receive queue. This allows a subsequent 2911 * call without MSG_TLSAPPDATA to receive it. 2912 * Note that, for TLS, there should only be a single 2913 * control mbuf with the TLS_GET_RECORD message in it. 2914 */ 2915 if (flags & MSG_TLSAPPDATA) { 2916 cmsg = mtod(m, struct cmsghdr *); 2917 if (cmsg->cmsg_type == TLS_GET_RECORD && 2918 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2919 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2920 if (__predict_false(tgr.tls_type == 2921 TLS_RLTYPE_ALERT)) { 2922 SOCKBUF_UNLOCK(&so->so_rcv); 2923 error = ENXIO; 2924 goto release; 2925 } 2926 } 2927 } 2928 #endif 2929 2930 do { 2931 if (flags & MSG_PEEK) { 2932 if (controlp != NULL) { 2933 *controlp = m_copym(m, 0, m->m_len, 2934 M_NOWAIT); 2935 controlp = &(*controlp)->m_next; 2936 } 2937 m = m->m_next; 2938 } else { 2939 sbfree(&so->so_rcv, m); 2940 so->so_rcv.sb_mb = m->m_next; 2941 m->m_next = NULL; 2942 *cme = m; 2943 cme = &(*cme)->m_next; 2944 m = so->so_rcv.sb_mb; 2945 } 2946 } while (m != NULL && m->m_type == MT_CONTROL); 2947 if ((flags & MSG_PEEK) == 0) 2948 sockbuf_pushsync(&so->so_rcv, nextrecord); 2949 while (cm != NULL) { 2950 cmn = cm->m_next; 2951 cm->m_next = NULL; 2952 if (controlp != NULL) 2953 *controlp = cm; 2954 else 2955 m_freem(cm); 2956 if (controlp != NULL) { 2957 while (*controlp != NULL) 2958 controlp = &(*controlp)->m_next; 2959 } 2960 cm = cmn; 2961 } 2962 if (m != NULL) 2963 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 2964 else 2965 nextrecord = so->so_rcv.sb_mb; 2966 orig_resid = 0; 2967 } 2968 if (m != NULL) { 2969 if ((flags & MSG_PEEK) == 0) { 2970 KASSERT(m->m_nextpkt == nextrecord, 2971 ("soreceive: post-control, nextrecord !sync")); 2972 if (nextrecord == NULL) { 2973 KASSERT(so->so_rcv.sb_mb == m, 2974 ("soreceive: post-control, sb_mb!=m")); 2975 KASSERT(so->so_rcv.sb_lastrecord == m, 2976 ("soreceive: post-control, lastrecord!=m")); 2977 } 2978 } 2979 type = m->m_type; 2980 if (type == MT_OOBDATA) 2981 flags |= MSG_OOB; 2982 } else { 2983 if ((flags & MSG_PEEK) == 0) { 2984 KASSERT(so->so_rcv.sb_mb == nextrecord, 2985 ("soreceive: sb_mb != nextrecord")); 2986 if (so->so_rcv.sb_mb == NULL) { 2987 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2988 ("soreceive: sb_lastercord != NULL")); 2989 } 2990 } 2991 } 2992 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2993 SBLASTRECORDCHK(&so->so_rcv); 2994 SBLASTMBUFCHK(&so->so_rcv); 2995 2996 /* 2997 * Now continue to read any data mbufs off of the head of the socket 2998 * buffer until the read request is satisfied. Note that 'type' is 2999 * used to store the type of any mbuf reads that have happened so far 3000 * such that soreceive() can stop reading if the type changes, which 3001 * causes soreceive() to return only one of regular data and inline 3002 * out-of-band data in a single socket receive operation. 3003 */ 3004 moff = 0; 3005 offset = 0; 3006 while (m != NULL && !(m->m_flags & M_NOTREADY) && uio->uio_resid > 0 && 3007 error == 0) { 3008 /* 3009 * If the type of mbuf has changed since the last mbuf 3010 * examined ('type'), end the receive operation. 
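		 * For example, if inline out-of-band data (MT_OOBDATA)
		 * follows ordinary MT_DATA mbufs, the loop stops at the
		 * boundary so that one call never mixes the two; a
		 * subsequent call returns the OOB data by itself.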
3011 */ 3012 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3013 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 3014 if (type != m->m_type) 3015 break; 3016 } else if (type == MT_OOBDATA) 3017 break; 3018 else 3019 KASSERT(m->m_type == MT_DATA, 3020 ("m->m_type == %d", m->m_type)); 3021 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 3022 len = uio->uio_resid; 3023 if (so->so_oobmark && len > so->so_oobmark - offset) 3024 len = so->so_oobmark - offset; 3025 if (len > m->m_len - moff) 3026 len = m->m_len - moff; 3027 /* 3028 * If mp is set, just pass back the mbufs. Otherwise copy 3029 * them out via the uio, then free. Sockbuf must be 3030 * consistent here (points to current mbuf, it points to next 3031 * record) when we drop priority; we must note any additions 3032 * to the sockbuf when we block interrupts again. 3033 */ 3034 if (mp == NULL) { 3035 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3036 SBLASTRECORDCHK(&so->so_rcv); 3037 SBLASTMBUFCHK(&so->so_rcv); 3038 SOCKBUF_UNLOCK(&so->so_rcv); 3039 if ((m->m_flags & M_EXTPG) != 0) 3040 error = m_unmapped_uiomove(m, moff, uio, 3041 (int)len); 3042 else 3043 error = uiomove(mtod(m, char *) + moff, 3044 (int)len, uio); 3045 SOCKBUF_LOCK(&so->so_rcv); 3046 if (error) { 3047 /* 3048 * The MT_SONAME mbuf has already been removed 3049 * from the record, so it is necessary to 3050 * remove the data mbufs, if any, to preserve 3051 * the invariant in the case of PR_ADDR that 3052 * requires MT_SONAME mbufs at the head of 3053 * each record. 3054 */ 3055 if (pr->pr_flags & PR_ATOMIC && 3056 ((flags & MSG_PEEK) == 0)) 3057 (void)sbdroprecord_locked(&so->so_rcv); 3058 SOCKBUF_UNLOCK(&so->so_rcv); 3059 goto release; 3060 } 3061 } else 3062 uio->uio_resid -= len; 3063 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3064 if (len == m->m_len - moff) { 3065 if (m->m_flags & M_EOR) 3066 flags |= MSG_EOR; 3067 if (flags & MSG_PEEK) { 3068 m = m->m_next; 3069 moff = 0; 3070 } else { 3071 nextrecord = m->m_nextpkt; 3072 sbfree(&so->so_rcv, m); 3073 if (mp != NULL) { 3074 m->m_nextpkt = NULL; 3075 *mp = m; 3076 mp = &m->m_next; 3077 so->so_rcv.sb_mb = m = m->m_next; 3078 *mp = NULL; 3079 } else { 3080 so->so_rcv.sb_mb = m_free(m); 3081 m = so->so_rcv.sb_mb; 3082 } 3083 sockbuf_pushsync(&so->so_rcv, nextrecord); 3084 SBLASTRECORDCHK(&so->so_rcv); 3085 SBLASTMBUFCHK(&so->so_rcv); 3086 } 3087 } else { 3088 if (flags & MSG_PEEK) 3089 moff += len; 3090 else { 3091 if (mp != NULL) { 3092 if (flags & MSG_DONTWAIT) { 3093 *mp = m_copym(m, 0, len, 3094 M_NOWAIT); 3095 if (*mp == NULL) { 3096 /* 3097 * m_copym() couldn't 3098 * allocate an mbuf. 3099 * Adjust uio_resid back 3100 * (it was adjusted 3101 * down by len bytes, 3102 * which we didn't end 3103 * up "copying" over). 3104 */ 3105 uio->uio_resid += len; 3106 break; 3107 } 3108 } else { 3109 SOCKBUF_UNLOCK(&so->so_rcv); 3110 *mp = m_copym(m, 0, len, 3111 M_WAITOK); 3112 SOCKBUF_LOCK(&so->so_rcv); 3113 } 3114 } 3115 sbcut_locked(&so->so_rcv, len); 3116 } 3117 } 3118 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3119 if (so->so_oobmark) { 3120 if ((flags & MSG_PEEK) == 0) { 3121 so->so_oobmark -= len; 3122 if (so->so_oobmark == 0) { 3123 so->so_rcv.sb_state |= SBS_RCVATMARK; 3124 break; 3125 } 3126 } else { 3127 offset += len; 3128 if (offset == so->so_oobmark) 3129 break; 3130 } 3131 } 3132 if (flags & MSG_EOR) 3133 break; 3134 /* 3135 * If the MSG_WAITALL flag is set (for non-atomic socket), we 3136 * must not quit until "uio->uio_resid == 0" or an error 3137 * termination. If a signal/timeout occurs, return with a 3138 * short count but without error. 
Keep sockbuf locked 3139 * against other readers. 3140 */ 3141 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 3142 !sosendallatonce(so) && nextrecord == NULL) { 3143 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3144 if (so->so_error || so->so_rerror || 3145 so->so_rcv.sb_state & SBS_CANTRCVMORE) 3146 break; 3147 /* 3148 * Notify the protocol that some data has been 3149 * drained before blocking. 3150 */ 3151 if (pr->pr_flags & PR_WANTRCVD) { 3152 SOCKBUF_UNLOCK(&so->so_rcv); 3153 VNET_SO_ASSERT(so); 3154 pr->pr_rcvd(so, flags); 3155 SOCKBUF_LOCK(&so->so_rcv); 3156 if (__predict_false(so->so_rcv.sb_mb == NULL && 3157 (so->so_error || so->so_rerror || 3158 so->so_rcv.sb_state & SBS_CANTRCVMORE))) 3159 break; 3160 } 3161 SBLASTRECORDCHK(&so->so_rcv); 3162 SBLASTMBUFCHK(&so->so_rcv); 3163 /* 3164 * We could receive some data while was notifying 3165 * the protocol. Skip blocking in this case. 3166 */ 3167 if (so->so_rcv.sb_mb == NULL) { 3168 error = sbwait(so, SO_RCV); 3169 if (error) { 3170 SOCKBUF_UNLOCK(&so->so_rcv); 3171 goto release; 3172 } 3173 } 3174 m = so->so_rcv.sb_mb; 3175 if (m != NULL) 3176 nextrecord = m->m_nextpkt; 3177 } 3178 } 3179 3180 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3181 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 3182 if (report_real_len) 3183 uio->uio_resid -= m_length(m, NULL) - moff; 3184 flags |= MSG_TRUNC; 3185 if ((flags & MSG_PEEK) == 0) 3186 (void) sbdroprecord_locked(&so->so_rcv); 3187 } 3188 if ((flags & MSG_PEEK) == 0) { 3189 if (m == NULL) { 3190 /* 3191 * First part is an inline SB_EMPTY_FIXUP(). Second 3192 * part makes sure sb_lastrecord is up-to-date if 3193 * there is still data in the socket buffer. 3194 */ 3195 so->so_rcv.sb_mb = nextrecord; 3196 if (so->so_rcv.sb_mb == NULL) { 3197 so->so_rcv.sb_mbtail = NULL; 3198 so->so_rcv.sb_lastrecord = NULL; 3199 } else if (nextrecord->m_nextpkt == NULL) 3200 so->so_rcv.sb_lastrecord = nextrecord; 3201 } 3202 SBLASTRECORDCHK(&so->so_rcv); 3203 SBLASTMBUFCHK(&so->so_rcv); 3204 /* 3205 * If soreceive() is being done from the socket callback, 3206 * then don't need to generate ACK to peer to update window, 3207 * since ACK will be generated on return to TCP. 3208 */ 3209 if (!(flags & MSG_SOCALLBCK) && 3210 (pr->pr_flags & PR_WANTRCVD)) { 3211 SOCKBUF_UNLOCK(&so->so_rcv); 3212 VNET_SO_ASSERT(so); 3213 pr->pr_rcvd(so, flags); 3214 SOCKBUF_LOCK(&so->so_rcv); 3215 } 3216 } 3217 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3218 if (orig_resid == uio->uio_resid && orig_resid && 3219 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 3220 SOCKBUF_UNLOCK(&so->so_rcv); 3221 goto restart; 3222 } 3223 SOCKBUF_UNLOCK(&so->so_rcv); 3224 3225 if (flagsp != NULL) 3226 *flagsp |= flags; 3227 release: 3228 return (error); 3229 } 3230 3231 int 3232 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 3233 struct mbuf **mp, struct mbuf **controlp, int *flagsp) 3234 { 3235 int error, flags; 3236 3237 if (psa != NULL) 3238 *psa = NULL; 3239 if (controlp != NULL) 3240 *controlp = NULL; 3241 if (flagsp != NULL) { 3242 flags = *flagsp; 3243 if ((flags & MSG_OOB) != 0) 3244 return (soreceive_rcvoob(so, uio, flags)); 3245 } else { 3246 flags = 0; 3247 } 3248 if (mp != NULL) 3249 *mp = NULL; 3250 3251 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 3252 if (error) 3253 return (error); 3254 error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp); 3255 SOCK_IO_RECV_UNLOCK(so); 3256 return (error); 3257 } 3258 3259 /* 3260 * Optimized version of soreceive() for stream (TCP) sockets. 
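 *
 * A stream socket's receive buffer holds a single record without
 * interleaved address or control mbufs (KTLS, which does queue control
 * mbufs, is diverted to soreceive_generic() below), so this path can
 * dequeue mbufs in bulk or copy with m_mbuftouio() without the
 * 'nextrecord' bookkeeping that soreceive_generic() must maintain.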
3261 */ 3262 static int 3263 soreceive_stream_locked(struct socket *so, struct sockbuf *sb, 3264 struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, 3265 struct mbuf **controlp, int flags) 3266 { 3267 int len = 0, error = 0, oresid; 3268 struct mbuf *m, *n = NULL; 3269 3270 SOCK_IO_RECV_ASSERT_LOCKED(so); 3271 3272 /* Easy one, no space to copyout anything. */ 3273 if (uio->uio_resid == 0) 3274 return (EINVAL); 3275 oresid = uio->uio_resid; 3276 3277 SOCKBUF_LOCK(sb); 3278 /* We will never ever get anything unless we are or were connected. */ 3279 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 3280 error = ENOTCONN; 3281 goto out; 3282 } 3283 3284 restart: 3285 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3286 3287 /* Abort if socket has reported problems. */ 3288 if (so->so_error) { 3289 if (sbavail(sb) > 0) 3290 goto deliver; 3291 if (oresid > uio->uio_resid) 3292 goto out; 3293 error = so->so_error; 3294 if (!(flags & MSG_PEEK)) 3295 so->so_error = 0; 3296 goto out; 3297 } 3298 3299 /* Door is closed. Deliver what is left, if any. */ 3300 if (sb->sb_state & SBS_CANTRCVMORE) { 3301 if (sbavail(sb) > 0) 3302 goto deliver; 3303 else 3304 goto out; 3305 } 3306 3307 /* Socket buffer is empty and we shall not block. */ 3308 if (sbavail(sb) == 0 && 3309 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 3310 error = EAGAIN; 3311 goto out; 3312 } 3313 3314 /* Socket buffer got some data that we shall deliver now. */ 3315 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 3316 ((so->so_state & SS_NBIO) || 3317 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 3318 sbavail(sb) >= sb->sb_lowat || 3319 sbavail(sb) >= uio->uio_resid || 3320 sbavail(sb) >= sb->sb_hiwat) ) { 3321 goto deliver; 3322 } 3323 3324 /* On MSG_WAITALL we must wait until all data or error arrives. */ 3325 if ((flags & MSG_WAITALL) && 3326 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 3327 goto deliver; 3328 3329 /* 3330 * Wait and block until (more) data comes in. 3331 * NB: Drops the sockbuf lock during wait. 3332 */ 3333 error = sbwait(so, SO_RCV); 3334 if (error) 3335 goto out; 3336 goto restart; 3337 3338 deliver: 3339 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3340 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 3341 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 3342 3343 /* Statistics. */ 3344 if (uio->uio_td) 3345 uio->uio_td->td_ru.ru_msgrcv++; 3346 3347 /* Fill uio until full or current end of socket buffer is reached. */ 3348 len = min(uio->uio_resid, sbavail(sb)); 3349 if (mp0 != NULL) { 3350 /* Dequeue as many mbufs as possible. */ 3351 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 3352 if (*mp0 == NULL) 3353 *mp0 = sb->sb_mb; 3354 else 3355 m_cat(*mp0, sb->sb_mb); 3356 for (m = sb->sb_mb; 3357 m != NULL && m->m_len <= len; 3358 m = m->m_next) { 3359 KASSERT(!(m->m_flags & M_NOTREADY), 3360 ("%s: m %p not available", __func__, m)); 3361 len -= m->m_len; 3362 uio->uio_resid -= m->m_len; 3363 sbfree(sb, m); 3364 n = m; 3365 } 3366 n->m_next = NULL; 3367 sb->sb_mb = m; 3368 sb->sb_lastrecord = sb->sb_mb; 3369 if (sb->sb_mb == NULL) 3370 SB_EMPTY_FIXUP(sb); 3371 } 3372 /* Copy the remainder. */ 3373 if (len > 0) { 3374 KASSERT(sb->sb_mb != NULL, 3375 ("%s: len > 0 && sb->sb_mb empty", __func__)); 3376 3377 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 3378 if (m == NULL) 3379 len = 0; /* Don't flush data from sockbuf. 
*/ 3380 else 3381 uio->uio_resid -= len; 3382 if (*mp0 != NULL) 3383 m_cat(*mp0, m); 3384 else 3385 *mp0 = m; 3386 if (*mp0 == NULL) { 3387 error = ENOBUFS; 3388 goto out; 3389 } 3390 } 3391 } else { 3392 /* NB: Must unlock socket buffer as uiomove may sleep. */ 3393 SOCKBUF_UNLOCK(sb); 3394 error = m_mbuftouio(uio, sb->sb_mb, len); 3395 SOCKBUF_LOCK(sb); 3396 if (error) 3397 goto out; 3398 } 3399 SBLASTRECORDCHK(sb); 3400 SBLASTMBUFCHK(sb); 3401 3402 /* 3403 * Remove the delivered data from the socket buffer unless we 3404 * were only peeking. 3405 */ 3406 if (!(flags & MSG_PEEK)) { 3407 if (len > 0) 3408 sbdrop_locked(sb, len); 3409 3410 /* Notify protocol that we drained some data. */ 3411 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 3412 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 3413 !(flags & MSG_SOCALLBCK))) { 3414 SOCKBUF_UNLOCK(sb); 3415 VNET_SO_ASSERT(so); 3416 so->so_proto->pr_rcvd(so, flags); 3417 SOCKBUF_LOCK(sb); 3418 } 3419 } 3420 3421 /* 3422 * For MSG_WAITALL we may have to loop again and wait for 3423 * more data to come in. 3424 */ 3425 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 3426 goto restart; 3427 out: 3428 SBLASTRECORDCHK(sb); 3429 SBLASTMBUFCHK(sb); 3430 SOCKBUF_UNLOCK(sb); 3431 return (error); 3432 } 3433 3434 int 3435 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 3436 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3437 { 3438 struct sockbuf *sb; 3439 int error, flags; 3440 3441 sb = &so->so_rcv; 3442 3443 /* We only do stream sockets. */ 3444 if (so->so_type != SOCK_STREAM) 3445 return (EINVAL); 3446 if (psa != NULL) 3447 *psa = NULL; 3448 if (flagsp != NULL) 3449 flags = *flagsp & ~MSG_EOR; 3450 else 3451 flags = 0; 3452 if (controlp != NULL) 3453 *controlp = NULL; 3454 if (flags & MSG_OOB) 3455 return (soreceive_rcvoob(so, uio, flags)); 3456 if (mp0 != NULL) 3457 *mp0 = NULL; 3458 3459 #ifdef KERN_TLS 3460 /* 3461 * KTLS store TLS records as records with a control message to 3462 * describe the framing. 3463 * 3464 * We check once here before acquiring locks to optimize the 3465 * common case. 3466 */ 3467 if (sb->sb_tls_info != NULL) 3468 return (soreceive_generic(so, psa, uio, mp0, controlp, 3469 flagsp)); 3470 #endif 3471 3472 /* 3473 * Prevent other threads from reading from the socket. This lock may be 3474 * dropped in order to sleep waiting for data to arrive. 3475 */ 3476 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 3477 if (error) 3478 return (error); 3479 #ifdef KERN_TLS 3480 if (__predict_false(sb->sb_tls_info != NULL)) { 3481 SOCK_IO_RECV_UNLOCK(so); 3482 return (soreceive_generic(so, psa, uio, mp0, controlp, 3483 flagsp)); 3484 } 3485 #endif 3486 error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags); 3487 SOCK_IO_RECV_UNLOCK(so); 3488 return (error); 3489 } 3490 3491 /* 3492 * Optimized version of soreceive() for simple datagram cases from userspace. 3493 * Unlike in the stream case, we're able to drop a datagram if copyout() 3494 * fails, and because we handle datagrams atomically, we don't need to use a 3495 * sleep lock to prevent I/O interlacing. 
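 *
 * Dropping a datagram when copyout() fails is acceptable because
 * message boundaries make a partial delivery useless to the
 * application; a stream, by contrast, must not lose bytes it has
 * already removed from the buffer.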
3496 */ 3497 int 3498 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 3499 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3500 { 3501 struct mbuf *m, *m2; 3502 int flags, error; 3503 ssize_t len; 3504 struct protosw *pr = so->so_proto; 3505 struct mbuf *nextrecord; 3506 3507 if (psa != NULL) 3508 *psa = NULL; 3509 if (controlp != NULL) 3510 *controlp = NULL; 3511 if (flagsp != NULL) 3512 flags = *flagsp &~ MSG_EOR; 3513 else 3514 flags = 0; 3515 3516 /* 3517 * For any complicated cases, fall back to the full 3518 * soreceive_generic(). 3519 */ 3520 if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) 3521 return (soreceive_generic(so, psa, uio, mp0, controlp, 3522 flagsp)); 3523 3524 /* 3525 * Enforce restrictions on use. 3526 */ 3527 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 3528 ("soreceive_dgram: wantrcvd")); 3529 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 3530 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 3531 ("soreceive_dgram: SBS_RCVATMARK")); 3532 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 3533 ("soreceive_dgram: P_CONNREQUIRED")); 3534 3535 /* 3536 * Loop blocking while waiting for a datagram. 3537 */ 3538 SOCKBUF_LOCK(&so->so_rcv); 3539 while ((m = so->so_rcv.sb_mb) == NULL) { 3540 KASSERT(sbavail(&so->so_rcv) == 0, 3541 ("soreceive_dgram: sb_mb NULL but sbavail %u", 3542 sbavail(&so->so_rcv))); 3543 if (so->so_error) { 3544 error = so->so_error; 3545 so->so_error = 0; 3546 SOCKBUF_UNLOCK(&so->so_rcv); 3547 return (error); 3548 } 3549 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 3550 uio->uio_resid == 0) { 3551 SOCKBUF_UNLOCK(&so->so_rcv); 3552 return (0); 3553 } 3554 if ((so->so_state & SS_NBIO) || 3555 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 3556 SOCKBUF_UNLOCK(&so->so_rcv); 3557 return (EWOULDBLOCK); 3558 } 3559 SBLASTRECORDCHK(&so->so_rcv); 3560 SBLASTMBUFCHK(&so->so_rcv); 3561 error = sbwait(so, SO_RCV); 3562 if (error) { 3563 SOCKBUF_UNLOCK(&so->so_rcv); 3564 return (error); 3565 } 3566 } 3567 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3568 3569 if (uio->uio_td) 3570 uio->uio_td->td_ru.ru_msgrcv++; 3571 SBLASTRECORDCHK(&so->so_rcv); 3572 SBLASTMBUFCHK(&so->so_rcv); 3573 nextrecord = m->m_nextpkt; 3574 if (nextrecord == NULL) { 3575 KASSERT(so->so_rcv.sb_lastrecord == m, 3576 ("soreceive_dgram: lastrecord != m")); 3577 } 3578 3579 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 3580 ("soreceive_dgram: m_nextpkt != nextrecord")); 3581 3582 /* 3583 * Pull 'm' and its chain off the front of the packet queue. 3584 */ 3585 so->so_rcv.sb_mb = NULL; 3586 sockbuf_pushsync(&so->so_rcv, nextrecord); 3587 3588 /* 3589 * Walk 'm's chain and free that many bytes from the socket buffer. 3590 */ 3591 for (m2 = m; m2 != NULL; m2 = m2->m_next) 3592 sbfree(&so->so_rcv, m2); 3593 3594 /* 3595 * Do a few last checks before we let go of the lock. 3596 */ 3597 SBLASTRECORDCHK(&so->so_rcv); 3598 SBLASTMBUFCHK(&so->so_rcv); 3599 SOCKBUF_UNLOCK(&so->so_rcv); 3600 3601 if (pr->pr_flags & PR_ADDR) { 3602 KASSERT(m->m_type == MT_SONAME, 3603 ("m->m_type == %d", m->m_type)); 3604 if (psa != NULL) 3605 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 3606 M_WAITOK); 3607 m = m_free(m); 3608 } 3609 KASSERT(m, ("%s: no data or control after soname", __func__)); 3610 3611 /* 3612 * Packet to copyout() is now in 'm' and it is disconnected from the 3613 * queue. 3614 * 3615 * Process one or more MT_CONTROL mbufs present before any data mbufs 3616 * in the first mbuf chain on the socket buffer. 
We call into the
3617	 * protocol to perform externalization (or freeing if controlp ==
3618	 * NULL).  In some cases there can be only MT_CONTROL mbufs without
3619	 * MT_DATA mbufs.
3620	 */
3621	if (m->m_type == MT_CONTROL) {
3622		struct mbuf *cm = NULL, *cmn;
3623		struct mbuf **cme = &cm;
3624
3625		do {
3626			m2 = m->m_next;
3627			m->m_next = NULL;
3628			*cme = m;
3629			cme = &(*cme)->m_next;
3630			m = m2;
3631		} while (m != NULL && m->m_type == MT_CONTROL);
3632		while (cm != NULL) {
3633			cmn = cm->m_next;
3634			cm->m_next = NULL;
3635			if (controlp != NULL)
3636				*controlp = cm;
3637			else
3638				m_freem(cm);
3639			if (controlp != NULL) {
3640				while (*controlp != NULL)
3641					controlp = &(*controlp)->m_next;
3642			}
3643			cm = cmn;
3644		}
3645	}
3646	KASSERT(m == NULL || m->m_type == MT_DATA,
3647	    ("soreceive_dgram: !data"));
3648	while (m != NULL && uio->uio_resid > 0) {
3649		len = uio->uio_resid;
3650		if (len > m->m_len)
3651			len = m->m_len;
3652		error = uiomove(mtod(m, char *), (int)len, uio);
3653		if (error) {
3654			m_freem(m);
3655			return (error);
3656		}
3657		if (len == m->m_len)
3658			m = m_free(m);
3659		else {
3660			m->m_data += len;
3661			m->m_len -= len;
3662		}
3663	}
3664	if (m != NULL) {
3665		flags |= MSG_TRUNC;
3666		m_freem(m);
3667	}
3668	if (flagsp != NULL)
3669		*flagsp |= flags;
3670	return (0);
3671 }
3672
3673 int
3674 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3675     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3676 {
3677	int error;
3678
3679	CURVNET_SET(so->so_vnet);
3680	error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp);
3681	CURVNET_RESTORE();
3682	return (error);
3683 }
3684
3685 int
3686 soshutdown(struct socket *so, enum shutdown_how how)
3687 {
3688	int error;
3689
3690	CURVNET_SET(so->so_vnet);
3691	error = so->so_proto->pr_shutdown(so, how);
3692	CURVNET_RESTORE();
3693
3694	return (error);
3695 }
3696
3697 /*
3698  * Used by several pr_shutdown implementations that use generic socket buffers.
3699  */
3700 void
3701 sorflush(struct socket *so)
3702 {
3703	int error;
3704
3705	VNET_SO_ASSERT(so);
3706
3707	/*
3708	 * Dislodge threads currently blocked in receive and wait to acquire
3709	 * a lock against other simultaneous readers before clearing the
3710	 * socket buffer.  Don't let our acquire be interrupted by a signal
3711	 * despite any existing socket disposition on interruptible waiting.
3712	 *
3713	 * The SOCK_IO_RECV_LOCK() is important here as there are some
3714	 * pr_soreceive methods that read the top of the socket buffer
3715	 * without acquiring the socket buffer mutex, assuming that the top
3716	 * of the buffer exclusively belongs to the read(2) syscall.  This
3717	 * is handy when performing MSG_PEEK.
3718	 */
3719	socantrcvmore(so);
3720
3721	error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
3722	if (error != 0) {
3723		KASSERT(SOLISTENING(so),
3724		    ("%s: soiolock(%p) failed", __func__, so));
3725		return;
3726	}
3727
3728	sbrelease(so, SO_RCV);
3729	SOCK_IO_RECV_UNLOCK(so);
3730
3731 }
3732
3733 int
3734 sosetfib(struct socket *so, int fibnum)
3735 {
3736	if (fibnum < 0 || fibnum >= rt_numfibs)
3737		return (EINVAL);
3738
3739	SOCK_LOCK(so);
3740	so->so_fibnum = fibnum;
3741	SOCK_UNLOCK(so);
3742
3743	return (0);
3744 }
3745
3746 #ifdef SOCKET_HHOOK
3747 /*
3748  * Wrapper for Socket established helper hook.
3749  * Parameters: socket, context of the hook point, hook id.
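 * Returns the status reported by the hook, or 0 if no hook consumed the
 * event.  Example invocation, as used for unknown socket options below:
 *
 *	error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT);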
3750 */ 3751 static inline int 3752 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 3753 { 3754 struct socket_hhook_data hhook_data = { 3755 .so = so, 3756 .hctx = hctx, 3757 .m = NULL, 3758 .status = 0 3759 }; 3760 3761 CURVNET_SET(so->so_vnet); 3762 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 3763 CURVNET_RESTORE(); 3764 3765 /* Ugly but needed, since hhooks return void for now */ 3766 return (hhook_data.status); 3767 } 3768 #endif 3769 3770 /* 3771 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 3772 * additional variant to handle the case where the option value needs to be 3773 * some kind of integer, but not a specific size. In addition to their use 3774 * here, these functions are also called by the protocol-level pr_ctloutput() 3775 * routines. 3776 */ 3777 int 3778 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 3779 { 3780 size_t valsize; 3781 3782 /* 3783 * If the user gives us more than we wanted, we ignore it, but if we 3784 * don't get the minimum length the caller wants, we return EINVAL. 3785 * On success, sopt->sopt_valsize is set to however much we actually 3786 * retrieved. 3787 */ 3788 if ((valsize = sopt->sopt_valsize) < minlen) 3789 return EINVAL; 3790 if (valsize > len) 3791 sopt->sopt_valsize = valsize = len; 3792 3793 if (sopt->sopt_td != NULL) 3794 return (copyin(sopt->sopt_val, buf, valsize)); 3795 3796 bcopy(sopt->sopt_val, buf, valsize); 3797 return (0); 3798 } 3799 3800 /* 3801 * Kernel version of setsockopt(2). 3802 * 3803 * XXX: optlen is size_t, not socklen_t 3804 */ 3805 int 3806 so_setsockopt(struct socket *so, int level, int optname, void *optval, 3807 size_t optlen) 3808 { 3809 struct sockopt sopt; 3810 3811 sopt.sopt_level = level; 3812 sopt.sopt_name = optname; 3813 sopt.sopt_dir = SOPT_SET; 3814 sopt.sopt_val = optval; 3815 sopt.sopt_valsize = optlen; 3816 sopt.sopt_td = NULL; 3817 return (sosetopt(so, &sopt)); 3818 } 3819 3820 int 3821 sosetopt(struct socket *so, struct sockopt *sopt) 3822 { 3823 int error, optval; 3824 struct linger l; 3825 struct timeval tv; 3826 sbintime_t val, *valp; 3827 uint32_t val32; 3828 #ifdef MAC 3829 struct mac extmac; 3830 #endif 3831 3832 CURVNET_SET(so->so_vnet); 3833 error = 0; 3834 if (sopt->sopt_level != SOL_SOCKET) { 3835 error = so->so_proto->pr_ctloutput(so, sopt); 3836 } else { 3837 switch (sopt->sopt_name) { 3838 case SO_ACCEPTFILTER: 3839 error = accept_filt_setopt(so, sopt); 3840 if (error) 3841 goto bad; 3842 break; 3843 3844 case SO_LINGER: 3845 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 3846 if (error) 3847 goto bad; 3848 if (l.l_linger < 0 || 3849 l.l_linger > USHRT_MAX || 3850 l.l_linger > (INT_MAX / hz)) { 3851 error = EDOM; 3852 goto bad; 3853 } 3854 SOCK_LOCK(so); 3855 so->so_linger = l.l_linger; 3856 if (l.l_onoff) 3857 so->so_options |= SO_LINGER; 3858 else 3859 so->so_options &= ~SO_LINGER; 3860 SOCK_UNLOCK(so); 3861 break; 3862 3863 case SO_DEBUG: 3864 case SO_KEEPALIVE: 3865 case SO_DONTROUTE: 3866 case SO_USELOOPBACK: 3867 case SO_BROADCAST: 3868 case SO_REUSEADDR: 3869 case SO_REUSEPORT: 3870 case SO_REUSEPORT_LB: 3871 case SO_OOBINLINE: 3872 case SO_TIMESTAMP: 3873 case SO_BINTIME: 3874 case SO_NOSIGPIPE: 3875 case SO_NO_DDP: 3876 case SO_NO_OFFLOAD: 3877 case SO_RERROR: 3878 error = sooptcopyin(sopt, &optval, sizeof optval, 3879 sizeof optval); 3880 if (error) 3881 goto bad; 3882 SOCK_LOCK(so); 3883 if (optval) 3884 so->so_options |= sopt->sopt_name; 3885 else 3886 so->so_options &= ~sopt->sopt_name; 3887 
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	sbintime_t val, *valp;
	uint32_t val32;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		error = so->so_proto->pr_ctloutput(so, sopt);
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_setopt(so, sopt);
			if (error)
				goto bad;
			break;

		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;
			if (l.l_linger < 0 ||
			    l.l_linger > USHRT_MAX ||
			    l.l_linger > (INT_MAX / hz)) {
				error = EDOM;
				goto bad;
			}
			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_RERROR:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SETFIB:
			error = so->so_proto->pr_ctloutput(so, sopt);
			break;

		case SO_USER_COOKIE:
			error = sooptcopyin(sopt, &val32, sizeof val32,
			    sizeof val32);
			if (error)
				goto bad;
			so->so_user_cookie = val32;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = so->so_proto->pr_setsbopt(so, sopt);
			if (error)
				goto bad;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				error = sooptcopyin(sopt, &tv32, sizeof tv32,
				    sizeof tv32);
				CP(tv32, tv, tv_sec);
				CP(tv32, tv, tv_usec);
			} else
#endif
				error = sooptcopyin(sopt, &tv, sizeof tv,
				    sizeof tv);
			if (error)
				goto bad;
			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
			    tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			if (tv.tv_sec > INT32_MAX)
				val = SBT_MAX;
			else
				val = tvtosbt(tv);
			SOCK_LOCK(so);
			valp = sopt->sopt_name == SO_SNDTIMEO ?
			    (SOLISTENING(so) ? &so->sol_sbsnd_timeo :
			    &so->so_snd.sb_timeo) :
			    (SOLISTENING(so) ? &so->sol_sbrcv_timeo :
			    &so->so_rcv.sb_timeo);
			*valp = val;
			SOCK_UNLOCK(so);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_TS_CLOCK:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
				error = EINVAL;
				goto bad;
			}
			so->so_ts_clock = optval;
			break;

		case SO_MAX_PACING_RATE:
			error = sooptcopyin(sopt, &val32, sizeof(val32),
			    sizeof(val32));
			if (error)
				goto bad;
			so->so_max_pacing_rate = val32;
			break;

		case SO_SPLICE: {
			struct splice splice;

#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct splice32 splice32;

				error = sooptcopyin(sopt, &splice32,
				    sizeof(splice32), sizeof(splice32));
				if (error == 0) {
					splice.sp_fd = splice32.sp_fd;
					splice.sp_max = splice32.sp_max;
					CP(splice32.sp_idle, splice.sp_idle,
					    tv_sec);
					CP(splice32.sp_idle, splice.sp_idle,
					    tv_usec);
				}
			} else
#endif
			{
				error = sooptcopyin(sopt, &splice,
				    sizeof(splice), sizeof(splice));
			}
			if (error)
				goto bad;
#ifdef KTRACE
			if (KTRPOINT(curthread, KTR_STRUCT))
				ktrsplice(&splice);
#endif

			error = splice_init();
			if (error != 0)
				goto bad;

			if (splice.sp_fd >= 0) {
				struct file *fp;
				struct socket *so2;

				if (!cap_rights_contains(sopt->sopt_rights,
				    &cap_recv_rights)) {
					error = ENOTCAPABLE;
					goto bad;
				}
				error = getsock(sopt->sopt_td, splice.sp_fd,
				    &cap_send_rights, &fp);
				if (error != 0)
					goto bad;
				so2 = fp->f_data;

				error = so_splice(so, so2, &splice);
				fdrop(fp, sopt->sopt_td);
			} else {
				error = so_unsplice(so, false);
			}
			break;
		}
		default:
#ifdef SOCKET_HHOOK
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
#endif
				error = ENOPROTOOPT;
			break;
		}
		if (error == 0)
			(void)so->so_proto->pr_ctloutput(so, sopt);
	}
bad:
	CURVNET_RESTORE();
	return (error);
}
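/*
 * Example (illustrative sketch): from userspace, splicing is armed by
 * passing a struct splice whose sp_fd names the sink socket; a negative
 * sp_fd unsplices, per the SO_SPLICE case above.  The descriptor names
 * and idle timeout here are hypothetical.
 *
 *	struct splice sp = { .sp_fd = sink_fd, .sp_max = 0,
 *	    .sp_idle = { .tv_sec = 30, .tv_usec = 0 } };
 *
 *	setsockopt(src_fd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 */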
/*
 * Helper routine for getsockopt.
 */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value, possibly
	 * truncated to fit in the user's buffer.  Traditional behavior is
	 * that we always tell the user precisely how much we copied, rather
	 * than something useful like the total amount we had available for
	 * her.  Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return (error);
}
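/*
 * Example (illustrative sketch): a protocol-level pr_ctloutput()
 * handler typically answers an integer-valued SOPT_GET request with
 * this helper; the option and field shown are hypothetical.
 *
 *	case SOPT_GET:
 *		optval = tp->t_hypothetical_flag;
 *		error = sooptcopyout(sopt, &optval, sizeof(optval));
 *		break;
 */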
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		error = so->so_proto->pr_ctloutput(so, sopt);
		CURVNET_RESTORE();
		return (error);
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_getopt(so, sopt);
			break;

		case SO_LINGER:
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_RERROR:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_FIB:
			SOCK_LOCK(so);
			optval = so->so_fibnum;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_DOMAIN:
			optval = so->so_proto->pr_domain->dom_family;
			goto integer;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_PROTOCOL:
			optval = so->so_proto->pr_protocol;
			goto integer;

		case SO_ERROR:
			SOCK_LOCK(so);
			if (so->so_error) {
				optval = so->so_error;
				so->so_error = 0;
			} else {
				optval = so->so_rerror;
				so->so_rerror = 0;
			}
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDBUF:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
			    so->so_snd.sb_hiwat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_RCVBUF:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
			    so->so_rcv.sb_hiwat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDLOWAT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
			    so->so_snd.sb_lowat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_RCVLOWAT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
			    so->so_rcv.sb_lowat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			SOCK_LOCK(so);
			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
			    (SOLISTENING(so) ? so->sol_sbsnd_timeo :
			    so->so_snd.sb_timeo) :
			    (SOLISTENING(so) ? so->sol_sbrcv_timeo :
			    so->so_rcv.sb_timeo));
			SOCK_UNLOCK(so);
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32, sizeof tv32);
			} else
#endif
				error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				goto bad;
			/* Don't copy out extmac, it is unchanged. */
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				goto bad;
			/* Don't copy out extmac, it is unchanged. */
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_LISTENQLIMIT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_LISTENQLEN:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_qlen : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_LISTENINCQLEN:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_TS_CLOCK:
			optval = so->so_ts_clock;
			goto integer;

		case SO_MAX_PACING_RATE:
			optval = so->so_max_pacing_rate;
			goto integer;

		case SO_SPLICE: {
			off_t n;

			/*
			 * Acquire the I/O lock to serialize with
			 * so_splice_xfer().  This is not required for
			 * correctness, but makes testing simpler: once a byte
			 * has been transmitted to the sink and observed (e.g.,
			 * by reading from the socket to which the sink is
			 * connected), a subsequent getsockopt(SO_SPLICE) will
			 * return an up-to-date value.
			 */
			error = SOCK_IO_RECV_LOCK(so, SBL_WAIT);
			if (error != 0)
				goto bad;
			SOCK_LOCK(so);
			if (SOLISTENING(so)) {
				n = 0;
			} else {
				n = so->so_splice_sent;
			}
			SOCK_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			error = sooptcopyout(sopt, &n, sizeof(n));
			break;
		}

		default:
#ifdef SOCKET_HHOOK
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
#endif
				error = ENOPROTOOPT;
			break;
		}
	}
bad:
	CURVNET_RESTORE();
	return (error);
}
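/*
 * Example (illustrative): as the SO_ERROR case above shows, reading the
 * pending error consumes it, so a second getsockopt(2) reports 0 unless
 * a new error has since been recorded.
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 */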
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
			    M_NOWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}

int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL)
		/* the chain should have been sized large enough for sopt */
		panic("ip6_sooptmcopyin");
	return (0);
}

int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* the user-supplied buffer should have been large enough */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}
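/*
 * Example (illustrative sketch): mbuf-based ctloutput consumers chain
 * the soopt_*m routines above: soopt_getm() sizes an mbuf chain to
 * sopt_valsize, soopt_mcopyin() fills it from the caller's buffer, and
 * soopt_mcopyout() returns results the other way.
 *
 *	struct mbuf *m;
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);
 */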
/*
 * sohasoutofband(): protocol notifies socket layer of the arrival of new
 * out-of-band data, which will then notify socket consumers.
 */
void
sohasoutofband(struct socket *so)
{

	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rdsel, PSOCK);
}

int
sopoll_generic(struct socket *so, int events, struct thread *td)
{
	int revents;

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		if (!(events & (POLLIN | POLLRDNORM)))
			revents = 0;
		else if (!TAILQ_EMPTY(&so->sol_comp))
			revents = events & (POLLIN | POLLRDNORM);
		else if ((events & POLLINIGNEOF) == 0 && so->so_error)
			revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
		else {
			selrecord(td, &so->so_rdsel);
			revents = 0;
		}
	} else {
		revents = 0;
		SOCK_SENDBUF_LOCK(so);
		SOCK_RECVBUF_LOCK(so);
		if (events & (POLLIN | POLLRDNORM))
			if (soreadabledata(so) && !isspliced(so))
				revents |= events & (POLLIN | POLLRDNORM);
		if (events & (POLLOUT | POLLWRNORM))
			if (sowriteable(so) && !issplicedback(so))
				revents |= events & (POLLOUT | POLLWRNORM);
		if (events & (POLLPRI | POLLRDBAND))
			if (so->so_oobmark ||
			    (so->so_rcv.sb_state & SBS_RCVATMARK))
				revents |= events & (POLLPRI | POLLRDBAND);
		if ((events & POLLINIGNEOF) == 0) {
			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
				revents |= events & (POLLIN | POLLRDNORM);
				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
					revents |= POLLHUP;
			}
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			revents |= events & POLLRDHUP;
		if (revents == 0) {
			if (events &
			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND |
			    POLLRDHUP)) {
				selrecord(td, &so->so_rdsel);
				so->so_rcv.sb_flags |= SB_SEL;
			}
			if (events & (POLLOUT | POLLWRNORM)) {
				selrecord(td, &so->so_wrsel);
				so->so_snd.sb_flags |= SB_SEL;
			}
		}
		SOCK_RECVBUF_UNLOCK(so);
		SOCK_SENDBUF_UNLOCK(so);
	}
	SOCK_UNLOCK(so);
	return (revents);
}

int
sokqfilter_generic(struct socket *so, struct knote *kn)
{
	struct sockbuf *sb;
	sb_which which;
	struct knlist *knl;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		knl = &so->so_rdsel.si_note;
		sb = &so->so_rcv;
		which = SO_RCV;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		knl = &so->so_wrsel.si_note;
		sb = &so->so_snd;
		which = SO_SND;
		break;
	case EVFILT_EMPTY:
		kn->kn_fop = &soempty_filtops;
		knl = &so->so_wrsel.si_note;
		sb = &so->so_snd;
		which = SO_SND;
		break;
	default:
		return (EINVAL);
	}

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		knlist_add(knl, kn, 1);
	} else {
		SOCK_BUF_LOCK(so, which);
		knlist_add(knl, kn, 1);
		sb->sb_flags |= SB_KNOTE;
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    (sb->sb_flags & SB_AUTOLOWAT))
			sb->sb_flags &= ~SB_AUTOLOWAT;
		SOCK_BUF_UNLOCK(so, which);
	}
	SOCK_UNLOCK(so);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	so_rdknl_lock(so);
	knlist_remove(&so->so_rdsel.si_note, kn, 1);
	if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	so_rdknl_unlock(so);
}
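/*
 * Example (illustrative): a userspace consumer arms EVFILT_READ with
 * NOTE_LOWAT so that filt_soread() below reports an event only once the
 * requested number of bytes (here 128) is buffered.
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */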
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;

	if (SOLISTENING(so)) {
		SOCK_LOCK_ASSERT(so);
		kn->kn_data = so->sol_qlen;
		if (so->so_error) {
			kn->kn_flags |= EV_EOF;
			kn->kn_fflags = so->so_error;
			return (1);
		}
		return (!TAILQ_EMPTY(&so->sol_comp));
	}

	if ((so->so_rcv.sb_flags & SB_SPLICED) != 0)
		return (0);

	SOCK_RECVBUF_LOCK_ASSERT(so);

	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error || so->so_rerror)
		return (1);

	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_data >= kn->kn_sdata)
			return (1);
	} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
		return (1);

#ifdef SOCKET_HHOOK
	/* This hook returning non-zero indicates an event, not error */
	return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
#else
	return (0);
#endif
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	so_wrknl_lock(so);
	knlist_remove(&so->so_wrsel.si_note, kn, 1);
	if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	so_wrknl_unlock(so);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;

	if (SOLISTENING(so))
		return (0);

	SOCK_SENDBUF_LOCK_ASSERT(so);
	kn->kn_data = sbspace(&so->so_snd);

#ifdef SOCKET_HHOOK
	hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
#endif

	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}

static int
filt_soempty(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;

	if (SOLISTENING(so))
		return (1);

	SOCK_SENDBUF_LOCK_ASSERT(so);
	kn->kn_data = sbused(&so->so_snd);

	if (kn->kn_data == 0)
		return (1);
	else
		return (0);
}

int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid != uid)
		return (EPERM);
	return (0);
}
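/*
 * Example (illustrative): a hypothetical caller enforcing a same-owner
 * policy would use socheckuid() as a simple gate:
 *
 *	if (socheckuid(so, td->td_ucred->cr_uid) != 0)
 *		return (EPERM);
 */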
/*
 * These functions are used by protocols to notify the socket layer (and its
 * consumers) of state changes in the sockets driven by protocol-side events.
 */

/*
 * Procedures to manipulate state flags of socket and do appropriate wakeups.
 *
 * Normal sequence from the active (originating) side is that
 * soisconnecting() is called during processing of connect() call, resulting
 * in an eventual call to soisconnected() if/when the connection is
 * established.  When the connection is torn down soisdisconnecting() is
 * called during processing of disconnect() call, and soisdisconnected() is
 * called when the connection to the peer is totally severed.  The semantics
 * of these routines are such that connectionless protocols can call
 * soisconnected() and soisdisconnected() only, bypassing the in-progress
 * calls when setting up a ``connection'' takes no time.
 *
 * From the passive side, a socket is created with two queues of sockets:
 * so_incomp for connections in progress and so_comp for connections already
 * made and awaiting user acceptance.  As a protocol is preparing incoming
 * connections, it creates a socket structure queued on so_incomp by calling
 * sonewconn().  When the connection is established, soisconnected() is
 * called, and transfers the socket structure to so_comp, making it available
 * to accept().
 *
 * If a socket is closed with sockets on either so_incomp or so_comp, these
 * sockets are dropped.
 *
 * If higher-level protocols are implemented in the kernel, the wakeups done
 * here will sometimes cause software-interrupt process scheduling.
 */
void
soisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
	SOCK_UNLOCK(so);
}
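/*
 * Example (illustrative): a connection-oriented protocol brackets its
 * handshake with these notifications, per the comment above:
 *
 *	soisconnecting(so);	from its connect/pr_connect() path
 *	...handshake completes...
 *	soisconnected(so);	from its input path
 */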
void
soisconnected(struct socket *so)
{
	bool last __diagused;

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;

	if (so->so_qstate == SQ_INCOMP) {
		struct socket *head = so->so_listen;
		int ret;

		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
		/*
		 * Promoting a socket from incomplete queue to complete, we
		 * need to go through reverse order of locking.  We first do
		 * trylock, and if that doesn't succeed, we go the hard way
		 * leaving a reference and rechecking consistency after proper
		 * locking.
		 */
		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
			soref(head);
			SOCK_UNLOCK(so);
			SOLISTEN_LOCK(head);
			SOCK_LOCK(so);
			if (__predict_false(head != so->so_listen)) {
				/*
				 * The socket went off the listen queue; we
				 * must have lost a race with close(2) on the
				 * listening socket.  The socket is about to
				 * be aborted with soabort().
				 */
				SOCK_UNLOCK(so);
				sorele_locked(head);
				return;
			}
			last = refcount_release(&head->so_count);
			KASSERT(!last, ("%s: released last reference for %p",
			    __func__, head));
		}
again:
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
			head->sol_incqlen--;
			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
			head->sol_qlen++;
			so->so_qstate = SQ_COMP;
			SOCK_UNLOCK(so);
			solisten_wakeup(head);	/* unlocks */
		} else {
			SOCK_RECVBUF_LOCK(so);
			soupcall_set(so, SO_RCV,
			    head->sol_accept_filter->accf_callback,
			    head->sol_accept_filter_arg);
			so->so_options &= ~SO_ACCEPTFILTER;
			ret = head->sol_accept_filter->accf_callback(so,
			    head->sol_accept_filter_arg, M_NOWAIT);
			if (ret == SU_ISCONNECTED) {
				soupcall_clear(so, SO_RCV);
				SOCK_RECVBUF_UNLOCK(so);
				goto again;
			}
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_UNLOCK(so);
			SOLISTEN_UNLOCK(head);
		}
		return;
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

void
soisdisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	if (!SOLISTENING(so)) {
		SOCK_RECVBUF_LOCK(so);
		socantrcvmore_locked(so);
		SOCK_SENDBUF_LOCK(so);
		socantsendmore_locked(so);
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

void
soisdisconnected(struct socket *so)
{

	SOCK_LOCK(so);

	/*
	 * There is at least one reader of so_state that does not
	 * acquire socket lock, namely soreceive_generic().  Ensure
	 * that it never sees all flags that track connection status
	 * cleared, by ordering the update with a barrier semantic of
	 * our release thread fence.
	 */
	so->so_state |= SS_ISDISCONNECTED;
	atomic_thread_fence_rel();
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);

	if (!SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		SOCK_RECVBUF_LOCK(so);
		socantrcvmore_locked(so);
		SOCK_SENDBUF_LOCK(so);
		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
		socantsendmore_locked(so);
	} else
		SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

int
soiolock(struct socket *so, struct sx *sx, int flags)
{
	int error;

	KASSERT((flags & SBL_VALID) == flags,
	    ("soiolock: invalid flags %#x", flags));

	if ((flags & SBL_WAIT) != 0) {
		if ((flags & SBL_NOINTR) != 0) {
			sx_xlock(sx);
		} else {
			error = sx_xlock_sig(sx);
			if (error != 0)
				return (error);
		}
	} else if (!sx_try_xlock(sx)) {
		return (EWOULDBLOCK);
	}

	if (__predict_false(SOLISTENING(so))) {
		sx_xunlock(sx);
		return (ENOTCONN);
	}
	return (0);
}

void
soiounlock(struct sx *sx)
{
	sx_xunlock(sx);
}

/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
sodupsockaddr(const struct sockaddr *sa, int mflags)
{
	struct sockaddr *sa2;

	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
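/*
 * Example (illustrative): callers that must keep a peer address beyond
 * the lifetime of the source buffer duplicate it and later free it with
 * the matching malloc type:
 *
 *	sa2 = sodupsockaddr(sa, M_WAITOK);
 *	...
 *	free(sa2, M_SONAME);
 */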
/*
 * Register per-socket destructor.
 */
void
sodtor_set(struct socket *so, so_dtor_t *func)
{

	SOCK_LOCK_ASSERT(so);
	so->so_dtor = func;
}

/*
 * Register per-socket buffer upcalls.
 */
void
soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg)
{
	struct sockbuf *sb;

	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

	switch (which) {
	case SO_RCV:
		sb = &so->so_rcv;
		break;
	case SO_SND:
		sb = &so->so_snd;
		break;
	}
	SOCK_BUF_LOCK_ASSERT(so, which);
	sb->sb_upcall = func;
	sb->sb_upcallarg = arg;
	sb->sb_flags |= SB_UPCALL;
}

void
soupcall_clear(struct socket *so, sb_which which)
{
	struct sockbuf *sb;

	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

	switch (which) {
	case SO_RCV:
		sb = &so->so_rcv;
		break;
	case SO_SND:
		sb = &so->so_snd;
		break;
	}
	SOCK_BUF_LOCK_ASSERT(so, which);
	KASSERT(sb->sb_upcall != NULL,
	    ("%s: so %p no upcall to clear", __func__, so));
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~SB_UPCALL;
}

void
solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
{

	SOLISTEN_LOCK_ASSERT(so);
	so->sol_upcall = func;
	so->sol_upcallarg = arg;
}

static void
so_rdknl_lock(void *arg)
{
	struct socket *so = arg;

retry:
	if (SOLISTENING(so)) {
		SOLISTEN_LOCK(so);
	} else {
		SOCK_RECVBUF_LOCK(so);
		if (__predict_false(SOLISTENING(so))) {
			SOCK_RECVBUF_UNLOCK(so);
			goto retry;
		}
	}
}

static void
so_rdknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOLISTEN_UNLOCK(so);
	else
		SOCK_RECVBUF_UNLOCK(so);
}

static void
so_rdknl_assert_lock(void *arg, int what)
{
	struct socket *so = arg;

	if (what == LA_LOCKED) {
		if (SOLISTENING(so))
			SOLISTEN_LOCK_ASSERT(so);
		else
			SOCK_RECVBUF_LOCK_ASSERT(so);
	} else {
		if (SOLISTENING(so))
			SOLISTEN_UNLOCK_ASSERT(so);
		else
			SOCK_RECVBUF_UNLOCK_ASSERT(so);
	}
}

static void
so_wrknl_lock(void *arg)
{
	struct socket *so = arg;

retry:
	if (SOLISTENING(so)) {
		SOLISTEN_LOCK(so);
	} else {
		SOCK_SENDBUF_LOCK(so);
		if (__predict_false(SOLISTENING(so))) {
			SOCK_SENDBUF_UNLOCK(so);
			goto retry;
		}
	}
}

static void
so_wrknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOLISTEN_UNLOCK(so);
	else
		SOCK_SENDBUF_UNLOCK(so);
}

static void
so_wrknl_assert_lock(void *arg, int what)
{
	struct socket *so = arg;

	if (what == LA_LOCKED) {
		if (SOLISTENING(so))
			SOLISTEN_LOCK_ASSERT(so);
		else
			SOCK_SENDBUF_LOCK_ASSERT(so);
	} else {
		if (SOLISTENING(so))
			SOLISTEN_UNLOCK_ASSERT(so);
		else
			SOCK_SENDBUF_UNLOCK_ASSERT(so);
	}
}
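/*
 * Example (illustrative sketch): upcalls are registered under the
 * corresponding buffer lock, as soupcall_set() asserts above; the
 * callback name and argument here are hypothetical.
 *
 *	SOCK_RECVBUF_LOCK(so);
 *	soupcall_set(so, SO_RCV, my_rcv_upcall, my_arg);
 *	SOCK_RECVBUF_UNLOCK(so);
 */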
/*
 * Create an external-format (``xsocket'') structure using the information in
 * the kernel-format socket structure pointed to by so.  This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{

	bzero(xso, sizeof(*xso));
	xso->xso_len = sizeof *xso;
	xso->xso_so = (uintptr_t)so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (uintptr_t)so->so_pcb;
	xso->xso_protocol = so->so_proto->pr_protocol;
	xso->xso_family = so->so_proto->pr_domain->dom_family;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_uid = so->so_cred->cr_uid;
	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
	SOCK_LOCK(so);
	xso->so_fibnum = so->so_fibnum;
	if (SOLISTENING(so)) {
		xso->so_qlen = so->sol_qlen;
		xso->so_incqlen = so->sol_incqlen;
		xso->so_qlimit = so->sol_qlimit;
		xso->so_oobmark = 0;
	} else {
		xso->so_state |= so->so_qstate;
		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
		xso->so_oobmark = so->so_oobmark;
		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
		if ((so->so_rcv.sb_flags & SB_SPLICED) != 0)
			xso->so_splice_so = (uintptr_t)so->so_splice->dst;
	}
	SOCK_UNLOCK(so);
}

int
so_options_get(const struct socket *so)
{

	return (so->so_options);
}

void
so_options_set(struct socket *so, int val)
{

	so->so_options = val;
}

int
so_error_get(const struct socket *so)
{

	return (so->so_error);
}

void
so_error_set(struct socket *so, int val)
{

	so->so_error = val;
}
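/*
 * Example (illustrative): monitoring interfaces such as the pcblist
 * sysctls behind netstat(1) export sockets in this external format:
 *
 *	struct xsocket xso;
 *
 *	sotoxsocket(so, &xso);
 *	error = SYSCTL_OUT(req, &xso, sizeof(xso));
 */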