 1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 2004 The FreeBSD Foundation 7 * Copyright (c) 2004-2008 Robert N. M. Watson 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 /* 36 * Comments on the socket life cycle: 37 * 38 * soalloc() sets up socket layer state for a socket, called only by 39 * socreate() and sonewconn(). Socket layer private. 
40 * 41 * sodealloc() tears down socket layer state for a socket, called only by 42 * sofree() and sonewconn(). Socket layer private. 43 * 44 * pr_attach() associates protocol layer state with an allocated socket; 45 * called only once, may fail, aborting socket allocation. This is called 46 * from socreate() and sonewconn(). Socket layer private. 47 * 48 * pr_detach() disassociates protocol layer state from an attached socket, 49 * and will be called exactly once for sockets in which pr_attach() has 50 * been successfully called. If pr_attach() returned an error, 51 * pr_detach() will not be called. Socket layer private. 52 * 53 * pr_abort() and pr_close() notify the protocol layer that the last 54 * consumer of a socket is starting to tear down the socket, and that the 55 * protocol should terminate the connection. Historically, pr_abort() also 56 * detached protocol state from the socket state, but this is no longer the 57 * case. pr_fdclose() is called when userspace invokes close(2) on a socket 58 * file descriptor. 59 * 60 * socreate() creates a socket and attaches protocol state. This is a public 61 * interface that may be used by socket layer consumers to create new 62 * sockets. 63 * 64 * sonewconn() creates a socket and attaches protocol state. This is a 65 * public interface that may be used by protocols to create new sockets when 66 * a new connection is received and will be available for accept() on a 67 * listen socket. 68 * 69 * soclose() destroys a socket after possibly waiting for it to disconnect. 70 * This is a public interface that socket consumers should use to close and 71 * release a socket when done with it. 72 * 73 * soabort() destroys a socket without waiting for it to disconnect (used 74 * only for incoming connections that are already partially or fully 75 * connected). 
This is used internally by the socket layer when clearing 76 * listen socket queues (due to overflow or close on the listen socket), but 77 * is also a public interface protocols may use to abort connections in 78 * their incomplete listen queues should they no longer be required. Sockets 79 * placed in completed connection listen queues should not be aborted for 80 * reasons described in the comment above the soclose() implementation. This 81 * is not a general purpose close routine, and except in the specific 82 * circumstances described here, should not be used. 83 * 84 * sofree() will free a socket and its protocol state if all references on 85 * the socket have been released, and is the public interface to attempt to 86 * free a socket when a reference is removed. This is a socket layer private 87 * interface. 88 * 89 * NOTE: In addition to socreate() and soclose(), which provide a single 90 * socket reference to the consumer to be managed as required, there are two 91 * calls to explicitly manage socket references, soref(), and sorele(). 92 * Currently, these are generally required only when transitioning a socket 93 * from a listen queue to a file descriptor, in order to prevent garbage 94 * collection of the socket at an untimely moment. For a number of reasons, 95 * these interfaces are not preferred, and should be avoided. 96 * 97 * NOTE: With regard to VNETs the general rule is that callers do not set 98 * curvnet. Exceptions to this rule include soabort(), sodisconnect(), 99 * sofree(), sorele(), sonewconn() and sorflush(), which are usually called 100 * from a pre-set VNET context. sopoll_generic() currently does not need a 101 * VNET context to be set. 
102 */ 103 104 #include <sys/cdefs.h> 105 #include "opt_inet.h" 106 #include "opt_inet6.h" 107 #include "opt_kern_tls.h" 108 #include "opt_ktrace.h" 109 #include "opt_sctp.h" 110 111 #include <sys/param.h> 112 #include <sys/systm.h> 113 #include <sys/capsicum.h> 114 #include <sys/fcntl.h> 115 #include <sys/limits.h> 116 #include <sys/lock.h> 117 #include <sys/mac.h> 118 #include <sys/malloc.h> 119 #include <sys/mbuf.h> 120 #include <sys/mutex.h> 121 #include <sys/domain.h> 122 #include <sys/file.h> /* for struct knote */ 123 #include <sys/hhook.h> 124 #include <sys/kernel.h> 125 #include <sys/khelp.h> 126 #include <sys/kthread.h> 127 #include <sys/ktls.h> 128 #include <sys/event.h> 129 #include <sys/eventhandler.h> 130 #include <sys/poll.h> 131 #include <sys/proc.h> 132 #include <sys/protosw.h> 133 #include <sys/sbuf.h> 134 #include <sys/socket.h> 135 #include <sys/socketvar.h> 136 #include <sys/resourcevar.h> 137 #include <net/route.h> 138 #include <sys/sched.h> 139 #include <sys/signalvar.h> 140 #include <sys/smp.h> 141 #include <sys/stat.h> 142 #include <sys/sx.h> 143 #include <sys/sysctl.h> 144 #include <sys/taskqueue.h> 145 #include <sys/uio.h> 146 #include <sys/un.h> 147 #include <sys/unpcb.h> 148 #include <sys/jail.h> 149 #include <sys/syslog.h> 150 #include <netinet/in.h> 151 #include <netinet/in_pcb.h> 152 #include <netinet/tcp.h> 153 154 #include <net/vnet.h> 155 156 #include <security/mac/mac_framework.h> 157 #include <security/mac/mac_internal.h> 158 159 #include <vm/uma.h> 160 161 #ifdef COMPAT_FREEBSD32 162 #include <sys/mount.h> 163 #include <sys/sysent.h> 164 #include <compat/freebsd32/freebsd32.h> 165 #endif 166 167 static int soreceive_generic_locked(struct socket *so, 168 struct sockaddr **psa, struct uio *uio, struct mbuf **mp, 169 struct mbuf **controlp, int *flagsp); 170 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 171 int flags); 172 static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, 173 struct 
sockaddr **psa, struct uio *uio, struct mbuf **mp, 174 struct mbuf **controlp, int flags); 175 static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, 176 struct uio *uio, struct mbuf *top, struct mbuf *control, 177 int flags, struct thread *td); 178 static void so_rdknl_lock(void *); 179 static void so_rdknl_unlock(void *); 180 static void so_rdknl_assert_lock(void *, int); 181 static void so_wrknl_lock(void *); 182 static void so_wrknl_unlock(void *); 183 static void so_wrknl_assert_lock(void *, int); 184 185 static void filt_sordetach(struct knote *kn); 186 static int filt_soread(struct knote *kn, long hint); 187 static void filt_sowdetach(struct knote *kn); 188 static int filt_sowrite(struct knote *kn, long hint); 189 static int filt_soempty(struct knote *kn, long hint); 190 191 static const struct filterops soread_filtops = { 192 .f_isfd = 1, 193 .f_detach = filt_sordetach, 194 .f_event = filt_soread, 195 .f_copy = knote_triv_copy, 196 }; 197 static const struct filterops sowrite_filtops = { 198 .f_isfd = 1, 199 .f_detach = filt_sowdetach, 200 .f_event = filt_sowrite, 201 .f_copy = knote_triv_copy, 202 }; 203 static const struct filterops soempty_filtops = { 204 .f_isfd = 1, 205 .f_detach = filt_sowdetach, 206 .f_event = filt_soempty, 207 .f_copy = knote_triv_copy, 208 }; 209 210 so_gen_t so_gencnt; /* generation count for sockets */ 211 212 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 213 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 214 215 #define VNET_SO_ASSERT(so) \ 216 VNET_ASSERT(curvnet != NULL, \ 217 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 218 219 #ifdef SOCKET_HHOOK 220 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 221 #define V_socket_hhh VNET(socket_hhh) 222 static inline int hhook_run_socket(struct socket *, void *, int32_t); 223 #endif 224 225 #ifdef COMPAT_FREEBSD32 226 #ifdef __amd64__ 227 /* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. 
*/ 228 #define __splice32_packed __packed 229 #else 230 #define __splice32_packed 231 #endif 232 struct splice32 { 233 int32_t sp_fd; 234 int64_t sp_max; 235 struct timeval32 sp_idle; 236 } __splice32_packed; 237 #undef __splice32_packed 238 #endif 239 240 /* 241 * Limit on the number of connections in the listen queue waiting 242 * for accept(2). 243 * NB: The original sysctl somaxconn is still available but hidden 244 * to prevent confusion about the actual purpose of this number. 245 */ 246 VNET_DEFINE_STATIC(u_int, somaxconn) = SOMAXCONN; 247 #define V_somaxconn VNET(somaxconn) 248 249 static int 250 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 251 { 252 int error; 253 u_int val; 254 255 val = V_somaxconn; 256 error = sysctl_handle_int(oidp, &val, 0, req); 257 if (error || !req->newptr ) 258 return (error); 259 260 /* 261 * The purpose of the UINT_MAX / 3 limit, is so that the formula 262 * 3 * sol_qlimit / 2 263 * below, will not overflow. 264 */ 265 266 if (val < 1 || val > UINT_MAX / 3) 267 return (EINVAL); 268 269 V_somaxconn = val; 270 return (0); 271 } 272 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, 273 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, sizeof(u_int), 274 sysctl_somaxconn, "IU", 275 "Maximum listen socket pending connection accept queue size"); 276 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 277 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, 278 sizeof(u_int), sysctl_somaxconn, "IU", 279 "Maximum listen socket pending connection accept queue size (compat)"); 280 281 static u_int numopensockets; 282 static int 283 sysctl_numopensockets(SYSCTL_HANDLER_ARGS) 284 { 285 u_int val; 286 287 #ifdef VIMAGE 288 if(!IS_DEFAULT_VNET(curvnet)) 289 val = curvnet->vnet_sockcnt; 290 else 291 #endif 292 val = numopensockets; 293 return (sysctl_handle_int(oidp, &val, 0, req)); 294 } 295 SYSCTL_PROC(_kern_ipc, OID_AUTO, numopensockets, 296 CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, 
sizeof(u_int), 297 sysctl_numopensockets, "IU", "Number of open sockets"); 298 299 /* 300 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 301 * so_gencnt field. 302 */ 303 static struct mtx so_global_mtx; 304 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 305 306 /* 307 * General IPC sysctl name space, used by sockets and a variety of other IPC 308 * types. 309 */ 310 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 311 "IPC"); 312 313 /* 314 * Initialize the socket subsystem and set up the socket 315 * memory allocator. 316 */ 317 static uma_zone_t socket_zone; 318 int maxsockets; 319 320 static void 321 socket_zone_change(void *tag) 322 { 323 324 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 325 } 326 327 static int splice_init_state; 328 static struct sx splice_init_lock; 329 SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init"); 330 331 static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0, 332 "Settings relating to the SO_SPLICE socket option"); 333 334 static bool splice_receive_stream = true; 335 SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN, 336 &splice_receive_stream, 0, 337 "Use soreceive_stream() for stream splices"); 338 339 static uma_zone_t splice_zone; 340 static struct proc *splice_proc; 341 struct splice_wq { 342 struct mtx mtx; 343 STAILQ_HEAD(, so_splice) head; 344 bool running; 345 } __aligned(CACHE_LINE_SIZE); 346 static struct splice_wq *splice_wq; 347 static uint32_t splice_index = 0; 348 349 static void so_splice_timeout(void *arg, int pending); 350 static void so_splice_xfer(struct so_splice *s); 351 static int so_unsplice(struct socket *so, bool timeout); 352 353 static void 354 splice_work_thread(void *ctx) 355 { 356 struct splice_wq *wq = ctx; 357 struct so_splice *s, *s_temp; 358 STAILQ_HEAD(, so_splice) local_head; 359 int cpu; 360 361 cpu = wq - splice_wq; 362 if (bootverbose) 363 printf("starting so_splice worker thread for 
CPU %d\n", cpu); 364 365 for (;;) { 366 mtx_lock(&wq->mtx); 367 while (STAILQ_EMPTY(&wq->head)) { 368 wq->running = false; 369 mtx_sleep(wq, &wq->mtx, 0, "-", 0); 370 wq->running = true; 371 } 372 STAILQ_INIT(&local_head); 373 STAILQ_CONCAT(&local_head, &wq->head); 374 STAILQ_INIT(&wq->head); 375 mtx_unlock(&wq->mtx); 376 STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) { 377 mtx_lock(&s->mtx); 378 CURVNET_SET(s->src->so_vnet); 379 so_splice_xfer(s); 380 CURVNET_RESTORE(); 381 } 382 } 383 } 384 385 static void 386 so_splice_dispatch_async(struct so_splice *sp) 387 { 388 struct splice_wq *wq; 389 bool running; 390 391 wq = &splice_wq[sp->wq_index]; 392 mtx_lock(&wq->mtx); 393 STAILQ_INSERT_TAIL(&wq->head, sp, next); 394 running = wq->running; 395 mtx_unlock(&wq->mtx); 396 if (!running) 397 wakeup(wq); 398 } 399 400 void 401 so_splice_dispatch(struct so_splice *sp) 402 { 403 mtx_assert(&sp->mtx, MA_OWNED); 404 405 if (sp->state != SPLICE_IDLE) { 406 mtx_unlock(&sp->mtx); 407 } else { 408 sp->state = SPLICE_QUEUED; 409 mtx_unlock(&sp->mtx); 410 so_splice_dispatch_async(sp); 411 } 412 } 413 414 static int 415 splice_zinit(void *mem, int size __unused, int flags __unused) 416 { 417 struct so_splice *s; 418 419 s = (struct so_splice *)mem; 420 mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF); 421 return (0); 422 } 423 424 static void 425 splice_zfini(void *mem, int size) 426 { 427 struct so_splice *s; 428 429 s = (struct so_splice *)mem; 430 mtx_destroy(&s->mtx); 431 } 432 433 static int 434 splice_init(void) 435 { 436 struct thread *td; 437 int error, i, state; 438 439 state = atomic_load_acq_int(&splice_init_state); 440 if (__predict_true(state > 0)) 441 return (0); 442 if (state < 0) 443 return (ENXIO); 444 sx_xlock(&splice_init_lock); 445 if (splice_init_state != 0) { 446 sx_xunlock(&splice_init_lock); 447 return (0); 448 } 449 450 splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL, 451 NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0); 452 453 
splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP, 454 M_WAITOK | M_ZERO); 455 456 /* 457 * Initialize the workqueues to run the splice work. We create a 458 * work queue for each CPU. 459 */ 460 CPU_FOREACH(i) { 461 STAILQ_INIT(&splice_wq[i].head); 462 mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF); 463 } 464 465 /* Start kthreads for each workqueue. */ 466 error = 0; 467 CPU_FOREACH(i) { 468 error = kproc_kthread_add(splice_work_thread, &splice_wq[i], 469 &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i); 470 if (error) { 471 printf("Can't add so_splice thread %d error %d\n", 472 i, error); 473 break; 474 } 475 476 /* 477 * It's possible to create loops with SO_SPLICE; ensure that 478 * worker threads aren't able to starve the system too easily. 479 */ 480 thread_lock(td); 481 sched_prio(td, PUSER); 482 thread_unlock(td); 483 } 484 485 splice_init_state = error != 0 ? -1 : 1; 486 sx_xunlock(&splice_init_lock); 487 488 return (error); 489 } 490 491 /* 492 * Lock a pair of socket's I/O locks for splicing. Avoid blocking while holding 493 * one lock in order to avoid potential deadlocks in case there is some other 494 * code path which acquires more than one I/O lock at a time. 
495 */ 496 static void 497 splice_lock_pair(struct socket *so_src, struct socket *so_dst) 498 { 499 int error; 500 501 for (;;) { 502 error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR); 503 KASSERT(error == 0, 504 ("%s: failed to lock send I/O lock: %d", __func__, error)); 505 error = SOCK_IO_RECV_LOCK(so_src, 0); 506 KASSERT(error == 0 || error == EWOULDBLOCK, 507 ("%s: failed to lock recv I/O lock: %d", __func__, error)); 508 if (error == 0) 509 break; 510 SOCK_IO_SEND_UNLOCK(so_dst); 511 512 error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR); 513 KASSERT(error == 0, 514 ("%s: failed to lock recv I/O lock: %d", __func__, error)); 515 error = SOCK_IO_SEND_LOCK(so_dst, 0); 516 KASSERT(error == 0 || error == EWOULDBLOCK, 517 ("%s: failed to lock send I/O lock: %d", __func__, error)); 518 if (error == 0) 519 break; 520 SOCK_IO_RECV_UNLOCK(so_src); 521 } 522 } 523 524 static void 525 splice_unlock_pair(struct socket *so_src, struct socket *so_dst) 526 { 527 SOCK_IO_RECV_UNLOCK(so_src); 528 SOCK_IO_SEND_UNLOCK(so_dst); 529 } 530 531 /* 532 * Move data from the source to the sink. Assumes that both of the relevant 533 * socket I/O locks are held. 
534 */ 535 static int 536 so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max, 537 ssize_t *lenp) 538 { 539 struct uio uio; 540 struct mbuf *m; 541 struct sockbuf *sb_src, *sb_dst; 542 ssize_t len; 543 long space; 544 int error, flags; 545 546 SOCK_IO_RECV_ASSERT_LOCKED(so_src); 547 SOCK_IO_SEND_ASSERT_LOCKED(so_dst); 548 549 error = 0; 550 m = NULL; 551 memset(&uio, 0, sizeof(uio)); 552 553 sb_src = &so_src->so_rcv; 554 sb_dst = &so_dst->so_snd; 555 556 space = sbspace(sb_dst); 557 if (space < 0) 558 space = 0; 559 len = MIN(max, MIN(space, sbavail(sb_src))); 560 if (len == 0) { 561 SOCK_RECVBUF_LOCK(so_src); 562 if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0) 563 error = EPIPE; 564 SOCK_RECVBUF_UNLOCK(so_src); 565 } else { 566 flags = MSG_DONTWAIT; 567 uio.uio_resid = len; 568 if (splice_receive_stream && sb_src->sb_tls_info == NULL) { 569 error = soreceive_stream_locked(so_src, sb_src, NULL, 570 &uio, &m, NULL, flags); 571 } else { 572 error = soreceive_generic_locked(so_src, NULL, 573 &uio, &m, NULL, &flags); 574 } 575 if (error != 0 && m != NULL) { 576 m_freem(m); 577 m = NULL; 578 } 579 } 580 if (m != NULL) { 581 len -= uio.uio_resid; 582 error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL, 583 MSG_DONTWAIT, curthread); 584 } else if (error == 0) { 585 len = 0; 586 SOCK_SENDBUF_LOCK(so_dst); 587 if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0) 588 error = EPIPE; 589 SOCK_SENDBUF_UNLOCK(so_dst); 590 } 591 if (error == 0) 592 *lenp = len; 593 return (error); 594 } 595 596 /* 597 * Transfer data from the source to the sink. 
598 */ 599 static void 600 so_splice_xfer(struct so_splice *sp) 601 { 602 struct socket *so_src, *so_dst; 603 off_t max; 604 ssize_t len; 605 int error; 606 607 mtx_assert(&sp->mtx, MA_OWNED); 608 KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING, 609 ("so_splice_xfer: invalid state %d", sp->state)); 610 KASSERT(sp->max != 0, ("so_splice_xfer: max == 0")); 611 612 if (sp->state == SPLICE_CLOSING) { 613 /* Userspace asked us to close the splice. */ 614 goto closing; 615 } 616 617 sp->state = SPLICE_RUNNING; 618 so_src = sp->src; 619 so_dst = sp->dst; 620 max = sp->max > 0 ? sp->max - so_src->so_splice_sent : OFF_MAX; 621 if (max < 0) 622 max = 0; 623 624 /* 625 * Lock the sockets in order to block userspace from doing anything 626 * sneaky. If an error occurs or one of the sockets can no longer 627 * transfer data, we will automatically unsplice. 628 */ 629 mtx_unlock(&sp->mtx); 630 splice_lock_pair(so_src, so_dst); 631 632 error = so_splice_xfer_data(so_src, so_dst, max, &len); 633 634 mtx_lock(&sp->mtx); 635 636 /* 637 * Update our stats while still holding the socket locks. This 638 * synchronizes with getsockopt(SO_SPLICE), see the comment there. 639 */ 640 if (error == 0) { 641 KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len)); 642 so_src->so_splice_sent += len; 643 } 644 splice_unlock_pair(so_src, so_dst); 645 646 switch (sp->state) { 647 case SPLICE_CLOSING: 648 closing: 649 sp->state = SPLICE_CLOSED; 650 wakeup(sp); 651 mtx_unlock(&sp->mtx); 652 break; 653 case SPLICE_RUNNING: 654 if (error != 0 || 655 (sp->max > 0 && so_src->so_splice_sent >= sp->max)) { 656 sp->state = SPLICE_EXCEPTION; 657 soref(so_src); 658 mtx_unlock(&sp->mtx); 659 (void)so_unsplice(so_src, false); 660 sorele(so_src); 661 } else { 662 /* 663 * Locklessly check for additional bytes in the source's 664 * receive buffer and queue more work if possible. 
We 665 * may end up queuing needless work, but that's ok, and 666 * if we race with a thread inserting more data into the 667 * buffer and observe sbavail() == 0, the splice mutex 668 * ensures that splice_push() will queue more work for 669 * us. 670 */ 671 if (sbavail(&so_src->so_rcv) > 0 && 672 sbspace(&so_dst->so_snd) > 0) { 673 sp->state = SPLICE_QUEUED; 674 mtx_unlock(&sp->mtx); 675 so_splice_dispatch_async(sp); 676 } else { 677 sp->state = SPLICE_IDLE; 678 mtx_unlock(&sp->mtx); 679 } 680 } 681 break; 682 default: 683 __assert_unreachable(); 684 } 685 } 686 687 static void 688 socket_init(void *tag) 689 { 690 691 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 692 NULL, NULL, UMA_ALIGN_PTR, 0); 693 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 694 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 695 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 696 EVENTHANDLER_PRI_FIRST); 697 } 698 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 699 700 #ifdef SOCKET_HHOOK 701 static void 702 socket_hhook_register(int subtype) 703 { 704 705 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 706 &V_socket_hhh[subtype], 707 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 708 printf("%s: WARNING: unable to register hook\n", __func__); 709 } 710 711 static void 712 socket_hhook_deregister(int subtype) 713 { 714 715 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 716 printf("%s: WARNING: unable to deregister hook\n", __func__); 717 } 718 719 static void 720 socket_vnet_init(const void *unused __unused) 721 { 722 int i; 723 724 /* We expect a contiguous range */ 725 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 726 socket_hhook_register(i); 727 } 728 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 729 socket_vnet_init, NULL); 730 731 static void 732 socket_vnet_uninit(const void *unused __unused) 733 { 734 int i; 735 736 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 
737 socket_hhook_deregister(i); 738 } 739 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 740 socket_vnet_uninit, NULL); 741 #endif /* SOCKET_HHOOK */ 742 743 /* 744 * Initialise maxsockets. This SYSINIT must be run after 745 * tunable_mbinit(). 746 */ 747 static void 748 init_maxsockets(void *ignored) 749 { 750 751 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 752 maxsockets = imax(maxsockets, maxfiles); 753 } 754 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 755 756 /* 757 * Sysctl to get and set the maximum global sockets limit. Notify protocols 758 * of the change so that they can update their dependent limits as required. 759 */ 760 static int 761 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 762 { 763 int error, newmaxsockets; 764 765 newmaxsockets = maxsockets; 766 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 767 if (error == 0 && req->newptr && newmaxsockets != maxsockets) { 768 if (newmaxsockets > maxsockets && 769 newmaxsockets <= maxfiles) { 770 maxsockets = newmaxsockets; 771 EVENTHANDLER_INVOKE(maxsockets_change); 772 } else 773 error = EINVAL; 774 } 775 return (error); 776 } 777 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, 778 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 779 &maxsockets, 0, sysctl_maxsockets, "IU", 780 "Maximum number of sockets available"); 781 782 /* 783 * Socket operation routines. These routines are called by the routines in 784 * sys_socket.c or from a system process, and implement the semantics of 785 * socket operations by switching out to the protocol specific routines. 786 */ 787 788 /* 789 * Get a socket structure from our zone, and initialize it. Note that it 790 * would probably be better to allocate socket and PCB at the same time, but 791 * I'm not convinced that all the protocols can be easily modified to do 792 * this. 793 * 794 * soalloc() returns a socket with a ref count of 0. 
795 */ 796 static struct socket * 797 soalloc(struct vnet *vnet) 798 { 799 struct socket *so; 800 801 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 802 if (so == NULL) 803 return (NULL); 804 #ifdef MAC 805 if (mac_socket_init(so, M_NOWAIT) != 0) { 806 uma_zfree(socket_zone, so); 807 return (NULL); 808 } 809 #endif 810 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { 811 uma_zfree(socket_zone, so); 812 return (NULL); 813 } 814 815 /* 816 * The socket locking protocol allows to lock 2 sockets at a time, 817 * however, the first one must be a listening socket. WITNESS lacks 818 * a feature to change class of an existing lock, so we use DUPOK. 819 */ 820 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); 821 so->so_rcv.sb_sel = &so->so_rdsel; 822 so->so_snd.sb_sel = &so->so_wrsel; 823 sx_init(&so->so_snd_sx, "so_snd_sx"); 824 sx_init(&so->so_rcv_sx, "so_rcv_sx"); 825 TAILQ_INIT(&so->so_snd.sb_aiojobq); 826 TAILQ_INIT(&so->so_rcv.sb_aiojobq); 827 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); 828 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); 829 #ifdef VIMAGE 830 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 831 __func__, __LINE__, so)); 832 so->so_vnet = vnet; 833 #endif 834 #ifdef SOCKET_HHOOK 835 /* We shouldn't need the so_global_mtx */ 836 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { 837 /* Do we need more comprehensive error returns? */ 838 uma_zfree(socket_zone, so); 839 return (NULL); 840 } 841 #endif 842 mtx_lock(&so_global_mtx); 843 so->so_gencnt = ++so_gencnt; 844 ++numopensockets; 845 #ifdef VIMAGE 846 vnet->vnet_sockcnt++; 847 #endif 848 mtx_unlock(&so_global_mtx); 849 850 return (so); 851 } 852 853 /* 854 * Free the storage associated with a socket at the socket layer, tear down 855 * locks, labels, etc. All protocol state is assumed already to have been 856 * torn down (and possibly never set up) by the caller. 
857 */ 858 void 859 sodealloc(struct socket *so) 860 { 861 862 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 863 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 864 865 mtx_lock(&so_global_mtx); 866 so->so_gencnt = ++so_gencnt; 867 --numopensockets; /* Could be below, but faster here. */ 868 #ifdef VIMAGE 869 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 870 __func__, __LINE__, so)); 871 so->so_vnet->vnet_sockcnt--; 872 #endif 873 mtx_unlock(&so_global_mtx); 874 #ifdef MAC 875 mac_socket_destroy(so); 876 #endif 877 #ifdef SOCKET_HHOOK 878 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); 879 #endif 880 881 khelp_destroy_osd(&so->osd); 882 if (SOLISTENING(so)) { 883 if (so->sol_accept_filter != NULL) 884 accept_filt_setopt(so, NULL); 885 } else { 886 if (so->so_rcv.sb_hiwat) 887 (void)chgsbsize(so->so_cred->cr_uidinfo, 888 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 889 if (so->so_snd.sb_hiwat) 890 (void)chgsbsize(so->so_cred->cr_uidinfo, 891 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 892 sx_destroy(&so->so_snd_sx); 893 sx_destroy(&so->so_rcv_sx); 894 } 895 crfree(so->so_cred); 896 mtx_destroy(&so->so_lock); 897 uma_zfree(socket_zone, so); 898 } 899 900 /* 901 * Shim to accomodate protocols that already do their own socket buffers 902 * management (marked with PR_SOCKBUF) with protocols that yet do not. 903 * 904 * Attach via socket(2) is different from attach via accept(2). In case of 905 * normal socket(2) syscall it is the pr_attach that calls soreserve(), even 906 * for protocols that don't yet do PR_SOCKBUF. In case of accepted connection 907 * it is our shim that calls soreserve() and the hiwat values are taken from 908 * the parent socket. 
909 */ 910 static int 911 soattach(struct socket *so, int proto, struct thread *td, struct socket *head) 912 { 913 int error; 914 915 VNET_ASSERT(curvnet == so->so_vnet, 916 ("%s: %p != %p", __func__, curvnet, so->so_vnet)); 917 918 if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 919 mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); 920 mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); 921 so->so_snd.sb_mtx = &so->so_snd_mtx; 922 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 923 } 924 if (head == NULL || (error = soreserve(so, head->sol_sbsnd_hiwat, 925 head->sol_sbrcv_hiwat)) == 0) 926 error = so->so_proto->pr_attach(so, proto, td); 927 if (error != 0 && (so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 928 mtx_destroy(&so->so_snd_mtx); 929 mtx_destroy(&so->so_rcv_mtx); 930 } 931 932 return (error); 933 } 934 935 /* 936 * socreate returns a socket with a ref count of 1 and a file descriptor 937 * reference. The socket should be closed with soclose(). 938 */ 939 int 940 socreate(int dom, struct socket **aso, int type, int proto, 941 struct ucred *cred, struct thread *td) 942 { 943 struct protosw *prp; 944 struct socket *so; 945 int error; 946 947 prp = pffindproto(dom, type, proto); 948 if (prp == NULL) { 949 /* No support for domain. */ 950 if (pffinddomain(dom) == NULL) 951 return (EAFNOSUPPORT); 952 /* No support for socket type. 
*/ 953 if (proto == 0 && type != 0) 954 return (EPROTOTYPE); 955 return (EPROTONOSUPPORT); 956 } 957 958 MPASS(prp->pr_attach); 959 960 if ((prp->pr_flags & PR_CAPATTACH) == 0) { 961 if (CAP_TRACING(td)) 962 ktrcapfail(CAPFAIL_PROTO, &proto); 963 if (IN_CAPABILITY_MODE(td)) 964 return (ECAPMODE); 965 } 966 967 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) 968 return (EPROTONOSUPPORT); 969 970 so = soalloc(CRED_TO_VNET(cred)); 971 if (so == NULL) 972 return (ENOBUFS); 973 974 so->so_type = type; 975 so->so_cred = crhold(cred); 976 if ((prp->pr_domain->dom_family == PF_INET) || 977 (prp->pr_domain->dom_family == PF_INET6) || 978 (prp->pr_domain->dom_family == PF_ROUTE)) 979 so->so_fibnum = td->td_proc->p_fibnum; 980 else 981 so->so_fibnum = 0; 982 so->so_proto = prp; 983 #ifdef MAC 984 mac_socket_create(cred, so); 985 #endif 986 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 987 so_rdknl_assert_lock); 988 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 989 so_wrknl_assert_lock); 990 CURVNET_SET(so->so_vnet); 991 error = soattach(so, proto, td, NULL); 992 CURVNET_RESTORE(); 993 if (error) { 994 sodealloc(so); 995 return (error); 996 } 997 soref(so); 998 *aso = so; 999 return (0); 1000 } 1001 1002 #ifdef REGRESSION 1003 static int regression_sonewconn_earlytest = 1; 1004 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 1005 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 1006 #endif 1007 1008 static int sooverprio = LOG_DEBUG; 1009 SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW, 1010 &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable"); 1011 1012 static struct timeval overinterval = { 60, 0 }; 1013 SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, 1014 &overinterval, 1015 "Delay in seconds between warnings for listen socket overflows"); 1016 1017 /* 1018 * When an attempt at a new connection is noted on a socket 
which supports 1019 * accept(2), the protocol has two options: 1020 * 1) Call legacy sonewconn() function, which would call protocol attach 1021 * method, same as used for socket(2). 1022 * 2) Call solisten_clone(), do attach that is specific to a cloned connection, 1023 * and then call solisten_enqueue(). 1024 * 1025 * Note: the ref count on the socket is 0 on return. 1026 */ 1027 struct socket * 1028 solisten_clone(struct socket *head) 1029 { 1030 struct sbuf descrsb; 1031 struct socket *so; 1032 int len, overcount; 1033 u_int qlen; 1034 const char localprefix[] = "local:"; 1035 char descrbuf[SUNPATHLEN + sizeof(localprefix)]; 1036 #if defined(INET6) 1037 char addrbuf[INET6_ADDRSTRLEN]; 1038 #elif defined(INET) 1039 char addrbuf[INET_ADDRSTRLEN]; 1040 #endif 1041 bool dolog, over; 1042 1043 SOLISTEN_LOCK(head); 1044 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 1045 #ifdef REGRESSION 1046 if (regression_sonewconn_earlytest && over) { 1047 #else 1048 if (over) { 1049 #endif 1050 head->sol_overcount++; 1051 dolog = (sooverprio >= 0) && 1052 !!ratecheck(&head->sol_lastover, &overinterval); 1053 1054 /* 1055 * If we're going to log, copy the overflow count and queue 1056 * length from the listen socket before dropping the lock. 1057 * Also, reset the overflow count. 1058 */ 1059 if (dolog) { 1060 overcount = head->sol_overcount; 1061 head->sol_overcount = 0; 1062 qlen = head->sol_qlen; 1063 } 1064 SOLISTEN_UNLOCK(head); 1065 1066 if (dolog) { 1067 /* 1068 * Try to print something descriptive about the 1069 * socket for the error message. 
1070 */ 1071 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), 1072 SBUF_FIXEDLEN); 1073 switch (head->so_proto->pr_domain->dom_family) { 1074 #if defined(INET) || defined(INET6) 1075 #ifdef INET 1076 case AF_INET: 1077 #endif 1078 #ifdef INET6 1079 case AF_INET6: 1080 if (head->so_proto->pr_domain->dom_family == 1081 AF_INET6 || 1082 (sotoinpcb(head)->inp_inc.inc_flags & 1083 INC_ISIPV6)) { 1084 ip6_sprintf(addrbuf, 1085 &sotoinpcb(head)->inp_inc.inc6_laddr); 1086 sbuf_printf(&descrsb, "[%s]", addrbuf); 1087 } else 1088 #endif 1089 { 1090 #ifdef INET 1091 inet_ntoa_r( 1092 sotoinpcb(head)->inp_inc.inc_laddr, 1093 addrbuf); 1094 sbuf_cat(&descrsb, addrbuf); 1095 #endif 1096 } 1097 sbuf_printf(&descrsb, ":%hu (proto %u)", 1098 ntohs(sotoinpcb(head)->inp_inc.inc_lport), 1099 head->so_proto->pr_protocol); 1100 break; 1101 #endif /* INET || INET6 */ 1102 case AF_UNIX: 1103 sbuf_cat(&descrsb, localprefix); 1104 if (sotounpcb(head)->unp_addr != NULL) 1105 len = 1106 sotounpcb(head)->unp_addr->sun_len - 1107 offsetof(struct sockaddr_un, 1108 sun_path); 1109 else 1110 len = 0; 1111 if (len > 0) 1112 sbuf_bcat(&descrsb, 1113 sotounpcb(head)->unp_addr->sun_path, 1114 len); 1115 else 1116 sbuf_cat(&descrsb, "(unknown)"); 1117 break; 1118 } 1119 1120 /* 1121 * If we can't print something more specific, at least 1122 * print the domain name. 1123 */ 1124 if (sbuf_finish(&descrsb) != 0 || 1125 sbuf_len(&descrsb) <= 0) { 1126 sbuf_clear(&descrsb); 1127 sbuf_cat(&descrsb, 1128 head->so_proto->pr_domain->dom_name ?: 1129 "unknown"); 1130 sbuf_finish(&descrsb); 1131 } 1132 KASSERT(sbuf_len(&descrsb) > 0, 1133 ("%s: sbuf creation failed", __func__)); 1134 /* 1135 * Preserve the historic listen queue overflow log 1136 * message, that starts with "sonewconn:". It has 1137 * been known to sysadmins for years and also test 1138 * sys/kern/sonewconn_overflow checks for it. 
1139 */ 1140 if (head->so_cred == 0) { 1141 log(LOG_PRI(sooverprio), 1142 "sonewconn: pcb %p (%s): " 1143 "Listen queue overflow: %i already in " 1144 "queue awaiting acceptance (%d " 1145 "occurrences)\n", head->so_pcb, 1146 sbuf_data(&descrsb), 1147 qlen, overcount); 1148 } else { 1149 log(LOG_PRI(sooverprio), 1150 "sonewconn: pcb %p (%s): " 1151 "Listen queue overflow: " 1152 "%i already in queue awaiting acceptance " 1153 "(%d occurrences), euid %d, rgid %d, jail %s\n", 1154 head->so_pcb, sbuf_data(&descrsb), qlen, 1155 overcount, head->so_cred->cr_uid, 1156 head->so_cred->cr_rgid, 1157 head->so_cred->cr_prison ? 1158 head->so_cred->cr_prison->pr_name : 1159 "not_jailed"); 1160 } 1161 sbuf_delete(&descrsb); 1162 1163 overcount = 0; 1164 } 1165 1166 return (NULL); 1167 } 1168 SOLISTEN_UNLOCK(head); 1169 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 1170 __func__, head)); 1171 so = soalloc(head->so_vnet); 1172 if (so == NULL) { 1173 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 1174 "limit reached or out of memory\n", 1175 __func__, head->so_pcb); 1176 return (NULL); 1177 } 1178 so->so_listen = head; 1179 so->so_type = head->so_type; 1180 /* 1181 * POSIX is ambiguous on what options an accept(2)ed socket should 1182 * inherit from the listener. Words "create a new socket" may be 1183 * interpreted as not inheriting anything. Best programming practice 1184 * for application developers is to not rely on such inheritance. 1185 * FreeBSD had historically inherited all so_options excluding 1186 * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options, 1187 * including those completely irrelevant to a new born socket. For 1188 * compatibility with older versions we will inherit a list of 1189 * meaningful options. 1190 * The crucial bit to inherit is SO_ACCEPTFILTER. We need it present 1191 * in the child socket for soisconnected() promoting socket from the 1192 * incomplete queue to complete. 
It will be cleared before the child 1193 * gets available to accept(2). 1194 */ 1195 so->so_options = head->so_options & (SO_ACCEPTFILTER | SO_KEEPALIVE | 1196 SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE); 1197 so->so_linger = head->so_linger; 1198 so->so_state = head->so_state; 1199 so->so_fibnum = head->so_fibnum; 1200 so->so_proto = head->so_proto; 1201 so->so_cred = crhold(head->so_cred); 1202 #ifdef SOCKET_HHOOK 1203 if (V_socket_hhh[HHOOK_SOCKET_NEWCONN]->hhh_nhooks > 0) { 1204 if (hhook_run_socket(so, head, HHOOK_SOCKET_NEWCONN)) { 1205 sodealloc(so); 1206 log(LOG_DEBUG, "%s: hhook run failed\n", __func__); 1207 return (NULL); 1208 } 1209 } 1210 #endif 1211 #ifdef MAC 1212 mac_socket_newconn(head, so); 1213 #endif 1214 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1215 so_rdknl_assert_lock); 1216 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1217 so_wrknl_assert_lock); 1218 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; 1219 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 1220 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 1221 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 1222 so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; 1223 so->so_snd.sb_flags = head->sol_sbsnd_flags & 1224 (SB_AUTOSIZE | SB_AUTOLOWAT); 1225 1226 return (so); 1227 } 1228 1229 /* Connstatus may be 0 or SS_ISCONNECTED. */ 1230 struct socket * 1231 sonewconn(struct socket *head, int connstatus) 1232 { 1233 struct socket *so; 1234 1235 if ((so = solisten_clone(head)) == NULL) 1236 return (NULL); 1237 1238 if (soattach(so, 0, NULL, head) != 0) { 1239 sodealloc(so); 1240 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", 1241 __func__, head->so_pcb); 1242 return (NULL); 1243 } 1244 1245 (void)solisten_enqueue(so, connstatus); 1246 1247 return (so); 1248 } 1249 1250 /* 1251 * Enqueue socket cloned by solisten_clone() to the listen queue of the 1252 * listener it has been cloned from. 
1253 * 1254 * Return 'true' if socket landed on complete queue, otherwise 'false'. 1255 */ 1256 bool 1257 solisten_enqueue(struct socket *so, int connstatus) 1258 { 1259 struct socket *head = so->so_listen; 1260 1261 MPASS(refcount_load(&so->so_count) == 0); 1262 refcount_init(&so->so_count, 1); 1263 1264 SOLISTEN_LOCK(head); 1265 if (head->sol_accept_filter != NULL) 1266 connstatus = 0; 1267 so->so_state |= connstatus; 1268 soref(head); /* A socket on (in)complete queue refs head. */ 1269 if (connstatus) { 1270 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 1271 so->so_qstate = SQ_COMP; 1272 head->sol_qlen++; 1273 solisten_wakeup(head); /* unlocks */ 1274 return (true); 1275 } else { 1276 /* 1277 * Keep removing sockets from the head until there's room for 1278 * us to insert on the tail. In pre-locking revisions, this 1279 * was a simple if(), but as we could be racing with other 1280 * threads and soabort() requires dropping locks, we must 1281 * loop waiting for the condition to be true. 1282 */ 1283 while (head->sol_incqlen > head->sol_qlimit) { 1284 struct socket *sp; 1285 1286 sp = TAILQ_FIRST(&head->sol_incomp); 1287 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 1288 head->sol_incqlen--; 1289 SOCK_LOCK(sp); 1290 sp->so_qstate = SQ_NONE; 1291 sp->so_listen = NULL; 1292 SOCK_UNLOCK(sp); 1293 sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ 1294 soabort(sp); 1295 SOLISTEN_LOCK(head); 1296 } 1297 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 1298 so->so_qstate = SQ_INCOMP; 1299 head->sol_incqlen++; 1300 SOLISTEN_UNLOCK(head); 1301 return (false); 1302 } 1303 } 1304 1305 #if defined(SCTP) || defined(SCTP_SUPPORT) 1306 /* 1307 * Socket part of sctp_peeloff(). Create a new socket for an 1308 * association. The new socket is returned with a reference. 1309 * 1310 * XXXGL: reduce copy-paste with solisten_clone(). 
1311 */ 1312 struct socket * 1313 sopeeloff(struct socket *head) 1314 { 1315 struct socket *so; 1316 1317 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 1318 __func__, __LINE__, head)); 1319 KASSERT(head->so_type == SOCK_SEQPACKET, 1320 ("%s: unexpecte so_type: %d", __func__, head->so_type)); 1321 so = soalloc(head->so_vnet); 1322 if (so == NULL) { 1323 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 1324 "limit reached or out of memory\n", 1325 __func__, head->so_pcb); 1326 return (NULL); 1327 } 1328 so->so_type = SOCK_STREAM; 1329 so->so_options = head->so_options; 1330 so->so_linger = head->so_linger; 1331 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 1332 so->so_fibnum = head->so_fibnum; 1333 so->so_proto = head->so_proto; 1334 so->so_cred = crhold(head->so_cred); 1335 #ifdef MAC 1336 mac_socket_newconn(head, so); 1337 #endif 1338 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1339 so_rdknl_assert_lock); 1340 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1341 so_wrknl_assert_lock); 1342 if (soattach(so, 0, NULL, head)) { 1343 sodealloc(so); 1344 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", 1345 __func__, head->so_pcb); 1346 return (NULL); 1347 } 1348 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 1349 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 1350 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 1351 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 1352 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 1353 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 1354 1355 soref(so); 1356 1357 return (so); 1358 } 1359 #endif /* SCTP */ 1360 1361 int 1362 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 1363 { 1364 int error; 1365 1366 CURVNET_SET(so->so_vnet); 1367 error = so->so_proto->pr_bind(so, nam, td); 1368 CURVNET_RESTORE(); 1369 return (error); 1370 } 1371 1372 int 1373 sobindat(int fd, struct socket *so, struct sockaddr *nam, 
struct thread *td) 1374 { 1375 int error; 1376 1377 CURVNET_SET(so->so_vnet); 1378 error = so->so_proto->pr_bindat(fd, so, nam, td); 1379 CURVNET_RESTORE(); 1380 return (error); 1381 } 1382 1383 /* 1384 * solisten() transitions a socket from a non-listening state to a listening 1385 * state, but can also be used to update the listen queue depth on an 1386 * existing listen socket. The protocol will call back into the sockets 1387 * layer using solisten_proto_check() and solisten_proto() to check and set 1388 * socket-layer listen state. Call backs are used so that the protocol can 1389 * acquire both protocol and socket layer locks in whatever order is required 1390 * by the protocol. 1391 * 1392 * Protocol implementors are advised to hold the socket lock across the 1393 * socket-layer test and set to avoid races at the socket layer. 1394 */ 1395 int 1396 solisten(struct socket *so, int backlog, struct thread *td) 1397 { 1398 int error; 1399 1400 CURVNET_SET(so->so_vnet); 1401 error = so->so_proto->pr_listen(so, backlog, td); 1402 CURVNET_RESTORE(); 1403 return (error); 1404 } 1405 1406 /* 1407 * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in 1408 * order to interlock with socket I/O. 1409 */ 1410 int 1411 solisten_proto_check(struct socket *so) 1412 { 1413 SOCK_LOCK_ASSERT(so); 1414 1415 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1416 SS_ISDISCONNECTING)) != 0) 1417 return (EINVAL); 1418 1419 /* 1420 * Sleeping is not permitted here, so simply fail if userspace is 1421 * attempting to transmit or receive on the socket. This kind of 1422 * transient failure is not ideal, but it should occur only if userspace 1423 * is misusing the socket interfaces. 
1424 */ 1425 if (!sx_try_xlock(&so->so_snd_sx)) 1426 return (EAGAIN); 1427 if (!sx_try_xlock(&so->so_rcv_sx)) { 1428 sx_xunlock(&so->so_snd_sx); 1429 return (EAGAIN); 1430 } 1431 mtx_lock(&so->so_snd_mtx); 1432 mtx_lock(&so->so_rcv_mtx); 1433 1434 /* Interlock with soo_aio_queue() and KTLS. */ 1435 if (!SOLISTENING(so)) { 1436 bool ktls; 1437 1438 #ifdef KERN_TLS 1439 ktls = so->so_snd.sb_tls_info != NULL || 1440 so->so_rcv.sb_tls_info != NULL; 1441 #else 1442 ktls = false; 1443 #endif 1444 if (ktls || 1445 (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || 1446 (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { 1447 solisten_proto_abort(so); 1448 return (EINVAL); 1449 } 1450 } 1451 1452 return (0); 1453 } 1454 1455 /* 1456 * Undo the setup done by solisten_proto_check(). 1457 */ 1458 void 1459 solisten_proto_abort(struct socket *so) 1460 { 1461 mtx_unlock(&so->so_snd_mtx); 1462 mtx_unlock(&so->so_rcv_mtx); 1463 sx_xunlock(&so->so_snd_sx); 1464 sx_xunlock(&so->so_rcv_sx); 1465 } 1466 1467 void 1468 solisten_proto(struct socket *so, int backlog) 1469 { 1470 int sbrcv_lowat, sbsnd_lowat; 1471 u_int sbrcv_hiwat, sbsnd_hiwat; 1472 short sbrcv_flags, sbsnd_flags; 1473 sbintime_t sbrcv_timeo, sbsnd_timeo; 1474 1475 SOCK_LOCK_ASSERT(so); 1476 KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1477 SS_ISDISCONNECTING)) == 0, 1478 ("%s: bad socket state %p", __func__, so)); 1479 1480 if (SOLISTENING(so)) 1481 goto listening; 1482 1483 /* 1484 * Change this socket to listening state. 
1485 */ 1486 sbrcv_lowat = so->so_rcv.sb_lowat; 1487 sbsnd_lowat = so->so_snd.sb_lowat; 1488 sbrcv_hiwat = so->so_rcv.sb_hiwat; 1489 sbsnd_hiwat = so->so_snd.sb_hiwat; 1490 sbrcv_flags = so->so_rcv.sb_flags; 1491 sbsnd_flags = so->so_snd.sb_flags; 1492 sbrcv_timeo = so->so_rcv.sb_timeo; 1493 sbsnd_timeo = so->so_snd.sb_timeo; 1494 1495 #ifdef MAC 1496 mac_socketpeer_label_free(so->so_peerlabel); 1497 #endif 1498 1499 if (!(so->so_proto->pr_flags & PR_SOCKBUF)) { 1500 sbdestroy(so, SO_SND); 1501 sbdestroy(so, SO_RCV); 1502 } 1503 1504 #ifdef INVARIANTS 1505 bzero(&so->so_rcv, 1506 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 1507 #endif 1508 1509 so->sol_sbrcv_lowat = sbrcv_lowat; 1510 so->sol_sbsnd_lowat = sbsnd_lowat; 1511 so->sol_sbrcv_hiwat = sbrcv_hiwat; 1512 so->sol_sbsnd_hiwat = sbsnd_hiwat; 1513 so->sol_sbrcv_flags = sbrcv_flags; 1514 so->sol_sbsnd_flags = sbsnd_flags; 1515 so->sol_sbrcv_timeo = sbrcv_timeo; 1516 so->sol_sbsnd_timeo = sbsnd_timeo; 1517 1518 so->sol_qlen = so->sol_incqlen = 0; 1519 TAILQ_INIT(&so->sol_incomp); 1520 TAILQ_INIT(&so->sol_comp); 1521 1522 so->sol_accept_filter = NULL; 1523 so->sol_accept_filter_arg = NULL; 1524 so->sol_accept_filter_str = NULL; 1525 1526 so->sol_upcall = NULL; 1527 so->sol_upcallarg = NULL; 1528 1529 so->so_options |= SO_ACCEPTCONN; 1530 1531 listening: 1532 if (backlog < 0 || backlog > V_somaxconn) 1533 backlog = V_somaxconn; 1534 so->sol_qlimit = backlog; 1535 1536 mtx_unlock(&so->so_snd_mtx); 1537 mtx_unlock(&so->so_rcv_mtx); 1538 sx_xunlock(&so->so_snd_sx); 1539 sx_xunlock(&so->so_rcv_sx); 1540 } 1541 1542 /* 1543 * Wakeup listeners/subsystems once we have a complete connection. 1544 * Enters with lock, returns unlocked. 
1545 */ 1546 void 1547 solisten_wakeup(struct socket *sol) 1548 { 1549 1550 if (sol->sol_upcall != NULL) 1551 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 1552 else { 1553 selwakeuppri(&sol->so_rdsel, PSOCK); 1554 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 1555 } 1556 SOLISTEN_UNLOCK(sol); 1557 wakeup_one(&sol->sol_comp); 1558 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) 1559 pgsigio(&sol->so_sigio, SIGIO, 0); 1560 } 1561 1562 /* 1563 * Return single connection off a listening socket queue. Main consumer of 1564 * the function is kern_accept4(). Some modules, that do their own accept 1565 * management also use the function. The socket reference held by the 1566 * listen queue is handed to the caller. 1567 * 1568 * Listening socket must be locked on entry and is returned unlocked on 1569 * return. 1570 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 1571 */ 1572 int 1573 solisten_dequeue(struct socket *head, struct socket **ret, int flags) 1574 { 1575 struct socket *so; 1576 int error; 1577 1578 SOLISTEN_LOCK_ASSERT(head); 1579 1580 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && 1581 head->so_error == 0) { 1582 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, 1583 "accept", 0); 1584 if (error != 0) { 1585 SOLISTEN_UNLOCK(head); 1586 return (error); 1587 } 1588 } 1589 if (head->so_error) { 1590 error = head->so_error; 1591 head->so_error = 0; 1592 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) 1593 error = EWOULDBLOCK; 1594 else 1595 error = 0; 1596 if (error) { 1597 SOLISTEN_UNLOCK(head); 1598 return (error); 1599 } 1600 so = TAILQ_FIRST(&head->sol_comp); 1601 SOCK_LOCK(so); 1602 KASSERT(so->so_qstate == SQ_COMP, 1603 ("%s: so %p not SQ_COMP", __func__, so)); 1604 head->sol_qlen--; 1605 so->so_qstate = SQ_NONE; 1606 so->so_listen = NULL; 1607 TAILQ_REMOVE(&head->sol_comp, so, so_list); 1608 if (flags & ACCEPT4_INHERIT) 1609 so->so_state |= (head->so_state & 
SS_NBIO); 1610 else 1611 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 1612 SOCK_UNLOCK(so); 1613 sorele_locked(head); 1614 1615 *ret = so; 1616 return (0); 1617 } 1618 1619 static struct so_splice * 1620 so_splice_alloc(off_t max) 1621 { 1622 struct so_splice *sp; 1623 1624 sp = uma_zalloc(splice_zone, M_WAITOK); 1625 sp->src = NULL; 1626 sp->dst = NULL; 1627 sp->max = max > 0 ? max : -1; 1628 do { 1629 sp->wq_index = atomic_fetchadd_32(&splice_index, 1) % 1630 (mp_maxid + 1); 1631 } while (CPU_ABSENT(sp->wq_index)); 1632 sp->state = SPLICE_INIT; 1633 TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout, 1634 sp); 1635 return (sp); 1636 } 1637 1638 static void 1639 so_splice_free(struct so_splice *sp) 1640 { 1641 KASSERT(sp->state == SPLICE_CLOSED, 1642 ("so_splice_free: sp %p not closed", sp)); 1643 uma_zfree(splice_zone, sp); 1644 } 1645 1646 static void 1647 so_splice_timeout(void *arg, int pending __unused) 1648 { 1649 struct so_splice *sp; 1650 1651 sp = arg; 1652 (void)so_unsplice(sp->src, true); 1653 } 1654 1655 /* 1656 * Splice the output from so to the input of so2. 1657 */ 1658 static int 1659 so_splice(struct socket *so, struct socket *so2, struct splice *splice) 1660 { 1661 struct so_splice *sp; 1662 int error; 1663 1664 if (splice->sp_max < 0) 1665 return (EINVAL); 1666 /* Handle only TCP for now; TODO: other streaming protos */ 1667 if (so->so_proto->pr_protocol != IPPROTO_TCP || 1668 so2->so_proto->pr_protocol != IPPROTO_TCP) 1669 return (EPROTONOSUPPORT); 1670 if (so->so_vnet != so2->so_vnet) 1671 return (EINVAL); 1672 1673 /* so_splice_xfer() assumes that we're using these implementations. 
*/ 1674 KASSERT(so->so_proto->pr_sosend == sosend_generic, 1675 ("so_splice: sosend not sosend_generic")); 1676 KASSERT(so2->so_proto->pr_soreceive == soreceive_generic || 1677 so2->so_proto->pr_soreceive == soreceive_stream, 1678 ("so_splice: soreceive not soreceive_generic/stream")); 1679 1680 sp = so_splice_alloc(splice->sp_max); 1681 so->so_splice_sent = 0; 1682 sp->src = so; 1683 sp->dst = so2; 1684 1685 error = 0; 1686 SOCK_LOCK(so); 1687 if (SOLISTENING(so)) 1688 error = EINVAL; 1689 else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) 1690 error = ENOTCONN; 1691 else if (so->so_splice != NULL) 1692 error = EBUSY; 1693 if (error != 0) { 1694 SOCK_UNLOCK(so); 1695 uma_zfree(splice_zone, sp); 1696 return (error); 1697 } 1698 SOCK_RECVBUF_LOCK(so); 1699 if (so->so_rcv.sb_tls_info != NULL) { 1700 SOCK_RECVBUF_UNLOCK(so); 1701 SOCK_UNLOCK(so); 1702 uma_zfree(splice_zone, sp); 1703 return (EINVAL); 1704 } 1705 so->so_rcv.sb_flags |= SB_SPLICED; 1706 so->so_splice = sp; 1707 soref(so); 1708 SOCK_RECVBUF_UNLOCK(so); 1709 SOCK_UNLOCK(so); 1710 1711 error = 0; 1712 SOCK_LOCK(so2); 1713 if (SOLISTENING(so2)) 1714 error = EINVAL; 1715 else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) 1716 error = ENOTCONN; 1717 else if (so2->so_splice_back != NULL) 1718 error = EBUSY; 1719 if (error != 0) { 1720 SOCK_UNLOCK(so2); 1721 mtx_lock(&sp->mtx); 1722 sp->dst = NULL; 1723 sp->state = SPLICE_EXCEPTION; 1724 mtx_unlock(&sp->mtx); 1725 so_unsplice(so, false); 1726 return (error); 1727 } 1728 SOCK_SENDBUF_LOCK(so2); 1729 if (so->so_snd.sb_tls_info != NULL) { 1730 SOCK_SENDBUF_UNLOCK(so2); 1731 SOCK_UNLOCK(so2); 1732 mtx_lock(&sp->mtx); 1733 sp->dst = NULL; 1734 sp->state = SPLICE_EXCEPTION; 1735 mtx_unlock(&sp->mtx); 1736 so_unsplice(so, false); 1737 return (EINVAL); 1738 } 1739 so2->so_snd.sb_flags |= SB_SPLICED; 1740 so2->so_splice_back = sp; 1741 soref(so2); 1742 mtx_lock(&sp->mtx); 1743 SOCK_SENDBUF_UNLOCK(so2); 1744 SOCK_UNLOCK(so2); 1745 1746 if 
(splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) { 1747 taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout, 1748 tvtosbt(splice->sp_idle), 0, C_PREL(4)); 1749 } 1750 1751 /* 1752 * Transfer any data already present in the socket buffer. 1753 */ 1754 KASSERT(sp->state == SPLICE_INIT, 1755 ("so_splice: splice %p state %d", sp, sp->state)); 1756 sp->state = SPLICE_QUEUED; 1757 so_splice_xfer(sp); 1758 return (0); 1759 } 1760 1761 static int 1762 so_unsplice(struct socket *so, bool timeout) 1763 { 1764 struct socket *so2; 1765 struct so_splice *sp; 1766 bool drain, so2rele; 1767 1768 /* 1769 * First unset SB_SPLICED and hide the splice structure so that 1770 * wakeup routines will stop enqueuing work. This also ensures that 1771 * a only a single thread will proceed with the unsplice. 1772 */ 1773 SOCK_LOCK(so); 1774 if (SOLISTENING(so)) { 1775 SOCK_UNLOCK(so); 1776 return (EINVAL); 1777 } 1778 SOCK_RECVBUF_LOCK(so); 1779 if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) { 1780 SOCK_RECVBUF_UNLOCK(so); 1781 SOCK_UNLOCK(so); 1782 return (ENOTCONN); 1783 } 1784 sp = so->so_splice; 1785 mtx_lock(&sp->mtx); 1786 if (sp->state == SPLICE_INIT) { 1787 /* 1788 * A splice is in the middle of being set up. 
1789 */ 1790 mtx_unlock(&sp->mtx); 1791 SOCK_RECVBUF_UNLOCK(so); 1792 SOCK_UNLOCK(so); 1793 return (ENOTCONN); 1794 } 1795 mtx_unlock(&sp->mtx); 1796 so->so_rcv.sb_flags &= ~SB_SPLICED; 1797 so->so_splice = NULL; 1798 SOCK_RECVBUF_UNLOCK(so); 1799 SOCK_UNLOCK(so); 1800 1801 so2 = sp->dst; 1802 if (so2 != NULL) { 1803 SOCK_LOCK(so2); 1804 KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__)); 1805 SOCK_SENDBUF_LOCK(so2); 1806 KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0, 1807 ("%s: so2 is not spliced", __func__)); 1808 KASSERT(so2->so_splice_back == sp, 1809 ("%s: so_splice_back != sp", __func__)); 1810 so2->so_snd.sb_flags &= ~SB_SPLICED; 1811 so2rele = so2->so_splice_back != NULL; 1812 so2->so_splice_back = NULL; 1813 SOCK_SENDBUF_UNLOCK(so2); 1814 SOCK_UNLOCK(so2); 1815 } 1816 1817 /* 1818 * No new work is being enqueued. The worker thread might be 1819 * splicing data right now, in which case we want to wait for it to 1820 * finish before proceeding. 1821 */ 1822 mtx_lock(&sp->mtx); 1823 switch (sp->state) { 1824 case SPLICE_QUEUED: 1825 case SPLICE_RUNNING: 1826 sp->state = SPLICE_CLOSING; 1827 while (sp->state == SPLICE_CLOSING) 1828 msleep(sp, &sp->mtx, PSOCK, "unsplice", 0); 1829 break; 1830 case SPLICE_INIT: 1831 case SPLICE_IDLE: 1832 case SPLICE_EXCEPTION: 1833 sp->state = SPLICE_CLOSED; 1834 break; 1835 default: 1836 __assert_unreachable(); 1837 } 1838 if (!timeout) { 1839 drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout, 1840 NULL) != 0; 1841 } else { 1842 drain = false; 1843 } 1844 mtx_unlock(&sp->mtx); 1845 if (drain) 1846 taskqueue_drain_timeout(taskqueue_thread, &sp->timeout); 1847 1848 /* 1849 * Now we hold the sole reference to the splice structure. 1850 * Clean up: signal userspace and release socket references. 
1851 */ 1852 sorwakeup(so); 1853 CURVNET_SET(so->so_vnet); 1854 sorele(so); 1855 if (so2 != NULL) { 1856 sowwakeup(so2); 1857 if (so2rele) 1858 sorele(so2); 1859 } 1860 CURVNET_RESTORE(); 1861 so_splice_free(sp); 1862 return (0); 1863 } 1864 1865 /* 1866 * Free socket upon release of the very last reference. 1867 */ 1868 static void 1869 sofree(struct socket *so) 1870 { 1871 struct protosw *pr = so->so_proto; 1872 1873 SOCK_LOCK_ASSERT(so); 1874 KASSERT(refcount_load(&so->so_count) == 0, 1875 ("%s: so %p has references", __func__, so)); 1876 KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, 1877 ("%s: so %p is on listen queue", __func__, so)); 1878 KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0, 1879 ("%s: so %p rcvbuf is spliced", __func__, so)); 1880 KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0, 1881 ("%s: so %p sndbuf is spliced", __func__, so)); 1882 KASSERT(so->so_splice == NULL && so->so_splice_back == NULL, 1883 ("%s: so %p has spliced data", __func__, so)); 1884 1885 SOCK_UNLOCK(so); 1886 1887 if (so->so_dtor != NULL) 1888 so->so_dtor(so); 1889 1890 VNET_SO_ASSERT(so); 1891 if (pr->pr_detach != NULL) 1892 pr->pr_detach(so); 1893 1894 if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { 1895 /* 1896 * From this point on, we assume that no other references to 1897 * this socket exist anywhere else in the stack. Therefore, 1898 * no locks need to be acquired or held. 
1899 */ 1900 #ifdef INVARIANTS 1901 SOCK_SENDBUF_LOCK(so); 1902 SOCK_RECVBUF_LOCK(so); 1903 #endif 1904 sbdestroy(so, SO_SND); 1905 sbdestroy(so, SO_RCV); 1906 #ifdef INVARIANTS 1907 SOCK_SENDBUF_UNLOCK(so); 1908 SOCK_RECVBUF_UNLOCK(so); 1909 #endif 1910 mtx_destroy(&so->so_snd_mtx); 1911 mtx_destroy(&so->so_rcv_mtx); 1912 } 1913 seldrain(&so->so_rdsel); 1914 seldrain(&so->so_wrsel); 1915 knlist_destroy(&so->so_rdsel.si_note); 1916 knlist_destroy(&so->so_wrsel.si_note); 1917 sodealloc(so); 1918 } 1919 1920 /* 1921 * Release a reference on a socket while holding the socket lock. 1922 * Unlocks the socket lock before returning. 1923 */ 1924 void 1925 sorele_locked(struct socket *so) 1926 { 1927 SOCK_LOCK_ASSERT(so); 1928 if (refcount_release(&so->so_count)) 1929 sofree(so); 1930 else 1931 SOCK_UNLOCK(so); 1932 } 1933 1934 /* 1935 * Close a socket on last file table reference removal. Initiate disconnect 1936 * if connected. Free socket when disconnect complete. 1937 * 1938 * This function will sorele() the socket. Note that soclose() may be called 1939 * prior to the ref count reaching zero. The actual socket structure will 1940 * not be freed until the ref count reaches zero. 
*/
int
soclose(struct socket *so)
{
	struct accept_queue lqueue;
	int error = 0;
	bool listening, last __diagused;

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				/* Not being connected is not a close error. */
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}

		if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
			/* A non-blocking socket does not wait out the linger. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Sleep until the connection is gone or tsleep()
			 * returns an error; that error becomes the return
			 * value of soclose().
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_close != NULL)
		so->so_proto->pr_close(so);

	SOCK_LOCK(so);
	if ((listening = SOLISTENING(so))) {
		struct socket *sp;

		/*
		 * Move both the incomplete and the complete queue onto a
		 * local list, so the queued sockets can be aborted after
		 * the socket lock has been dropped.
		 */
		TAILQ_INIT(&lqueue);
		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);

		so->sol_qlen = so->sol_incqlen = 0;

		TAILQ_FOREACH(sp, &lqueue, so_list) {
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			/*
			 * Drop the reference this queued socket held on the
			 * listener.  It can not be the last one: the
			 * reference released by sorele_locked() below is
			 * still held.
			 */
			last = refcount_release(&so->so_count);
			KASSERT(!last, ("%s: released last reference for %p",
			    __func__, so));
		}
	}
	/* Releases the socket lock, and frees the socket on the last ref. */
	sorele_locked(so);
	if (listening) {
		struct socket *sp, *tsp;

		/* The _SAFE variant is used as soabort() may free sp. */
		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp)
			soabort(sp);
	}
	CURVNET_RESTORE();
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
2014 * 2015 * This interface is tricky, because it is called on an unreferenced socket, 2016 * and must be called only by a thread that has actually removed the socket 2017 * from the listen queue it was on. Likely this thread holds the last 2018 * reference on the socket and soabort() will proceed with sofree(). But 2019 * it might be not the last, as the sockets on the listen queues are seen 2020 * from the protocol side. 2021 * 2022 * This interface will call into the protocol code, so must not be called 2023 * with any socket locks held. Protocols do call it while holding their own 2024 * recursible protocol mutexes, but this is something that should be subject 2025 * to review in the future. 2026 * 2027 * Usually socket should have a single reference left, but this is not a 2028 * requirement. In the past, when we have had named references for file 2029 * descriptor and protocol, we asserted that none of them are being held. 2030 */ 2031 void 2032 soabort(struct socket *so) 2033 { 2034 2035 VNET_SO_ASSERT(so); 2036 2037 if (so->so_proto->pr_abort != NULL) 2038 so->so_proto->pr_abort(so); 2039 SOCK_LOCK(so); 2040 sorele_locked(so); 2041 } 2042 2043 int 2044 soaccept(struct socket *so, struct sockaddr *sa) 2045 { 2046 #ifdef INVARIANTS 2047 u_char len = sa->sa_len; 2048 #endif 2049 int error; 2050 2051 CURVNET_SET(so->so_vnet); 2052 error = so->so_proto->pr_accept(so, sa); 2053 KASSERT(sa->sa_len <= len, 2054 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2055 CURVNET_RESTORE(); 2056 return (error); 2057 } 2058 2059 int 2060 sopeeraddr(struct socket *so, struct sockaddr *sa) 2061 { 2062 #ifdef INVARIANTS 2063 u_char len = sa->sa_len; 2064 #endif 2065 int error; 2066 2067 CURVNET_ASSERT_SET(); 2068 2069 error = so->so_proto->pr_peeraddr(so, sa); 2070 KASSERT(sa->sa_len <= len, 2071 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2072 2073 return (error); 2074 } 2075 2076 int 2077 sosockaddr(struct socket *so, struct sockaddr *sa) 
2078 { 2079 #ifdef INVARIANTS 2080 u_char len = sa->sa_len; 2081 #endif 2082 int error; 2083 2084 CURVNET_SET(so->so_vnet); 2085 error = so->so_proto->pr_sockaddr(so, sa); 2086 KASSERT(sa->sa_len <= len, 2087 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2088 CURVNET_RESTORE(); 2089 2090 return (error); 2091 } 2092 2093 int 2094 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 2095 { 2096 2097 return (soconnectat(AT_FDCWD, so, nam, td)); 2098 } 2099 2100 int 2101 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 2102 { 2103 int error; 2104 2105 CURVNET_SET(so->so_vnet); 2106 2107 /* 2108 * If protocol is connection-based, can only connect once. 2109 * Otherwise, if connected, try to disconnect first. This allows 2110 * user to disconnect by connecting to, e.g., a null address. 2111 * 2112 * Note, this check is racy and may need to be re-evaluated at the 2113 * protocol layer. 2114 */ 2115 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 2116 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 2117 (error = sodisconnect(so)))) { 2118 error = EISCONN; 2119 } else { 2120 /* 2121 * Prevent accumulated error from previous connection from 2122 * biting us. 
2123 */ 2124 so->so_error = 0; 2125 if (fd == AT_FDCWD) { 2126 error = so->so_proto->pr_connect(so, nam, td); 2127 } else { 2128 error = so->so_proto->pr_connectat(fd, so, nam, td); 2129 } 2130 } 2131 CURVNET_RESTORE(); 2132 2133 return (error); 2134 } 2135 2136 int 2137 soconnect2(struct socket *so1, struct socket *so2) 2138 { 2139 int error; 2140 2141 CURVNET_SET(so1->so_vnet); 2142 error = so1->so_proto->pr_connect2(so1, so2); 2143 CURVNET_RESTORE(); 2144 return (error); 2145 } 2146 2147 int 2148 sodisconnect(struct socket *so) 2149 { 2150 int error; 2151 2152 if ((so->so_state & SS_ISCONNECTED) == 0) 2153 return (ENOTCONN); 2154 if (so->so_state & SS_ISDISCONNECTING) 2155 return (EALREADY); 2156 VNET_SO_ASSERT(so); 2157 error = so->so_proto->pr_disconnect(so); 2158 return (error); 2159 } 2160 2161 int 2162 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 2163 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2164 { 2165 long space; 2166 ssize_t resid; 2167 int clen = 0, error, dontroute; 2168 2169 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 2170 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 2171 ("sosend_dgram: !PR_ATOMIC")); 2172 2173 if (uio != NULL) 2174 resid = uio->uio_resid; 2175 else 2176 resid = top->m_pkthdr.len; 2177 /* 2178 * In theory resid should be unsigned. However, space must be 2179 * signed, as it might be less than 0 if we over-committed, and we 2180 * must use a signed comparison of space and resid. On the other 2181 * hand, a negative resid causes us to loop sending 0-length 2182 * segments to the protocol. 
2183 */ 2184 if (resid < 0) { 2185 error = EINVAL; 2186 goto out; 2187 } 2188 2189 dontroute = 2190 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 2191 if (td != NULL) 2192 td->td_ru.ru_msgsnd++; 2193 if (control != NULL) 2194 clen = control->m_len; 2195 2196 SOCKBUF_LOCK(&so->so_snd); 2197 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2198 SOCKBUF_UNLOCK(&so->so_snd); 2199 error = EPIPE; 2200 goto out; 2201 } 2202 if (so->so_error) { 2203 error = so->so_error; 2204 so->so_error = 0; 2205 SOCKBUF_UNLOCK(&so->so_snd); 2206 goto out; 2207 } 2208 if ((so->so_state & SS_ISCONNECTED) == 0) { 2209 /* 2210 * `sendto' and `sendmsg' is allowed on a connection-based 2211 * socket if it supports implied connect. Return ENOTCONN if 2212 * not connected and no address is supplied. 2213 */ 2214 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2215 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2216 if (!(resid == 0 && clen != 0)) { 2217 SOCKBUF_UNLOCK(&so->so_snd); 2218 error = ENOTCONN; 2219 goto out; 2220 } 2221 } else if (addr == NULL) { 2222 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2223 error = ENOTCONN; 2224 else 2225 error = EDESTADDRREQ; 2226 SOCKBUF_UNLOCK(&so->so_snd); 2227 goto out; 2228 } 2229 } 2230 2231 /* 2232 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 2233 * problem and need fixing. 2234 */ 2235 space = sbspace(&so->so_snd); 2236 if (flags & MSG_OOB) 2237 space += 1024; 2238 space -= clen; 2239 SOCKBUF_UNLOCK(&so->so_snd); 2240 if (resid > space) { 2241 error = EMSGSIZE; 2242 goto out; 2243 } 2244 if (uio == NULL) { 2245 resid = 0; 2246 if (flags & MSG_EOR) 2247 top->m_flags |= M_EOR; 2248 } else { 2249 /* 2250 * Copy the data from userland into a mbuf chain. 2251 * If no data is to be copied in, a single empty mbuf 2252 * is returned. 2253 */ 2254 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 2255 (M_PKTHDR | ((flags & MSG_EOR) ? 
M_EOR : 0))); 2256 if (top == NULL) { 2257 error = EFAULT; /* only possible error */ 2258 goto out; 2259 } 2260 space -= resid - uio->uio_resid; 2261 resid = uio->uio_resid; 2262 } 2263 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 2264 /* 2265 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 2266 * than with. 2267 */ 2268 if (dontroute) { 2269 SOCK_LOCK(so); 2270 so->so_options |= SO_DONTROUTE; 2271 SOCK_UNLOCK(so); 2272 } 2273 /* 2274 * XXX all the SBS_CANTSENDMORE checks previously done could be out 2275 * of date. We could have received a reset packet in an interrupt or 2276 * maybe we slept while doing page faults in uiomove() etc. We could 2277 * probably recheck again inside the locking protection here, but 2278 * there are probably other places that this also happens. We must 2279 * rethink this. 2280 */ 2281 VNET_SO_ASSERT(so); 2282 error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : 2283 /* 2284 * If the user set MSG_EOF, the protocol understands this flag and 2285 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 2286 */ 2287 ((flags & MSG_EOF) && 2288 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2289 (resid <= 0)) ? 2290 PRUS_EOF : 2291 /* If there is more to send set PRUS_MORETOCOME */ 2292 (flags & MSG_MORETOCOME) || 2293 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 2294 top, addr, control, td); 2295 if (dontroute) { 2296 SOCK_LOCK(so); 2297 so->so_options &= ~SO_DONTROUTE; 2298 SOCK_UNLOCK(so); 2299 } 2300 clen = 0; 2301 control = NULL; 2302 top = NULL; 2303 out: 2304 if (top != NULL) 2305 m_freem(top); 2306 if (control != NULL) 2307 m_freem(control); 2308 return (error); 2309 } 2310 2311 /* 2312 * Send on a socket. If send must go all at once and message is larger than 2313 * send buffering, then hard error. Lock against other senders. If must go 2314 * all at once and not enough room now, then inform user that this would 2315 * block and do nothing. 
Otherwise, if nonblocking, send as much as 2316 * possible. The data to be sent is described by "uio" if nonzero, otherwise 2317 * by the mbuf chain "top" (which must be null if uio is not). Data provided 2318 * in mbuf chain must be small enough to send all at once. 2319 * 2320 * Returns nonzero on error, timeout or signal; callers must check for short 2321 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 2322 * on return. 2323 */ 2324 static int 2325 sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, 2326 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2327 { 2328 long space; 2329 ssize_t resid; 2330 int clen = 0, error, dontroute; 2331 int atomic = sosendallatonce(so) || top; 2332 int pr_send_flag; 2333 #ifdef KERN_TLS 2334 struct ktls_session *tls; 2335 int tls_enq_cnt, tls_send_flag; 2336 uint8_t tls_rtype; 2337 2338 tls = NULL; 2339 tls_rtype = TLS_RLTYPE_APP; 2340 #endif 2341 2342 SOCK_IO_SEND_ASSERT_LOCKED(so); 2343 2344 if (uio != NULL) 2345 resid = uio->uio_resid; 2346 else if ((top->m_flags & M_PKTHDR) != 0) 2347 resid = top->m_pkthdr.len; 2348 else 2349 resid = m_length(top, NULL); 2350 /* 2351 * In theory resid should be unsigned. However, space must be 2352 * signed, as it might be less than 0 if we over-committed, and we 2353 * must use a signed comparison of space and resid. On the other 2354 * hand, a negative resid causes us to loop sending 0-length 2355 * segments to the protocol. 2356 * 2357 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 2358 * type sockets since that's an error. 
2359 */ 2360 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 2361 error = EINVAL; 2362 goto out; 2363 } 2364 2365 dontroute = 2366 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 2367 (so->so_proto->pr_flags & PR_ATOMIC); 2368 if (td != NULL) 2369 td->td_ru.ru_msgsnd++; 2370 if (control != NULL) 2371 clen = control->m_len; 2372 2373 #ifdef KERN_TLS 2374 tls_send_flag = 0; 2375 tls = ktls_hold(so->so_snd.sb_tls_info); 2376 if (tls != NULL) { 2377 if (tls->mode == TCP_TLS_MODE_SW) 2378 tls_send_flag = PRUS_NOTREADY; 2379 2380 if (control != NULL) { 2381 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 2382 2383 if (clen >= sizeof(*cm) && 2384 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 2385 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 2386 clen = 0; 2387 m_freem(control); 2388 control = NULL; 2389 atomic = 1; 2390 } 2391 } 2392 2393 if (resid == 0 && !ktls_permit_empty_frames(tls)) { 2394 error = EINVAL; 2395 goto out; 2396 } 2397 } 2398 #endif 2399 2400 restart: 2401 do { 2402 SOCKBUF_LOCK(&so->so_snd); 2403 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2404 SOCKBUF_UNLOCK(&so->so_snd); 2405 error = EPIPE; 2406 goto out; 2407 } 2408 if (so->so_error) { 2409 error = so->so_error; 2410 so->so_error = 0; 2411 SOCKBUF_UNLOCK(&so->so_snd); 2412 goto out; 2413 } 2414 if ((so->so_state & SS_ISCONNECTED) == 0) { 2415 /* 2416 * `sendto' and `sendmsg' is allowed on a connection- 2417 * based socket if it supports implied connect. 2418 * Return ENOTCONN if not connected and no address is 2419 * supplied. 
2420 */ 2421 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2422 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2423 if (!(resid == 0 && clen != 0)) { 2424 SOCKBUF_UNLOCK(&so->so_snd); 2425 error = ENOTCONN; 2426 goto out; 2427 } 2428 } else if (addr == NULL) { 2429 SOCKBUF_UNLOCK(&so->so_snd); 2430 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2431 error = ENOTCONN; 2432 else 2433 error = EDESTADDRREQ; 2434 goto out; 2435 } 2436 } 2437 space = sbspace(&so->so_snd); 2438 if (flags & MSG_OOB) 2439 space += 1024; 2440 if ((atomic && resid > so->so_snd.sb_hiwat) || 2441 clen > so->so_snd.sb_hiwat) { 2442 SOCKBUF_UNLOCK(&so->so_snd); 2443 error = EMSGSIZE; 2444 goto out; 2445 } 2446 if (space < resid + clen && 2447 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 2448 if ((so->so_state & SS_NBIO) || 2449 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 2450 SOCKBUF_UNLOCK(&so->so_snd); 2451 error = EWOULDBLOCK; 2452 goto out; 2453 } 2454 error = sbwait(so, SO_SND); 2455 SOCKBUF_UNLOCK(&so->so_snd); 2456 if (error) 2457 goto out; 2458 goto restart; 2459 } 2460 SOCKBUF_UNLOCK(&so->so_snd); 2461 space -= clen; 2462 do { 2463 if (uio == NULL) { 2464 resid = 0; 2465 if (flags & MSG_EOR) 2466 top->m_flags |= M_EOR; 2467 #ifdef KERN_TLS 2468 if (tls != NULL) { 2469 ktls_frame(top, tls, &tls_enq_cnt, 2470 tls_rtype); 2471 tls_rtype = TLS_RLTYPE_APP; 2472 } 2473 #endif 2474 } else { 2475 /* 2476 * Copy the data from userland into a mbuf 2477 * chain. If resid is 0, which can happen 2478 * only if we have control to send, then 2479 * a single empty mbuf is returned. This 2480 * is a workaround to prevent protocol send 2481 * methods to panic. 2482 */ 2483 #ifdef KERN_TLS 2484 if (tls != NULL) { 2485 top = m_uiotombuf(uio, M_WAITOK, space, 2486 tls->params.max_frame_len, 2487 M_EXTPG | 2488 ((flags & MSG_EOR) ? 
M_EOR : 0)); 2489 if (top != NULL) { 2490 ktls_frame(top, tls, 2491 &tls_enq_cnt, tls_rtype); 2492 } 2493 tls_rtype = TLS_RLTYPE_APP; 2494 } else 2495 #endif 2496 top = m_uiotombuf(uio, M_WAITOK, space, 2497 (atomic ? max_hdr : 0), 2498 (atomic ? M_PKTHDR : 0) | 2499 ((flags & MSG_EOR) ? M_EOR : 0)); 2500 if (top == NULL) { 2501 error = EFAULT; /* only possible error */ 2502 goto out; 2503 } 2504 space -= resid - uio->uio_resid; 2505 resid = uio->uio_resid; 2506 } 2507 if (dontroute) { 2508 SOCK_LOCK(so); 2509 so->so_options |= SO_DONTROUTE; 2510 SOCK_UNLOCK(so); 2511 } 2512 /* 2513 * XXX all the SBS_CANTSENDMORE checks previously 2514 * done could be out of date. We could have received 2515 * a reset packet in an interrupt or maybe we slept 2516 * while doing page faults in uiomove() etc. We 2517 * could probably recheck again inside the locking 2518 * protection here, but there are probably other 2519 * places that this also happens. We must rethink 2520 * this. 2521 */ 2522 VNET_SO_ASSERT(so); 2523 2524 pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : 2525 /* 2526 * If the user set MSG_EOF, the protocol understands 2527 * this flag and nothing left to send then use 2528 * PRU_SEND_EOF instead of PRU_SEND. 2529 */ 2530 ((flags & MSG_EOF) && 2531 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2532 (resid <= 0)) ? 2533 PRUS_EOF : 2534 /* If there is more to send set PRUS_MORETOCOME. */ 2535 (flags & MSG_MORETOCOME) || 2536 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0; 2537 2538 #ifdef KERN_TLS 2539 pr_send_flag |= tls_send_flag; 2540 #endif 2541 2542 error = so->so_proto->pr_send(so, pr_send_flag, top, 2543 addr, control, td); 2544 2545 if (dontroute) { 2546 SOCK_LOCK(so); 2547 so->so_options &= ~SO_DONTROUTE; 2548 SOCK_UNLOCK(so); 2549 } 2550 2551 #ifdef KERN_TLS 2552 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 2553 if (error != 0) { 2554 m_freem(top); 2555 top = NULL; 2556 } else { 2557 soref(so); 2558 ktls_enqueue(top, so, tls_enq_cnt); 2559 } 2560 } 2561 #endif 2562 clen = 0; 2563 control = NULL; 2564 top = NULL; 2565 if (error) 2566 goto out; 2567 } while (resid && space > 0); 2568 } while (resid); 2569 2570 out: 2571 #ifdef KERN_TLS 2572 if (tls != NULL) 2573 ktls_free(tls); 2574 #endif 2575 if (top != NULL) 2576 m_freem(top); 2577 if (control != NULL) 2578 m_freem(control); 2579 return (error); 2580 } 2581 2582 int 2583 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 2584 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2585 { 2586 int error; 2587 2588 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 2589 if (error) 2590 return (error); 2591 error = sosend_generic_locked(so, addr, uio, top, control, flags, td); 2592 SOCK_IO_SEND_UNLOCK(so); 2593 return (error); 2594 } 2595 2596 /* 2597 * Send to a socket from a kernel thread. 2598 * 2599 * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. 2600 * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs 2601 * to be set at all. This function should just boil down to a static inline 2602 * calling the protocol method. 
2603 */ 2604 int 2605 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2606 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2607 { 2608 int error; 2609 2610 CURVNET_SET(so->so_vnet); 2611 error = so->so_proto->pr_sosend(so, addr, uio, 2612 top, control, flags, td); 2613 CURVNET_RESTORE(); 2614 return (error); 2615 } 2616 2617 /* 2618 * send(2), write(2) or aio_write(2) on a socket. 2619 */ 2620 int 2621 sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2622 struct mbuf *control, int flags, struct proc *userproc) 2623 { 2624 struct thread *td; 2625 ssize_t len; 2626 int error; 2627 2628 td = uio->uio_td; 2629 len = uio->uio_resid; 2630 CURVNET_SET(so->so_vnet); 2631 error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, 2632 td); 2633 CURVNET_RESTORE(); 2634 if (error != 0) { 2635 /* 2636 * Clear transient errors for stream protocols if they made 2637 * some progress. Make exclusion for aio(4) that would 2638 * schedule a new write in case of EWOULDBLOCK and clear 2639 * error itself. See soaio_process_job(). 2640 */ 2641 if (uio->uio_resid != len && 2642 (so->so_proto->pr_flags & PR_ATOMIC) == 0 && 2643 userproc == NULL && 2644 (error == ERESTART || error == EINTR || 2645 error == EWOULDBLOCK)) 2646 error = 0; 2647 /* Generation of SIGPIPE can be controlled per socket. */ 2648 if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && 2649 (flags & MSG_NOSIGNAL) == 0) { 2650 if (userproc != NULL) { 2651 /* aio(4) job */ 2652 PROC_LOCK(userproc); 2653 kern_psignal(userproc, SIGPIPE); 2654 PROC_UNLOCK(userproc); 2655 } else { 2656 PROC_LOCK(td->td_proc); 2657 tdsignal(td, SIGPIPE); 2658 PROC_UNLOCK(td->td_proc); 2659 } 2660 } 2661 } 2662 return (error); 2663 } 2664 2665 /* 2666 * The part of soreceive() that implements reading non-inline out-of-band 2667 * data from a socket. For more complete comments, see soreceive(), from 2668 * which this code originated. 
2669 * 2670 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 2671 * unable to return an mbuf chain to the caller. 2672 */ 2673 static int 2674 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 2675 { 2676 struct protosw *pr = so->so_proto; 2677 struct mbuf *m; 2678 int error; 2679 2680 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 2681 VNET_SO_ASSERT(so); 2682 2683 m = m_get(M_WAITOK, MT_DATA); 2684 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 2685 if (error) 2686 goto bad; 2687 do { 2688 error = uiomove(mtod(m, void *), 2689 (int) min(uio->uio_resid, m->m_len), uio); 2690 m = m_free(m); 2691 } while (uio->uio_resid && error == 0 && m); 2692 bad: 2693 if (m != NULL) 2694 m_freem(m); 2695 return (error); 2696 } 2697 2698 /* 2699 * Following replacement or removal of the first mbuf on the first mbuf chain 2700 * of a socket buffer, push necessary state changes back into the socket 2701 * buffer so that other consumers see the values consistently. 'nextrecord' 2702 * is the callers locally stored value of the original value of 2703 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 2704 * NOTE: 'nextrecord' may be NULL. 2705 */ 2706 static __inline void 2707 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 2708 { 2709 2710 SOCKBUF_LOCK_ASSERT(sb); 2711 /* 2712 * First, update for the new value of nextrecord. If necessary, make 2713 * it the first record. 2714 */ 2715 if (sb->sb_mb != NULL) 2716 sb->sb_mb->m_nextpkt = nextrecord; 2717 else 2718 sb->sb_mb = nextrecord; 2719 2720 /* 2721 * Now update any dependent socket buffer fields to reflect the new 2722 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 2723 * addition of a second clause that takes care of the case where 2724 * sb_mb has been updated, but remains the last record. 
2725 */ 2726 if (sb->sb_mb == NULL) { 2727 sb->sb_mbtail = NULL; 2728 sb->sb_lastrecord = NULL; 2729 } else if (sb->sb_mb->m_nextpkt == NULL) 2730 sb->sb_lastrecord = sb->sb_mb; 2731 } 2732 2733 /* 2734 * Implement receive operations on a socket. We depend on the way that 2735 * records are added to the sockbuf by sbappend. In particular, each record 2736 * (mbufs linked through m_next) must begin with an address if the protocol 2737 * so specifies, followed by an optional mbuf or mbufs containing ancillary 2738 * data, and then zero or more mbufs of data. In order to allow parallelism 2739 * between network receive and copying to user space, as well as avoid 2740 * sleeping with a mutex held, we release the socket buffer mutex during the 2741 * user space copy. Although the sockbuf is locked, new data may still be 2742 * appended, and thus we must maintain consistency of the sockbuf during that 2743 * time. 2744 * 2745 * The caller may receive the data as a single mbuf chain by supplying an 2746 * mbuf **mp for use in returning the chain. The uio is then used only for 2747 * the count in uio_resid. 2748 */ 2749 static int 2750 soreceive_generic_locked(struct socket *so, struct sockaddr **psa, 2751 struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) 2752 { 2753 struct mbuf *m; 2754 int flags, error, offset; 2755 ssize_t len; 2756 struct protosw *pr = so->so_proto; 2757 struct mbuf *nextrecord; 2758 int moff, type = 0; 2759 ssize_t orig_resid = uio->uio_resid; 2760 bool report_real_len = false; 2761 2762 SOCK_IO_RECV_ASSERT_LOCKED(so); 2763 2764 error = 0; 2765 if (flagsp != NULL) { 2766 report_real_len = *flagsp & MSG_TRUNC; 2767 *flagsp &= ~MSG_TRUNC; 2768 flags = *flagsp &~ MSG_EOR; 2769 } else 2770 flags = 0; 2771 2772 restart: 2773 SOCKBUF_LOCK(&so->so_rcv); 2774 m = so->so_rcv.sb_mb; 2775 /* 2776 * If we have less data than requested, block awaiting more (subject 2777 * to any timeout) if: 2778 * 1. 
the current count is less than the low water mark, or 2779 * 2. MSG_DONTWAIT is not set 2780 */ 2781 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 2782 sbavail(&so->so_rcv) < uio->uio_resid) && 2783 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 2784 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 2785 KASSERT(m != NULL || !sbavail(&so->so_rcv), 2786 ("receive: m == %p sbavail == %u", 2787 m, sbavail(&so->so_rcv))); 2788 if (so->so_error || so->so_rerror) { 2789 if (m != NULL) 2790 goto dontblock; 2791 if (so->so_error) 2792 error = so->so_error; 2793 else 2794 error = so->so_rerror; 2795 if ((flags & MSG_PEEK) == 0) { 2796 if (so->so_error) 2797 so->so_error = 0; 2798 else 2799 so->so_rerror = 0; 2800 } 2801 SOCKBUF_UNLOCK(&so->so_rcv); 2802 goto release; 2803 } 2804 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2805 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2806 if (m != NULL) 2807 goto dontblock; 2808 #ifdef KERN_TLS 2809 else if (so->so_rcv.sb_tlsdcc == 0 && 2810 so->so_rcv.sb_tlscc == 0) { 2811 #else 2812 else { 2813 #endif 2814 SOCKBUF_UNLOCK(&so->so_rcv); 2815 goto release; 2816 } 2817 } 2818 for (; m != NULL; m = m->m_next) 2819 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2820 m = so->so_rcv.sb_mb; 2821 goto dontblock; 2822 } 2823 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2824 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2825 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2826 SOCKBUF_UNLOCK(&so->so_rcv); 2827 error = ENOTCONN; 2828 goto release; 2829 } 2830 if (uio->uio_resid == 0 && !report_real_len) { 2831 SOCKBUF_UNLOCK(&so->so_rcv); 2832 goto release; 2833 } 2834 if ((so->so_state & SS_NBIO) || 2835 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2836 SOCKBUF_UNLOCK(&so->so_rcv); 2837 error = EWOULDBLOCK; 2838 goto release; 2839 } 2840 SBLASTRECORDCHK(&so->so_rcv); 2841 SBLASTMBUFCHK(&so->so_rcv); 2842 error = sbwait(so, SO_RCV); 2843 SOCKBUF_UNLOCK(&so->so_rcv); 2844 if (error) 2845 goto release; 2846 goto restart; 2847 } 
2848 dontblock: 2849 /* 2850 * From this point onward, we maintain 'nextrecord' as a cache of the 2851 * pointer to the next record in the socket buffer. We must keep the 2852 * various socket buffer pointers and local stack versions of the 2853 * pointers in sync, pushing out modifications before dropping the 2854 * socket buffer mutex, and re-reading them when picking it up. 2855 * 2856 * Otherwise, we will race with the network stack appending new data 2857 * or records onto the socket buffer by using inconsistent/stale 2858 * versions of the field, possibly resulting in socket buffer 2859 * corruption. 2860 * 2861 * By holding the high-level sblock(), we prevent simultaneous 2862 * readers from pulling off the front of the socket buffer. 2863 */ 2864 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2865 if (uio->uio_td) 2866 uio->uio_td->td_ru.ru_msgrcv++; 2867 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2868 SBLASTRECORDCHK(&so->so_rcv); 2869 SBLASTMBUFCHK(&so->so_rcv); 2870 nextrecord = m->m_nextpkt; 2871 if (pr->pr_flags & PR_ADDR) { 2872 KASSERT(m->m_type == MT_SONAME, 2873 ("m->m_type == %d", m->m_type)); 2874 orig_resid = 0; 2875 if (psa != NULL) 2876 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2877 M_NOWAIT); 2878 if (flags & MSG_PEEK) { 2879 m = m->m_next; 2880 } else { 2881 sbfree(&so->so_rcv, m); 2882 so->so_rcv.sb_mb = m_free(m); 2883 m = so->so_rcv.sb_mb; 2884 sockbuf_pushsync(&so->so_rcv, nextrecord); 2885 } 2886 } 2887 2888 /* 2889 * Process one or more MT_CONTROL mbufs present before any data mbufs 2890 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 2891 * just copy the data; if !MSG_PEEK, we call into the protocol to 2892 * perform externalization (or freeing if controlp == NULL). 
2893 */ 2894 if (m != NULL && m->m_type == MT_CONTROL) { 2895 struct mbuf *cm = NULL, *cmn; 2896 struct mbuf **cme = &cm; 2897 #ifdef KERN_TLS 2898 struct cmsghdr *cmsg; 2899 struct tls_get_record tgr; 2900 2901 /* 2902 * For MSG_TLSAPPDATA, check for an alert record. 2903 * If found, return ENXIO without removing 2904 * it from the receive queue. This allows a subsequent 2905 * call without MSG_TLSAPPDATA to receive it. 2906 * Note that, for TLS, there should only be a single 2907 * control mbuf with the TLS_GET_RECORD message in it. 2908 */ 2909 if (flags & MSG_TLSAPPDATA) { 2910 cmsg = mtod(m, struct cmsghdr *); 2911 if (cmsg->cmsg_type == TLS_GET_RECORD && 2912 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2913 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2914 if (__predict_false(tgr.tls_type == 2915 TLS_RLTYPE_ALERT)) { 2916 SOCKBUF_UNLOCK(&so->so_rcv); 2917 error = ENXIO; 2918 goto release; 2919 } 2920 } 2921 } 2922 #endif 2923 2924 do { 2925 if (flags & MSG_PEEK) { 2926 if (controlp != NULL) { 2927 *controlp = m_copym(m, 0, m->m_len, 2928 M_NOWAIT); 2929 controlp = &(*controlp)->m_next; 2930 } 2931 m = m->m_next; 2932 } else { 2933 sbfree(&so->so_rcv, m); 2934 so->so_rcv.sb_mb = m->m_next; 2935 m->m_next = NULL; 2936 *cme = m; 2937 cme = &(*cme)->m_next; 2938 m = so->so_rcv.sb_mb; 2939 } 2940 } while (m != NULL && m->m_type == MT_CONTROL); 2941 if ((flags & MSG_PEEK) == 0) 2942 sockbuf_pushsync(&so->so_rcv, nextrecord); 2943 while (cm != NULL) { 2944 cmn = cm->m_next; 2945 cm->m_next = NULL; 2946 if (controlp != NULL) 2947 *controlp = cm; 2948 else 2949 m_freem(cm); 2950 if (controlp != NULL) { 2951 while (*controlp != NULL) 2952 controlp = &(*controlp)->m_next; 2953 } 2954 cm = cmn; 2955 } 2956 if (m != NULL) 2957 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 2958 else 2959 nextrecord = so->so_rcv.sb_mb; 2960 orig_resid = 0; 2961 } 2962 if (m != NULL) { 2963 if ((flags & MSG_PEEK) == 0) { 2964 KASSERT(m->m_nextpkt == nextrecord, 2965 ("soreceive: post-control, 
nextrecord !sync")); 2966 if (nextrecord == NULL) { 2967 KASSERT(so->so_rcv.sb_mb == m, 2968 ("soreceive: post-control, sb_mb!=m")); 2969 KASSERT(so->so_rcv.sb_lastrecord == m, 2970 ("soreceive: post-control, lastrecord!=m")); 2971 } 2972 } 2973 type = m->m_type; 2974 if (type == MT_OOBDATA) 2975 flags |= MSG_OOB; 2976 } else { 2977 if ((flags & MSG_PEEK) == 0) { 2978 KASSERT(so->so_rcv.sb_mb == nextrecord, 2979 ("soreceive: sb_mb != nextrecord")); 2980 if (so->so_rcv.sb_mb == NULL) { 2981 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2982 ("soreceive: sb_lastercord != NULL")); 2983 } 2984 } 2985 } 2986 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2987 SBLASTRECORDCHK(&so->so_rcv); 2988 SBLASTMBUFCHK(&so->so_rcv); 2989 2990 /* 2991 * Now continue to read any data mbufs off of the head of the socket 2992 * buffer until the read request is satisfied. Note that 'type' is 2993 * used to store the type of any mbuf reads that have happened so far 2994 * such that soreceive() can stop reading if the type changes, which 2995 * causes soreceive() to return only one of regular data and inline 2996 * out-of-band data in a single socket receive operation. 2997 */ 2998 moff = 0; 2999 offset = 0; 3000 while (m != NULL && !(m->m_flags & M_NOTREADY) && uio->uio_resid > 0 && 3001 error == 0) { 3002 /* 3003 * If the type of mbuf has changed since the last mbuf 3004 * examined ('type'), end the receive operation. 3005 */ 3006 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3007 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 3008 if (type != m->m_type) 3009 break; 3010 } else if (type == MT_OOBDATA) 3011 break; 3012 else 3013 KASSERT(m->m_type == MT_DATA, 3014 ("m->m_type == %d", m->m_type)); 3015 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 3016 len = uio->uio_resid; 3017 if (so->so_oobmark && len > so->so_oobmark - offset) 3018 len = so->so_oobmark - offset; 3019 if (len > m->m_len - moff) 3020 len = m->m_len - moff; 3021 /* 3022 * If mp is set, just pass back the mbufs. 
Otherwise copy 3023 * them out via the uio, then free. Sockbuf must be 3024 * consistent here (points to current mbuf, it points to next 3025 * record) when we drop priority; we must note any additions 3026 * to the sockbuf when we block interrupts again. 3027 */ 3028 if (mp == NULL) { 3029 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3030 SBLASTRECORDCHK(&so->so_rcv); 3031 SBLASTMBUFCHK(&so->so_rcv); 3032 SOCKBUF_UNLOCK(&so->so_rcv); 3033 if ((m->m_flags & M_EXTPG) != 0) 3034 error = m_unmapped_uiomove(m, moff, uio, 3035 (int)len); 3036 else 3037 error = uiomove(mtod(m, char *) + moff, 3038 (int)len, uio); 3039 SOCKBUF_LOCK(&so->so_rcv); 3040 if (error) { 3041 /* 3042 * The MT_SONAME mbuf has already been removed 3043 * from the record, so it is necessary to 3044 * remove the data mbufs, if any, to preserve 3045 * the invariant in the case of PR_ADDR that 3046 * requires MT_SONAME mbufs at the head of 3047 * each record. 3048 */ 3049 if (pr->pr_flags & PR_ATOMIC && 3050 ((flags & MSG_PEEK) == 0)) 3051 (void)sbdroprecord_locked(&so->so_rcv); 3052 SOCKBUF_UNLOCK(&so->so_rcv); 3053 goto release; 3054 } 3055 } else 3056 uio->uio_resid -= len; 3057 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3058 if (len == m->m_len - moff) { 3059 if (m->m_flags & M_EOR) 3060 flags |= MSG_EOR; 3061 if (flags & MSG_PEEK) { 3062 m = m->m_next; 3063 moff = 0; 3064 } else { 3065 nextrecord = m->m_nextpkt; 3066 sbfree(&so->so_rcv, m); 3067 if (mp != NULL) { 3068 m->m_nextpkt = NULL; 3069 *mp = m; 3070 mp = &m->m_next; 3071 so->so_rcv.sb_mb = m = m->m_next; 3072 *mp = NULL; 3073 } else { 3074 so->so_rcv.sb_mb = m_free(m); 3075 m = so->so_rcv.sb_mb; 3076 } 3077 sockbuf_pushsync(&so->so_rcv, nextrecord); 3078 SBLASTRECORDCHK(&so->so_rcv); 3079 SBLASTMBUFCHK(&so->so_rcv); 3080 } 3081 } else { 3082 if (flags & MSG_PEEK) 3083 moff += len; 3084 else { 3085 if (mp != NULL) { 3086 if (flags & MSG_DONTWAIT) { 3087 *mp = m_copym(m, 0, len, 3088 M_NOWAIT); 3089 if (*mp == NULL) { 3090 /* 3091 * m_copym() couldn't 3092 * 
allocate an mbuf. 3093 * Adjust uio_resid back 3094 * (it was adjusted 3095 * down by len bytes, 3096 * which we didn't end 3097 * up "copying" over). 3098 */ 3099 uio->uio_resid += len; 3100 break; 3101 } 3102 } else { 3103 SOCKBUF_UNLOCK(&so->so_rcv); 3104 *mp = m_copym(m, 0, len, 3105 M_WAITOK); 3106 SOCKBUF_LOCK(&so->so_rcv); 3107 } 3108 } 3109 sbcut_locked(&so->so_rcv, len); 3110 } 3111 } 3112 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3113 if (so->so_oobmark) { 3114 if ((flags & MSG_PEEK) == 0) { 3115 so->so_oobmark -= len; 3116 if (so->so_oobmark == 0) { 3117 so->so_rcv.sb_state |= SBS_RCVATMARK; 3118 break; 3119 } 3120 } else { 3121 offset += len; 3122 if (offset == so->so_oobmark) 3123 break; 3124 } 3125 } 3126 if (flags & MSG_EOR) 3127 break; 3128 /* 3129 * If the MSG_WAITALL flag is set (for non-atomic socket), we 3130 * must not quit until "uio->uio_resid == 0" or an error 3131 * termination. If a signal/timeout occurs, return with a 3132 * short count but without error. Keep sockbuf locked 3133 * against other readers. 3134 */ 3135 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 3136 !sosendallatonce(so) && nextrecord == NULL) { 3137 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3138 if (so->so_error || so->so_rerror || 3139 so->so_rcv.sb_state & SBS_CANTRCVMORE) 3140 break; 3141 /* 3142 * Notify the protocol that some data has been 3143 * drained before blocking. 3144 */ 3145 if (pr->pr_flags & PR_WANTRCVD) { 3146 SOCKBUF_UNLOCK(&so->so_rcv); 3147 VNET_SO_ASSERT(so); 3148 pr->pr_rcvd(so, flags); 3149 SOCKBUF_LOCK(&so->so_rcv); 3150 if (__predict_false(so->so_rcv.sb_mb == NULL && 3151 (so->so_error || so->so_rerror || 3152 so->so_rcv.sb_state & SBS_CANTRCVMORE))) 3153 break; 3154 } 3155 SBLASTRECORDCHK(&so->so_rcv); 3156 SBLASTMBUFCHK(&so->so_rcv); 3157 /* 3158 * We could receive some data while was notifying 3159 * the protocol. Skip blocking in this case. 
3160 */ 3161 if (so->so_rcv.sb_mb == NULL) { 3162 error = sbwait(so, SO_RCV); 3163 if (error) { 3164 SOCKBUF_UNLOCK(&so->so_rcv); 3165 goto release; 3166 } 3167 } 3168 m = so->so_rcv.sb_mb; 3169 if (m != NULL) 3170 nextrecord = m->m_nextpkt; 3171 } 3172 } 3173 3174 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3175 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 3176 if (report_real_len) 3177 uio->uio_resid -= m_length(m, NULL) - moff; 3178 flags |= MSG_TRUNC; 3179 if ((flags & MSG_PEEK) == 0) 3180 (void) sbdroprecord_locked(&so->so_rcv); 3181 } 3182 if ((flags & MSG_PEEK) == 0) { 3183 if (m == NULL) { 3184 /* 3185 * First part is an inline SB_EMPTY_FIXUP(). Second 3186 * part makes sure sb_lastrecord is up-to-date if 3187 * there is still data in the socket buffer. 3188 */ 3189 so->so_rcv.sb_mb = nextrecord; 3190 if (so->so_rcv.sb_mb == NULL) { 3191 so->so_rcv.sb_mbtail = NULL; 3192 so->so_rcv.sb_lastrecord = NULL; 3193 } else if (nextrecord->m_nextpkt == NULL) 3194 so->so_rcv.sb_lastrecord = nextrecord; 3195 } 3196 SBLASTRECORDCHK(&so->so_rcv); 3197 SBLASTMBUFCHK(&so->so_rcv); 3198 /* 3199 * If soreceive() is being done from the socket callback, 3200 * then don't need to generate ACK to peer to update window, 3201 * since ACK will be generated on return to TCP. 
/*
 * Generic receive entry point.
 *
 * Clears the caller's output pointers (psa, controlp, mp), hands MSG_OOB
 * requests to soreceive_rcvoob(), and otherwise runs
 * soreceive_generic_locked() while holding the receive I/O lock.
 *
 * Returns 0 or an errno value; a failure to take the receive I/O lock is
 * returned as-is.
 */
int
soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp, struct mbuf **controlp, int *flagsp)
{
	int error, flags;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL) {
		flags = *flagsp;
		/* Out-of-band reads bypass the I/O lock taken below. */
		if ((flags & MSG_OOB) != 0)
			return (soreceive_rcvoob(so, uio, flags));
	} else {
		flags = 0;
	}
	if (mp != NULL)
		*mp = NULL;

	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
	error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 *
 * The caller must hold the receive I/O lock (asserted below).  The socket
 * buffer lock is acquired here and released before returning; it is also
 * dropped around the copy to the uio and around the pr_rcvd() call.
 *
 * Data is delivered either into the uio or, when mp0 != NULL, as an mbuf
 * chain stored in *mp0.  psa and controlp are not referenced by this
 * function.
 *
 * Returns 0 on success, EINVAL if the uio has no room, ENOTCONN if the
 * socket neither is nor was connected, EAGAIN if no data is available and
 * the request must not block, ENOBUFS if no mbuf could be produced for
 * *mp0, or the error reported by so_error, sbwait() or m_mbuftouio().
 */
static int
soreceive_stream_locked(struct socket *so, struct sockbuf *sb,
    struct sockaddr **psa, struct uio *uio, struct mbuf **mp0,
    struct mbuf **controlp, int flags)
{
	int len = 0, error = 0, oresid;
	struct mbuf *m, *n = NULL;

	SOCK_IO_RECV_ASSERT_LOCKED(so);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0)
		return (EINVAL);
	/* Remember the initial request size to detect partial delivery. */
	oresid = uio->uio_resid;

	SOCKBUF_LOCK(sb);
	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		/* Buffered data is delivered before the error is reported. */
		if (sbavail(sb) > 0)
			goto deliver;
		/*
		 * Something was already delivered on an earlier pass: return
		 * success now and leave so_error set.
		 */
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		/* A peek does not consume the pending error. */
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb) > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(so, SO_RCV);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			if (*mp0 == NULL)
				*mp0 = sb->sb_mb;
			else
				m_cat(*mp0, sb->sb_mb);
			/*
			 * Account for every leading mbuf that fits entirely
			 * within len; 'n' tracks the last one handed over.
			 */
			for (m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				KASSERT(!(m->m_flags & M_NOTREADY),
				    ("%s: m %p not available", __func__, m));
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			/* Cut the handed-over chain off from the remainder. */
			n->m_next = NULL;
			sb->sb_mb = m;
			sb->sb_lastrecord = sb->sb_mb;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= len;
			if (*mp0 != NULL)
				m_cat(*mp0, m);
			else
				*mp0 = m;
			/* Nothing dequeued and the copy failed. */
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			so->so_proto->pr_rcvd(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	return (error);
}

/*
 * Stream-socket receive entry point.
 *
 * Rejects non-SOCK_STREAM sockets with EINVAL, clears the caller's output
 * pointers, masks MSG_EOR out of the incoming flags, and hands MSG_OOB
 * requests to soreceive_rcvoob().  Otherwise it takes the receive I/O lock
 * and calls soreceive_stream_locked().  With KERN_TLS, sockets that have
 * sb_tls_info set are redirected to soreceive_generic().
 */
int
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct sockbuf *sb;
	int error, flags;

	sb = &so->so_rcv;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp & ~MSG_EOR;
	else
		flags = 0;
	if (controlp != NULL)
		*controlp = NULL;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

#ifdef KERN_TLS
	/*
	 * KTLS stores TLS records as records with a control message to
	 * describe the framing.
	 *
	 * We check once here before acquiring locks to optimize the
	 * common case.
	 */
	if (sb->sb_tls_info != NULL)
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
#endif

	/*
	 * Prevent other threads from reading from the socket.  This lock may be
	 * dropped in order to sleep waiting for data to arrive.
	 */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
#ifdef KERN_TLS
	/* Re-check now that the I/O lock is held. */
	if (__predict_false(sb->sb_tls_info != NULL)) {
		SOCK_IO_RECV_UNLOCK(so);
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
	}
#endif
	error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}
/*
 * Optimized version of soreceive() for simple datagram cases from userspace.
 * Unlike in the stream case, we're able to drop a datagram if copyout()
 * fails, and because we handle datagrams atomically, we don't need to use a
 * sleep lock to prevent I/O interlacing.
 *
 * Requests with mp0 != NULL or with MSG_PEEK, MSG_OOB or MSG_TRUNC set are
 * forwarded to soreceive_generic().  On return *psa (if requested) holds a
 * copy of the sender address, control mbufs are chained onto *controlp (if
 * requested), and MSG_TRUNC is or'ed into *flagsp when the datagram did not
 * fit in the uio.
 *
 * Returns 0 on success (including end-of-file and a zero-length request),
 * EWOULDBLOCK when no datagram is queued and the request must not block,
 * or the error from so_error, sbwait() or uiomove().
 */
int
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *m2;
	int flags, error;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * For any complicated cases, fall back to the full
	 * soreceive_generic().
	 */
	if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC)))
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));

	/*
	 * Enforce restrictions on use.
	 */
	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
	    ("soreceive_dgram: wantrcvd"));
	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
	    ("soreceive_dgram: SBS_RCVATMARK"));
	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: P_CONNREQUIRED"));

	/*
	 * Loop blocking while waiting for a datagram.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	while ((m = so->so_rcv.sb_mb) == NULL) {
		KASSERT(sbavail(&so->so_rcv) == 0,
		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
		    sbavail(&so->so_rcv)));
		/* A pending socket error is reported once and cleared. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
		/* End of input, or nothing was asked for: succeed with no data. */
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (0);
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (EWOULDBLOCK);
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(so, SO_RCV);
		if (error) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (nextrecord == NULL) {
		KASSERT(so->so_rcv.sb_lastrecord == m,
		    ("soreceive_dgram: lastrecord != m"));
	}

	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
	    ("soreceive_dgram: m_nextpkt != nextrecord"));

	/*
	 * Pull 'm' and its chain off the front of the packet queue.
	 */
	so->so_rcv.sb_mb = NULL;
	sockbuf_pushsync(&so->so_rcv, nextrecord);

	/*
	 * Walk 'm's chain and free that many bytes from the socket buffer.
	 */
	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		sbfree(&so->so_rcv, m2);

	/*
	 * Do a few last checks before we let go of the lock.
	 */
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * For PR_ADDR protocols the record starts with an MT_SONAME mbuf
	 * holding the sender address; copy it out if wanted and drop it.
	 */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_WAITOK);
		m = m_free(m);
	}
	KASSERT(m, ("%s: no data or control after soname", __func__));

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the record.  The control mbufs are first unlinked into their own
	 * list; each one is then appended to the caller's chain through
	 * controlp, or freed if controlp == NULL.  In some cases there can be
	 * only MT_CONTROL mbufs without MT_DATA mbufs.
	 */
	if (m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		/* Split the leading control mbufs off into the 'cm' list. */
		do {
			m2 = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = m2;
		} while (m != NULL && m->m_type == MT_CONTROL);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			/* Advance controlp to the tail of the caller's chain. */
			if (controlp != NULL) {
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
	}
	KASSERT(m == NULL || m->m_type == MT_DATA,
	    ("soreceive_dgram: !data"));
	/* Copy data mbufs to the uio, freeing each one once consumed. */
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			/* The datagram is dropped on a failed copy. */
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			m->m_data += len;
			m->m_len -= len;
		}
	}
	/* Data left over did not fit: flag truncation and discard it. */
	if (m != NULL) {
		flags |= MSG_TRUNC;
		m_freem(m);
	}
	if (flagsp != NULL)
		*flagsp |= flags;
	return (0);
}
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 3669 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3670 { 3671 int error; 3672 3673 CURVNET_SET(so->so_vnet); 3674 error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); 3675 CURVNET_RESTORE(); 3676 return (error); 3677 } 3678 3679 int 3680 soshutdown(struct socket *so, enum shutdown_how how) 3681 { 3682 int error; 3683 3684 CURVNET_SET(so->so_vnet); 3685 error = so->so_proto->pr_shutdown(so, how); 3686 CURVNET_RESTORE(); 3687 3688 return (error); 3689 } 3690 3691 /* 3692 * Used by several pr_shutdown implementations that use generic socket buffers. 3693 */ 3694 void 3695 sorflush(struct socket *so) 3696 { 3697 int error; 3698 3699 VNET_SO_ASSERT(so); 3700 3701 /* 3702 * Dislodge threads currently blocked in receive and wait to acquire 3703 * a lock against other simultaneous readers before clearing the 3704 * socket buffer. Don't let our acquire be interrupted by a signal 3705 * despite any existing socket disposition on interruptable waiting. 3706 * 3707 * The SOCK_IO_RECV_LOCK() is important here as there some pr_soreceive 3708 * methods that read the top of the socket buffer without acquisition 3709 * of the socket buffer mutex, assuming that top of the buffer 3710 * exclusively belongs to the read(2) syscall. This is handy when 3711 * performing MSG_PEEK. 3712 */ 3713 socantrcvmore(so); 3714 3715 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); 3716 if (error != 0) { 3717 KASSERT(SOLISTENING(so), 3718 ("%s: soiolock(%p) failed", __func__, so)); 3719 return; 3720 } 3721 3722 sbrelease(so, SO_RCV); 3723 SOCK_IO_RECV_UNLOCK(so); 3724 3725 } 3726 3727 int 3728 sosetfib(struct socket *so, int fibnum) 3729 { 3730 if (fibnum < 0 || fibnum >= rt_numfibs) 3731 return (EINVAL); 3732 3733 SOCK_LOCK(so); 3734 so->so_fibnum = fibnum; 3735 SOCK_UNLOCK(so); 3736 3737 return (0); 3738 } 3739 3740 #ifdef SOCKET_HHOOK 3741 /* 3742 * Wrapper for Socket established helper hook. 
3743 * Parameters: socket, context of the hook point, hook id. 3744 */ 3745 static inline int 3746 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 3747 { 3748 struct socket_hhook_data hhook_data = { 3749 .so = so, 3750 .hctx = hctx, 3751 .m = NULL, 3752 .status = 0 3753 }; 3754 3755 CURVNET_SET(so->so_vnet); 3756 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 3757 CURVNET_RESTORE(); 3758 3759 /* Ugly but needed, since hhooks return void for now */ 3760 return (hhook_data.status); 3761 } 3762 #endif 3763 3764 /* 3765 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 3766 * additional variant to handle the case where the option value needs to be 3767 * some kind of integer, but not a specific size. In addition to their use 3768 * here, these functions are also called by the protocol-level pr_ctloutput() 3769 * routines. 3770 */ 3771 int 3772 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 3773 { 3774 size_t valsize; 3775 3776 /* 3777 * If the user gives us more than we wanted, we ignore it, but if we 3778 * don't get the minimum length the caller wants, we return EINVAL. 3779 * On success, sopt->sopt_valsize is set to however much we actually 3780 * retrieved. 3781 */ 3782 if ((valsize = sopt->sopt_valsize) < minlen) 3783 return EINVAL; 3784 if (valsize > len) 3785 sopt->sopt_valsize = valsize = len; 3786 3787 if (sopt->sopt_td != NULL) 3788 return (copyin(sopt->sopt_val, buf, valsize)); 3789 3790 bcopy(sopt->sopt_val, buf, valsize); 3791 return (0); 3792 } 3793 3794 /* 3795 * Kernel version of setsockopt(2). 
/*
 * Kernel version of setsockopt(2).
 *
 * Builds a SOPT_SET request with sopt_td == NULL, so the option value is
 * read from 'optval' with bcopy() rather than copyin(), and passes it to
 * sosetopt().
 *
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt	sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;
	return (sosetopt(so, &sopt));
}

/*
 * Set a socket option.
 *
 * Options at a level other than SOL_SOCKET go straight to the protocol's
 * pr_ctloutput method.  SOL_SOCKET options are handled in the switch below;
 * when one succeeds, pr_ctloutput is additionally called with its result
 * ignored.  Failures jump to 'bad', which skips that call.
 *
 * Returns 0 or an errno value.
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	sbintime_t val, *valp;
	uint32_t val32;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		error = so->so_proto->pr_ctloutput(so, sopt);
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_setopt(so, sopt);
			if (error)
				goto bad;
			break;

		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;
			/* Reject linger times that are negative or too large. */
			if (l.l_linger < 0 ||
			    l.l_linger > USHRT_MAX ||
			    l.l_linger > (INT_MAX / hz)) {
				error = EDOM;
				goto bad;
			}
			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		/* Boolean options: the option name doubles as the flag bit. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_RERROR:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SETFIB:
			/*
			 * NOTE(review): on success the common tail below calls
			 * pr_ctloutput a second time for this option --
			 * confirm that is intended.
			 */
			error = so->so_proto->pr_ctloutput(so, sopt);
			break;

		case SO_USER_COOKIE:
			error = sooptcopyin(sopt, &val32, sizeof val32,
			    sizeof val32);
			if (error)
				goto bad;
			so->so_user_cookie = val32;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = so->so_proto->pr_setsbopt(so, sopt);
			if (error)
				goto bad;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				error = sooptcopyin(sopt, &tv32, sizeof tv32,
				    sizeof tv32);
				/*
				 * NOTE(review): tv32 is copied even when the
				 * copyin failed; the result is unused because
				 * 'error' is checked right after.
				 */
				CP(tv32, tv, tv_sec);
				CP(tv32, tv, tv_usec);
			} else
#endif
				error = sooptcopyin(sopt, &tv, sizeof tv,
				    sizeof tv);
			if (error)
				goto bad;
			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
			    tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* Very large timeouts saturate to SBT_MAX. */
			if (tv.tv_sec > INT32_MAX)
				val = SBT_MAX;
			else
				val = tvtosbt(tv);
			SOCK_LOCK(so);
			/*
			 * Listening sockets keep their timeouts in the sol_*
			 * fields instead of in the socket buffers.
			 */
			valp = sopt->sopt_name == SO_SNDTIMEO ?
			    (SOLISTENING(so) ? &so->sol_sbsnd_timeo :
			    &so->so_snd.sb_timeo) :
			    (SOLISTENING(so) ? &so->sol_sbrcv_timeo :
			    &so->so_rcv.sb_timeo);
			*valp = val;
			SOCK_UNLOCK(so);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			/*
			 * NOTE(review): dereferences sopt_td, which
			 * so_setsockopt() leaves NULL -- presumably only
			 * reached from userspace requests; confirm.
			 */
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_TS_CLOCK:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
				error = EINVAL;
				goto bad;
			}
			so->so_ts_clock = optval;
			break;

		case SO_MAX_PACING_RATE:
			error = sooptcopyin(sopt, &val32, sizeof(val32),
			    sizeof(val32));
			if (error)
				goto bad;
			so->so_max_pacing_rate = val32;
			break;

		case SO_SPLICE: {
			struct splice splice;

#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct splice32 splice32;

				error = sooptcopyin(sopt, &splice32,
				    sizeof(splice32), sizeof(splice32));
				if (error == 0) {
					splice.sp_fd = splice32.sp_fd;
					splice.sp_max = splice32.sp_max;
					CP(splice32.sp_idle, splice.sp_idle,
					    tv_sec);
					CP(splice32.sp_idle, splice.sp_idle,
					    tv_usec);
				}
			} else
#endif
			{
				error = sooptcopyin(sopt, &splice,
				    sizeof(splice), sizeof(splice));
			}
			if (error)
				goto bad;
#ifdef KTRACE
			if (KTRPOINT(curthread, KTR_STRUCT))
				ktrsplice(&splice);
#endif

			error = splice_init();
			if (error != 0)
				goto bad;

			/*
			 * A non-negative descriptor names the socket to splice
			 * to; a negative one removes an existing splice.
			 */
			if (splice.sp_fd >= 0) {
				struct file *fp;
				struct socket *so2;

				if (!cap_rights_contains(sopt->sopt_rights,
				    &cap_recv_rights)) {
					error = ENOTCAPABLE;
					goto bad;
				}
				error = getsock(sopt->sopt_td, splice.sp_fd,
				    &cap_send_rights, &fp);
				if (error != 0)
					goto bad;
				so2 = fp->f_data;

				error = so_splice(so, so2, &splice);
				fdrop(fp, sopt->sopt_td);
			} else {
				error = so_unsplice(so, false);
			}
			break;
		}
		default:
#ifdef SOCKET_HHOOK
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
#endif
			error = ENOPROTOOPT;
			break;
		}
		/* Let the protocol see the option too; its result is ignored. */
		if (error == 0)
			(void)so->so_proto->pr_ctloutput(so, sopt);
	}
bad:
	CURVNET_RESTORE();
	return (error);
}

/*
 * Helper routine for getsockopt.
 *
 * Copies up to 'len' bytes from 'buf' to the caller's option buffer,
 * truncating to sopt->sopt_valsize, and stores the number of bytes copied
 * back into sopt->sopt_valsize.  Nothing is copied when sopt_val is NULL.
 * Returns 0 or the error from copyout().
 */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value, possibly
	 * truncated to fit in the user's buffer.  Traditional behavior is
	 * that we always tell the user precisely how much we copied, rather
	 * than something useful like the total amount we had available for
	 * her.  Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		/* copyout() when sopt_td is set, a plain copy otherwise. */
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return (error);
}

/*
 * Get a socket option.
 *
 * Options at a level other than SOL_SOCKET go straight to the protocol's
 * pr_ctloutput method.  Most SOL_SOCKET options load an int into 'optval'
 * and jump to the shared 'integer' label, which copies it out.
 *
 * Returns 0 or an errno value.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		error = so->so_proto->pr_ctloutput(so, sopt);
		CURVNET_RESTORE();
		return (error);
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_getopt(so, sopt);
			break;

		case SO_LINGER:
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/* Boolean options: report the raw flag bit from so_options. */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_RERROR:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Shared exit for every option returned as an int. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_FIB:
			SOCK_LOCK(so);
			optval = so->so_fibnum;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_DOMAIN:
			optval = so->so_proto->pr_domain->dom_family;
			goto integer;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_PROTOCOL:
			optval = so->so_proto->pr_protocol;
			goto integer;

		case SO_ERROR:
			/*
			 * Reading the error clears it.  so_error takes
			 * precedence; so_rerror is returned (and cleared) only
			 * when so_error is zero.
			 */
			SOCK_LOCK(so);
			if (so->so_error) {
				optval = so->so_error;
				so->so_error = 0;
			} else {
				optval = so->so_rerror;
				so->so_rerror = 0;
			}
			SOCK_UNLOCK(so);
			goto integer;

		/*
		 * Buffer sizes, low-water marks and timeouts live in the
		 * sol_* fields for listening sockets and in the socket
		 * buffers otherwise.
		 */
		case SO_SNDBUF:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
			    so->so_snd.sb_hiwat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_RCVBUF:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
			    so->so_rcv.sb_hiwat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDLOWAT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
			    so->so_snd.sb_lowat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_RCVLOWAT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
			    so->so_rcv.sb_lowat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			SOCK_LOCK(so);
			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
			    (SOLISTENING(so) ? so->sol_sbsnd_timeo :
			    so->so_snd.sb_timeo) :
			    (SOLISTENING(so) ? so->sol_sbrcv_timeo :
			    so->so_rcv.sb_timeo));
			SOCK_UNLOCK(so);
#ifdef COMPAT_FREEBSD32
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32, sizeof tv32);
			} else
#endif
				error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			/*
			 * NOTE(review): dereferences sopt_td without a NULL
			 * check -- presumably only reached from userspace
			 * requests; confirm.
			 */
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				goto bad;
			/* Don't copy out extmac, it is unchanged. */
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				goto bad;
			/* Don't copy out extmac, it is unchanged. */
#else
			error = EOPNOTSUPP;
#endif
			break;

		/* Listen queue statistics read as 0 on non-listening sockets. */
		case SO_LISTENQLIMIT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_LISTENQLEN:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_qlen : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_LISTENINCQLEN:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_TS_CLOCK:
			optval = so->so_ts_clock;
			goto integer;

		case SO_MAX_PACING_RATE:
			optval = so->so_max_pacing_rate;
			goto integer;

		case SO_SPLICE: {
			off_t n;

			/*
			 * Acquire the I/O lock to serialize with
			 * so_splice_xfer().  This is not required for
			 * correctness, but makes testing simpler: once a byte
			 * has been transmitted to the sink and observed (e.g.,
			 * by reading from the socket to which the sink is
			 * connected), a subsequent getsockopt(SO_SPLICE) will
			 * return an up-to-date value.
			 */
			error = SOCK_IO_RECV_LOCK(so, SBL_WAIT);
			if (error != 0)
				goto bad;
			SOCK_LOCK(so);
			if (SOLISTENING(so)) {
				n = 0;
			} else {
				n = so->so_splice_sent;
			}
			SOCK_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			error = sooptcopyout(sopt, &n, sizeof(n));
			break;
		}

		default:
#ifdef SOCKET_HHOOK
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
#endif
			error = ENOPROTOOPT;
			break;
		}
	}
bad:
	CURVNET_RESTORE();
	return (error);
}

/*
 * Allocate an mbuf chain whose m_len fields add up to sopt->sopt_valsize
 * and return it in *mp.  Each mbuf gets a cluster when more than MLEN
 * bytes remain.  Allocations use M_WAITOK when sopt_td is set and M_NOWAIT
 * otherwise.
 *
 * Returns 0 on success or ENOBUFS after freeing anything allocated.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	/* First mbuf: becomes the head of the chain returned in *mp. */
	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	/* Keep appending mbufs until the whole option size is covered. */
	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
			    M_NOWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				/* 'm' is not linked into the chain yet. */
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}

/*
 * Fill the mbuf chain 'm' from the option value in 'sopt', one mbuf's
 * m_len at a time, advancing sopt_val and reducing sopt_valsize as it
 * goes.  Does nothing when sopt_val is NULL.
 *
 * Returns 0 on success.  On a copyin() failure the whole chain is freed
 * and the error returned.  Panics if the option value runs out before the
 * chain does.
 */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	/*
	 * An mbuf is left over, so fewer option bytes remained than its
	 * m_len.  The chain is expected to have been sized to the option
	 * value; the panic message keeps its historical ip6_ name.
	 */
	if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
		panic("ip6_sooptmcopyin");
	return (0);
}

/*
 * Copy the contents of the mbuf chain 'm' into the option buffer in
 * 'sopt', advancing sopt_val as it goes, and set sopt_valsize to the
 * number of bytes copied.  Does nothing when sopt_val is NULL.
 *
 * Returns 0 on success.  The chain is freed and an error returned when
 * copyout() fails, or (EINVAL) when the option buffer is too small for
 * the chain.
 */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* The caller must supply a buffer large enough for the chain. */
		m_freem(m0);
		return(EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}
/*
 * sohasoutofband(): protocol notifies socket layer of the arrival of new
 * out-of-band data, which will then notify socket consumers.
 *
 * Sends SIGURG through so_sigio when it is set and wakes up threads
 * selecting on the socket's read side.
 */
void
sohasoutofband(struct socket *so)
{

	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rdsel, PSOCK);
}

/*
 * Generic poll handler for sockets.
 *
 * Returns the subset of 'events' that is currently true for the socket
 * (POLLHUP may be added even if not requested).  When nothing is ready,
 * the thread is recorded on the relevant selinfo so that it is woken up
 * later.  Listening sockets only ever report readability.
 */
int
sopoll_generic(struct socket *so, int events, struct thread *td)
{
	int revents;

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		if (!(events & (POLLIN | POLLRDNORM)))
			revents = 0;
		/* A completed connection is waiting to be accepted. */
		else if (!TAILQ_EMPTY(&so->sol_comp))
			revents = events & (POLLIN | POLLRDNORM);
		else if ((events & POLLINIGNEOF) == 0 && so->so_error)
			revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
		else {
			selrecord(td, &so->so_rdsel);
			revents = 0;
		}
	} else {
		revents = 0;
		/* Both buffer locks are held while the state is sampled. */
		SOCK_SENDBUF_LOCK(so);
		SOCK_RECVBUF_LOCK(so);
		if (events & (POLLIN | POLLRDNORM))
			if (soreadabledata(so) && !isspliced(so))
				revents |= events & (POLLIN | POLLRDNORM);
		if (events & (POLLOUT | POLLWRNORM))
			if (sowriteable(so) && !issplicedback(so))
				revents |= events & (POLLOUT | POLLWRNORM);
		if (events & (POLLPRI | POLLRDBAND))
			if (so->so_oobmark ||
			    (so->so_rcv.sb_state & SBS_RCVATMARK))
				revents |= events & (POLLPRI | POLLRDBAND);
		/*
		 * Unless the caller asked to ignore EOF, a closed receive
		 * side reads as readable, and as a hangup once the send
		 * side is closed too.
		 */
		if ((events & POLLINIGNEOF) == 0) {
			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
				revents |= events & (POLLIN | POLLRDNORM);
				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
					revents |= POLLHUP;
			}
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			revents |= events & POLLRDHUP;
		/* Nothing ready: register for a wakeup on each side of interest. */
		if (revents == 0) {
			if (events &
			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) {
				selrecord(td, &so->so_rdsel);
				so->so_rcv.sb_flags |= SB_SEL;
			}
			if (events & (POLLOUT | POLLWRNORM)) {
				selrecord(td, &so->so_wrsel);
				so->so_snd.sb_flags |= SB_SEL;
			}
		}
		SOCK_RECVBUF_UNLOCK(so);
		SOCK_SENDBUF_UNLOCK(so);
	}
	SOCK_UNLOCK(so);
	return (revents);
}

/*
 * Generic kqueue attach handler for sockets.
 *
 * Selects the filter operations, knote list and socket buffer matching
 * kn->kn_filter (EVFILT_READ, EVFILT_WRITE or EVFILT_EMPTY) and adds the
 * knote to that list.  Returns EINVAL for any other filter, 0 otherwise.
 */
int
sokqfilter_generic(struct socket *so, struct knote *kn)
{
	struct sockbuf *sb;
	sb_which which;
	struct knlist *knl;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		knl = &so->so_rdsel.si_note;
		sb = &so->so_rcv;
		which = SO_RCV;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		knl = &so->so_wrsel.si_note;
		sb = &so->so_snd;
		which = SO_SND;
		break;
	case EVFILT_EMPTY:
		kn->kn_fop = &soempty_filtops;
		knl = &so->so_wrsel.si_note;
		sb = &so->so_snd;
		which = SO_SND;
		break;
	default:
		return (EINVAL);
	}

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		/* Listening sockets: no socket buffer state is touched. */
		knlist_add(knl, kn, 1);
	} else {
		SOCK_BUF_LOCK(so, which);
		knlist_add(knl, kn, 1);
		sb->sb_flags |= SB_KNOTE;
		/* An explicit NOTE_LOWAT turns off automatic low-water sizing. */
		if ((kn->kn_sfflags & NOTE_LOWAT) &&
		    (sb->sb_flags & SB_AUTOLOWAT))
			sb->sb_flags &= ~SB_AUTOLOWAT;
		SOCK_BUF_UNLOCK(so, which);
	}
	SOCK_UNLOCK(so);
	return (0);
}

/*
 * Detach a read knote from the socket.  SB_KNOTE is cleared on the
 * receive buffer once the list is empty (non-listening sockets only).
 */
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	so_rdknl_lock(so);
	knlist_remove(&so->so_rdsel.si_note, kn, 1);
	if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	so_rdknl_unlock(so);
}

/*
 * EVFILT_READ event test.  Returns non-zero when the knote should fire
 * and updates kn_data (plus kn_flags/kn_fflags on EOF or listen error).
 * 'hint' is unused.
 */
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;

	if (SOLISTENING(so)) {
		SOCK_LOCK_ASSERT(so);
		/* For listening sockets kn_data is the accept queue length. */
		kn->kn_data = so->sol_qlen;
		if (so->so_error) {
			kn->kn_flags |= EV_EOF;
			kn->kn_fflags = so->so_error;
			return (1);
		}
		return (!TAILQ_EMPTY(&so->sol_comp));
	}

	/* A spliced receive buffer never reports readable. */
	if ((so->so_rcv.sb_flags & SB_SPLICED) != 0)
		return (0);

	SOCK_RECVBUF_LOCK_ASSERT(so);

	/* Report available bytes with sb_ctl subtracted. */
	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error || so->so_rerror)
		return (1);

	/* A caller-supplied low-water mark overrides the buffer's own. */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_data >= kn->kn_sdata)
			return (1);
	} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
		return (1);

#ifdef SOCKET_HHOOK
	/* This hook returning non-zero indicates an event, not error */
	return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
#else
	return (0);
#endif
}

/*
 * Detach a write knote from the socket.  SB_KNOTE is cleared on the send
 * buffer once the list is empty (non-listening sockets only).
 */
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	so_wrknl_lock(so);
	knlist_remove(&so->so_wrsel.si_note, kn, 1);
	if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	so_wrknl_unlock(so);
}

/*
 * EVFILT_WRITE event test.  Returns non-zero when the knote should fire;
 * kn_data is set to the space available in the send buffer.  Listening
 * sockets never fire.  'hint' is unused.
 */
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = kn->kn_fp->f_data;

	if (SOLISTENING(so))
		return (0);

	SOCK_SENDBUF_LOCK_ASSERT(so);
	kn->kn_data = sbspace(&so->so_snd);

#ifdef SOCKET_HHOOK
	/* The hook's return value is not used here. */
	hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
#endif

	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	} else if (so->so_error)	/* temporary udp error */
		return (1);
	/* Not connected yet on a connection-oriented protocol: not writable. */
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	else if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	else
		return (kn->kn_data >= so->so_snd.sb_lowat);
}
SOCK_SENDBUF_LOCK_ASSERT(so); 4649 kn->kn_data = sbused(&so->so_snd); 4650 4651 if (kn->kn_data == 0) 4652 return (1); 4653 else 4654 return (0); 4655 } 4656 4657 int 4658 socheckuid(struct socket *so, uid_t uid) 4659 { 4660 4661 if (so == NULL) 4662 return (EPERM); 4663 if (so->so_cred->cr_uid != uid) 4664 return (EPERM); 4665 return (0); 4666 } 4667 4668 /* 4669 * These functions are used by protocols to notify the socket layer (and its 4670 * consumers) of state changes in the sockets driven by protocol-side events. 4671 */ 4672 4673 /* 4674 * Procedures to manipulate state flags of socket and do appropriate wakeups. 4675 * 4676 * Normal sequence from the active (originating) side is that 4677 * soisconnecting() is called during processing of connect() call, resulting 4678 * in an eventual call to soisconnected() if/when the connection is 4679 * established. When the connection is torn down soisdisconnecting() is 4680 * called during processing of disconnect() call, and soisdisconnected() is 4681 * called when the connection to the peer is totally severed. The semantics 4682 * of these routines are such that connectionless protocols can call 4683 * soisconnected() and soisdisconnected() only, bypassing the in-progress 4684 * calls when setting up a ``connection'' takes no time. 4685 * 4686 * From the passive side, a socket is created with two queues of sockets: 4687 * so_incomp for connections in progress and so_comp for connections already 4688 * made and awaiting user acceptance. As a protocol is preparing incoming 4689 * connections, it creates a socket structure queued on so_incomp by calling 4690 * sonewconn(). When the connection is established, soisconnected() is 4691 * called, and transfers the socket structure to so_comp, making it available 4692 * to accept(). 4693 * 4694 * If a socket is closed with sockets on either so_incomp or so_comp, these 4695 * sockets are dropped. 
4696 * 4697 * If higher-level protocols are implemented in the kernel, the wakeups done 4698 * here will sometimes cause software-interrupt process scheduling. 4699 */ 4700 void 4701 soisconnecting(struct socket *so) 4702 { 4703 4704 SOCK_LOCK(so); 4705 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 4706 so->so_state |= SS_ISCONNECTING; 4707 SOCK_UNLOCK(so); 4708 } 4709 4710 void 4711 soisconnected(struct socket *so) 4712 { 4713 bool last __diagused; 4714 4715 SOCK_LOCK(so); 4716 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); 4717 so->so_state |= SS_ISCONNECTED; 4718 4719 if (so->so_qstate == SQ_INCOMP) { 4720 struct socket *head = so->so_listen; 4721 int ret; 4722 4723 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 4724 /* 4725 * Promoting a socket from incomplete queue to complete, we 4726 * need to go through reverse order of locking. We first do 4727 * trylock, and if that doesn't succeed, we go the hard way 4728 * leaving a reference and rechecking consistency after proper 4729 * locking. 4730 */ 4731 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 4732 soref(head); 4733 SOCK_UNLOCK(so); 4734 SOLISTEN_LOCK(head); 4735 SOCK_LOCK(so); 4736 if (__predict_false(head != so->so_listen)) { 4737 /* 4738 * The socket went off the listen queue, 4739 * should be lost race to close(2) of sol. 4740 * The socket is about to soabort(). 
4741 */ 4742 SOCK_UNLOCK(so); 4743 sorele_locked(head); 4744 return; 4745 } 4746 last = refcount_release(&head->so_count); 4747 KASSERT(!last, ("%s: released last reference for %p", 4748 __func__, head)); 4749 } 4750 again: 4751 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 4752 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 4753 head->sol_incqlen--; 4754 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 4755 head->sol_qlen++; 4756 so->so_qstate = SQ_COMP; 4757 SOCK_UNLOCK(so); 4758 solisten_wakeup(head); /* unlocks */ 4759 } else { 4760 SOCK_RECVBUF_LOCK(so); 4761 soupcall_set(so, SO_RCV, 4762 head->sol_accept_filter->accf_callback, 4763 head->sol_accept_filter_arg); 4764 so->so_options &= ~SO_ACCEPTFILTER; 4765 ret = head->sol_accept_filter->accf_callback(so, 4766 head->sol_accept_filter_arg, M_NOWAIT); 4767 if (ret == SU_ISCONNECTED) { 4768 soupcall_clear(so, SO_RCV); 4769 SOCK_RECVBUF_UNLOCK(so); 4770 goto again; 4771 } 4772 SOCK_RECVBUF_UNLOCK(so); 4773 SOCK_UNLOCK(so); 4774 SOLISTEN_UNLOCK(head); 4775 } 4776 return; 4777 } 4778 SOCK_UNLOCK(so); 4779 wakeup(&so->so_timeo); 4780 sorwakeup(so); 4781 sowwakeup(so); 4782 } 4783 4784 void 4785 soisdisconnecting(struct socket *so) 4786 { 4787 4788 SOCK_LOCK(so); 4789 so->so_state &= ~SS_ISCONNECTING; 4790 so->so_state |= SS_ISDISCONNECTING; 4791 4792 if (!SOLISTENING(so)) { 4793 SOCK_RECVBUF_LOCK(so); 4794 socantrcvmore_locked(so); 4795 SOCK_SENDBUF_LOCK(so); 4796 socantsendmore_locked(so); 4797 } 4798 SOCK_UNLOCK(so); 4799 wakeup(&so->so_timeo); 4800 } 4801 4802 void 4803 soisdisconnected(struct socket *so) 4804 { 4805 4806 SOCK_LOCK(so); 4807 4808 /* 4809 * There is at least one reader of so_state that does not 4810 * acquire socket lock, namely soreceive_generic(). Ensure 4811 * that it never sees all flags that track connection status 4812 * cleared, by ordering the update with a barrier semantic of 4813 * our release thread fence. 
4814 */ 4815 so->so_state |= SS_ISDISCONNECTED; 4816 atomic_thread_fence_rel(); 4817 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 4818 4819 if (!SOLISTENING(so)) { 4820 SOCK_UNLOCK(so); 4821 SOCK_RECVBUF_LOCK(so); 4822 socantrcvmore_locked(so); 4823 SOCK_SENDBUF_LOCK(so); 4824 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 4825 socantsendmore_locked(so); 4826 } else 4827 SOCK_UNLOCK(so); 4828 wakeup(&so->so_timeo); 4829 } 4830 4831 int 4832 soiolock(struct socket *so, struct sx *sx, int flags) 4833 { 4834 int error; 4835 4836 KASSERT((flags & SBL_VALID) == flags, 4837 ("soiolock: invalid flags %#x", flags)); 4838 4839 if ((flags & SBL_WAIT) != 0) { 4840 if ((flags & SBL_NOINTR) != 0) { 4841 sx_xlock(sx); 4842 } else { 4843 error = sx_xlock_sig(sx); 4844 if (error != 0) 4845 return (error); 4846 } 4847 } else if (!sx_try_xlock(sx)) { 4848 return (EWOULDBLOCK); 4849 } 4850 4851 if (__predict_false(SOLISTENING(so))) { 4852 sx_xunlock(sx); 4853 return (ENOTCONN); 4854 } 4855 return (0); 4856 } 4857 4858 void 4859 soiounlock(struct sx *sx) 4860 { 4861 sx_xunlock(sx); 4862 } 4863 4864 /* 4865 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 4866 */ 4867 struct sockaddr * 4868 sodupsockaddr(const struct sockaddr *sa, int mflags) 4869 { 4870 struct sockaddr *sa2; 4871 4872 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 4873 if (sa2) 4874 bcopy(sa, sa2, sa->sa_len); 4875 return sa2; 4876 } 4877 4878 /* 4879 * Register per-socket destructor. 4880 */ 4881 void 4882 sodtor_set(struct socket *so, so_dtor_t *func) 4883 { 4884 4885 SOCK_LOCK_ASSERT(so); 4886 so->so_dtor = func; 4887 } 4888 4889 /* 4890 * Register per-socket buffer upcalls. 
4891 */ 4892 void 4893 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) 4894 { 4895 struct sockbuf *sb; 4896 4897 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4898 4899 switch (which) { 4900 case SO_RCV: 4901 sb = &so->so_rcv; 4902 break; 4903 case SO_SND: 4904 sb = &so->so_snd; 4905 break; 4906 } 4907 SOCK_BUF_LOCK_ASSERT(so, which); 4908 sb->sb_upcall = func; 4909 sb->sb_upcallarg = arg; 4910 sb->sb_flags |= SB_UPCALL; 4911 } 4912 4913 void 4914 soupcall_clear(struct socket *so, sb_which which) 4915 { 4916 struct sockbuf *sb; 4917 4918 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4919 4920 switch (which) { 4921 case SO_RCV: 4922 sb = &so->so_rcv; 4923 break; 4924 case SO_SND: 4925 sb = &so->so_snd; 4926 break; 4927 } 4928 SOCK_BUF_LOCK_ASSERT(so, which); 4929 KASSERT(sb->sb_upcall != NULL, 4930 ("%s: so %p no upcall to clear", __func__, so)); 4931 sb->sb_upcall = NULL; 4932 sb->sb_upcallarg = NULL; 4933 sb->sb_flags &= ~SB_UPCALL; 4934 } 4935 4936 void 4937 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) 4938 { 4939 4940 SOLISTEN_LOCK_ASSERT(so); 4941 so->sol_upcall = func; 4942 so->sol_upcallarg = arg; 4943 } 4944 4945 static void 4946 so_rdknl_lock(void *arg) 4947 { 4948 struct socket *so = arg; 4949 4950 retry: 4951 if (SOLISTENING(so)) { 4952 SOLISTEN_LOCK(so); 4953 } else { 4954 SOCK_RECVBUF_LOCK(so); 4955 if (__predict_false(SOLISTENING(so))) { 4956 SOCK_RECVBUF_UNLOCK(so); 4957 goto retry; 4958 } 4959 } 4960 } 4961 4962 static void 4963 so_rdknl_unlock(void *arg) 4964 { 4965 struct socket *so = arg; 4966 4967 if (SOLISTENING(so)) 4968 SOLISTEN_UNLOCK(so); 4969 else 4970 SOCK_RECVBUF_UNLOCK(so); 4971 } 4972 4973 static void 4974 so_rdknl_assert_lock(void *arg, int what) 4975 { 4976 struct socket *so = arg; 4977 4978 if (what == LA_LOCKED) { 4979 if (SOLISTENING(so)) 4980 SOLISTEN_LOCK_ASSERT(so); 4981 else 4982 SOCK_RECVBUF_LOCK_ASSERT(so); 4983 } else { 4984 if 
(SOLISTENING(so)) 4985 SOLISTEN_UNLOCK_ASSERT(so); 4986 else 4987 SOCK_RECVBUF_UNLOCK_ASSERT(so); 4988 } 4989 } 4990 4991 static void 4992 so_wrknl_lock(void *arg) 4993 { 4994 struct socket *so = arg; 4995 4996 retry: 4997 if (SOLISTENING(so)) { 4998 SOLISTEN_LOCK(so); 4999 } else { 5000 SOCK_SENDBUF_LOCK(so); 5001 if (__predict_false(SOLISTENING(so))) { 5002 SOCK_SENDBUF_UNLOCK(so); 5003 goto retry; 5004 } 5005 } 5006 } 5007 5008 static void 5009 so_wrknl_unlock(void *arg) 5010 { 5011 struct socket *so = arg; 5012 5013 if (SOLISTENING(so)) 5014 SOLISTEN_UNLOCK(so); 5015 else 5016 SOCK_SENDBUF_UNLOCK(so); 5017 } 5018 5019 static void 5020 so_wrknl_assert_lock(void *arg, int what) 5021 { 5022 struct socket *so = arg; 5023 5024 if (what == LA_LOCKED) { 5025 if (SOLISTENING(so)) 5026 SOLISTEN_LOCK_ASSERT(so); 5027 else 5028 SOCK_SENDBUF_LOCK_ASSERT(so); 5029 } else { 5030 if (SOLISTENING(so)) 5031 SOLISTEN_UNLOCK_ASSERT(so); 5032 else 5033 SOCK_SENDBUF_UNLOCK_ASSERT(so); 5034 } 5035 } 5036 5037 /* 5038 * Create an external-format (``xsocket'') structure using the information in 5039 * the kernel-format socket structure pointed to by so. This is done to 5040 * reduce the spew of irrelevant information over this interface, to isolate 5041 * user code from changes in the kernel structure, and potentially to provide 5042 * information-hiding if we decide that some of this information should be 5043 * hidden from users. 
5044 */ 5045 void 5046 sotoxsocket(struct socket *so, struct xsocket *xso) 5047 { 5048 5049 bzero(xso, sizeof(*xso)); 5050 xso->xso_len = sizeof *xso; 5051 xso->xso_so = (uintptr_t)so; 5052 xso->so_type = so->so_type; 5053 xso->so_options = so->so_options; 5054 xso->so_linger = so->so_linger; 5055 xso->so_state = so->so_state; 5056 xso->so_pcb = (uintptr_t)so->so_pcb; 5057 xso->xso_protocol = so->so_proto->pr_protocol; 5058 xso->xso_family = so->so_proto->pr_domain->dom_family; 5059 xso->so_timeo = so->so_timeo; 5060 xso->so_error = so->so_error; 5061 xso->so_uid = so->so_cred->cr_uid; 5062 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 5063 SOCK_LOCK(so); 5064 xso->so_fibnum = so->so_fibnum; 5065 if (SOLISTENING(so)) { 5066 xso->so_qlen = so->sol_qlen; 5067 xso->so_incqlen = so->sol_incqlen; 5068 xso->so_qlimit = so->sol_qlimit; 5069 xso->so_oobmark = 0; 5070 } else { 5071 xso->so_state |= so->so_qstate; 5072 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; 5073 xso->so_oobmark = so->so_oobmark; 5074 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 5075 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 5076 if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) 5077 xso->so_splice_so = (uintptr_t)so->so_splice->dst; 5078 } 5079 SOCK_UNLOCK(so); 5080 } 5081 5082 int 5083 so_options_get(const struct socket *so) 5084 { 5085 5086 return (so->so_options); 5087 } 5088 5089 void 5090 so_options_set(struct socket *so, int val) 5091 { 5092 5093 so->so_options = val; 5094 } 5095 5096 int 5097 so_error_get(const struct socket *so) 5098 { 5099 5100 return (so->so_error); 5101 } 5102 5103 void 5104 so_error_set(struct socket *so, int val) 5105 { 5106 5107 so->so_error = val; 5108 } 5109