/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn(). Socket layer private.
40 * 41 * sodealloc() tears down socket layer state for a socket, called only by 42 * sofree() and sonewconn(). Socket layer private. 43 * 44 * pr_attach() associates protocol layer state with an allocated socket; 45 * called only once, may fail, aborting socket allocation. This is called 46 * from socreate() and sonewconn(). Socket layer private. 47 * 48 * pr_detach() disassociates protocol layer state from an attached socket, 49 * and will be called exactly once for sockets in which pr_attach() has 50 * been successfully called. If pr_attach() returned an error, 51 * pr_detach() will not be called. Socket layer private. 52 * 53 * pr_abort() and pr_close() notify the protocol layer that the last 54 * consumer of a socket is starting to tear down the socket, and that the 55 * protocol should terminate the connection. Historically, pr_abort() also 56 * detached protocol state from the socket state, but this is no longer the 57 * case. pr_fdclose() is called when userspace invokes close(2) on a socket 58 * file descriptor. 59 * 60 * socreate() creates a socket and attaches protocol state. This is a public 61 * interface that may be used by socket layer consumers to create new 62 * sockets. 63 * 64 * sonewconn() creates a socket and attaches protocol state. This is a 65 * public interface that may be used by protocols to create new sockets when 66 * a new connection is received and will be available for accept() on a 67 * listen socket. 68 * 69 * soclose() destroys a socket after possibly waiting for it to disconnect. 70 * This is a public interface that socket consumers should use to close and 71 * release a socket when done with it. 72 * 73 * soabort() destroys a socket without waiting for it to disconnect (used 74 * only for incoming connections that are already partially or fully 75 * connected). 
This is used internally by the socket layer when clearing 76 * listen socket queues (due to overflow or close on the listen socket), but 77 * is also a public interface protocols may use to abort connections in 78 * their incomplete listen queues should they no longer be required. Sockets 79 * placed in completed connection listen queues should not be aborted for 80 * reasons described in the comment above the soclose() implementation. This 81 * is not a general purpose close routine, and except in the specific 82 * circumstances described here, should not be used. 83 * 84 * sofree() will free a socket and its protocol state if all references on 85 * the socket have been released, and is the public interface to attempt to 86 * free a socket when a reference is removed. This is a socket layer private 87 * interface. 88 * 89 * NOTE: In addition to socreate() and soclose(), which provide a single 90 * socket reference to the consumer to be managed as required, there are two 91 * calls to explicitly manage socket references, soref(), and sorele(). 92 * Currently, these are generally required only when transitioning a socket 93 * from a listen queue to a file descriptor, in order to prevent garbage 94 * collection of the socket at an untimely moment. For a number of reasons, 95 * these interfaces are not preferred, and should be avoided. 96 * 97 * NOTE: With regard to VNETs the general rule is that callers do not set 98 * curvnet. Exceptions to this rule include soabort(), sodisconnect(), 99 * sofree(), sorele(), sonewconn() and sorflush(), which are usually called 100 * from a pre-set VNET context. sopoll_generic() currently does not need a 101 * VNET context to be set. 
102 */ 103 104 #include <sys/cdefs.h> 105 #include "opt_inet.h" 106 #include "opt_inet6.h" 107 #include "opt_kern_tls.h" 108 #include "opt_ktrace.h" 109 #include "opt_sctp.h" 110 111 #include <sys/param.h> 112 #include <sys/systm.h> 113 #include <sys/capsicum.h> 114 #include <sys/fcntl.h> 115 #include <sys/limits.h> 116 #include <sys/lock.h> 117 #include <sys/mac.h> 118 #include <sys/malloc.h> 119 #include <sys/mbuf.h> 120 #include <sys/mutex.h> 121 #include <sys/domain.h> 122 #include <sys/file.h> /* for struct knote */ 123 #include <sys/hhook.h> 124 #include <sys/kernel.h> 125 #include <sys/khelp.h> 126 #include <sys/kthread.h> 127 #include <sys/ktls.h> 128 #include <sys/event.h> 129 #include <sys/eventhandler.h> 130 #include <sys/poll.h> 131 #include <sys/proc.h> 132 #include <sys/protosw.h> 133 #include <sys/sbuf.h> 134 #include <sys/socket.h> 135 #include <sys/socketvar.h> 136 #include <sys/resourcevar.h> 137 #include <net/route.h> 138 #include <sys/sched.h> 139 #include <sys/signalvar.h> 140 #include <sys/smp.h> 141 #include <sys/stat.h> 142 #include <sys/sx.h> 143 #include <sys/sysctl.h> 144 #include <sys/taskqueue.h> 145 #include <sys/uio.h> 146 #include <sys/un.h> 147 #include <sys/unpcb.h> 148 #include <sys/jail.h> 149 #include <sys/syslog.h> 150 #include <netinet/in.h> 151 #include <netinet/in_pcb.h> 152 #include <netinet/tcp.h> 153 154 #include <net/vnet.h> 155 156 #include <security/mac/mac_framework.h> 157 #include <security/mac/mac_internal.h> 158 159 #include <vm/uma.h> 160 161 #ifdef COMPAT_FREEBSD32 162 #include <sys/mount.h> 163 #include <sys/sysent.h> 164 #include <compat/freebsd32/freebsd32.h> 165 #endif 166 167 static int soreceive_generic_locked(struct socket *so, 168 struct sockaddr **psa, struct uio *uio, struct mbuf **mp, 169 struct mbuf **controlp, int *flagsp); 170 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 171 int flags); 172 static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, 173 struct 
sockaddr **psa, struct uio *uio, struct mbuf **mp, 174 struct mbuf **controlp, int flags); 175 static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, 176 struct uio *uio, struct mbuf *top, struct mbuf *control, 177 int flags, struct thread *td); 178 static void so_rdknl_lock(void *); 179 static void so_rdknl_unlock(void *); 180 static void so_rdknl_assert_lock(void *, int); 181 static void so_wrknl_lock(void *); 182 static void so_wrknl_unlock(void *); 183 static void so_wrknl_assert_lock(void *, int); 184 185 static void filt_sordetach(struct knote *kn); 186 static int filt_soread(struct knote *kn, long hint); 187 static void filt_sowdetach(struct knote *kn); 188 static int filt_sowrite(struct knote *kn, long hint); 189 static int filt_soempty(struct knote *kn, long hint); 190 191 static const struct filterops soread_filtops = { 192 .f_isfd = 1, 193 .f_detach = filt_sordetach, 194 .f_event = filt_soread, 195 .f_copy = knote_triv_copy, 196 }; 197 static const struct filterops sowrite_filtops = { 198 .f_isfd = 1, 199 .f_detach = filt_sowdetach, 200 .f_event = filt_sowrite, 201 .f_copy = knote_triv_copy, 202 }; 203 static const struct filterops soempty_filtops = { 204 .f_isfd = 1, 205 .f_detach = filt_sowdetach, 206 .f_event = filt_soempty, 207 .f_copy = knote_triv_copy, 208 }; 209 210 so_gen_t so_gencnt; /* generation count for sockets */ 211 212 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 213 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 214 215 #define VNET_SO_ASSERT(so) \ 216 VNET_ASSERT(curvnet != NULL, \ 217 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 218 219 #ifdef SOCKET_HHOOK 220 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 221 #define V_socket_hhh VNET(socket_hhh) 222 static inline int hhook_run_socket(struct socket *, void *, int32_t); 223 #endif 224 225 #ifdef COMPAT_FREEBSD32 226 #ifdef __amd64__ 227 /* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. 
*/ 228 #define __splice32_packed __packed 229 #else 230 #define __splice32_packed 231 #endif 232 struct splice32 { 233 int32_t sp_fd; 234 int64_t sp_max; 235 struct timeval32 sp_idle; 236 } __splice32_packed; 237 #undef __splice32_packed 238 #endif 239 240 /* 241 * Limit on the number of connections in the listen queue waiting 242 * for accept(2). 243 * NB: The original sysctl somaxconn is still available but hidden 244 * to prevent confusion about the actual purpose of this number. 245 */ 246 VNET_DEFINE_STATIC(u_int, somaxconn) = SOMAXCONN; 247 #define V_somaxconn VNET(somaxconn) 248 249 static int 250 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 251 { 252 int error; 253 u_int val; 254 255 val = V_somaxconn; 256 error = sysctl_handle_int(oidp, &val, 0, req); 257 if (error || !req->newptr ) 258 return (error); 259 260 /* 261 * The purpose of the UINT_MAX / 3 limit, is so that the formula 262 * 3 * sol_qlimit / 2 263 * below, will not overflow. 264 */ 265 266 if (val < 1 || val > UINT_MAX / 3) 267 return (EINVAL); 268 269 V_somaxconn = val; 270 return (0); 271 } 272 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, 273 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, sizeof(u_int), 274 sysctl_somaxconn, "IU", 275 "Maximum listen socket pending connection accept queue size"); 276 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 277 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, 278 sizeof(u_int), sysctl_somaxconn, "IU", 279 "Maximum listen socket pending connection accept queue size (compat)"); 280 281 static u_int numopensockets; 282 static int 283 sysctl_numopensockets(SYSCTL_HANDLER_ARGS) 284 { 285 u_int val; 286 287 #ifdef VIMAGE 288 if(!IS_DEFAULT_VNET(curvnet)) 289 val = curvnet->vnet_sockcnt; 290 else 291 #endif 292 val = numopensockets; 293 return (sysctl_handle_int(oidp, &val, 0, req)); 294 } 295 SYSCTL_PROC(_kern_ipc, OID_AUTO, numopensockets, 296 CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, 
sizeof(u_int), 297 sysctl_numopensockets, "IU", "Number of open sockets"); 298 299 /* 300 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 301 * so_gencnt field. 302 */ 303 static struct mtx so_global_mtx; 304 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 305 306 /* 307 * General IPC sysctl name space, used by sockets and a variety of other IPC 308 * types. 309 */ 310 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 311 "IPC"); 312 313 /* 314 * Initialize the socket subsystem and set up the socket 315 * memory allocator. 316 */ 317 static uma_zone_t socket_zone; 318 int maxsockets; 319 320 static void 321 socket_zone_change(void *tag) 322 { 323 324 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 325 } 326 327 static int splice_init_state; 328 static struct sx splice_init_lock; 329 SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init"); 330 331 static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0, 332 "Settings relating to the SO_SPLICE socket option"); 333 334 static bool splice_receive_stream = true; 335 SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN, 336 &splice_receive_stream, 0, 337 "Use soreceive_stream() for stream splices"); 338 339 static int splice_num_wq = -1; 340 static int 341 sysctl_splice_num_wq(SYSCTL_HANDLER_ARGS) 342 { 343 int error, new; 344 345 new = splice_num_wq; 346 error = sysctl_handle_int(oidp, &new, 0, req); 347 if (error == 0 && req->newptr && new != splice_num_wq) { 348 if (!cold) 349 sx_xlock(&splice_init_lock); 350 if (new < -1 || new > mp_ncpus || 351 (new <= 0 && splice_init_state != 0)) { 352 error = EINVAL; 353 } else { 354 splice_num_wq = new; 355 } 356 if (!cold) 357 sx_xunlock(&splice_init_lock); 358 } 359 return (error); 360 } 361 SYSCTL_PROC(_kern_ipc_splice, OID_AUTO, num_wq, 362 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 363 &splice_num_wq, 0, sysctl_splice_num_wq, "IU", 364 "Number of splice worker queues"); 365 366 
static uma_zone_t splice_zone; 367 static struct proc *splice_proc; 368 struct splice_wq { 369 struct mtx mtx; 370 STAILQ_HEAD(, so_splice) head; 371 bool running; 372 } __aligned(CACHE_LINE_SIZE); 373 static struct splice_wq *splice_wq; 374 static uint32_t splice_index = 0; 375 376 static void so_splice_timeout(void *arg, int pending); 377 static void so_splice_xfer(struct so_splice *s); 378 static int so_unsplice(struct socket *so, bool timeout); 379 380 static void 381 splice_work_thread(void *ctx) 382 { 383 struct splice_wq *wq = ctx; 384 struct so_splice *s, *s_temp; 385 STAILQ_HEAD(, so_splice) local_head; 386 int cpu; 387 388 cpu = wq - splice_wq; 389 if (bootverbose) 390 printf("starting so_splice worker thread for CPU %d\n", cpu); 391 392 for (;;) { 393 mtx_lock(&wq->mtx); 394 while (STAILQ_EMPTY(&wq->head)) { 395 wq->running = false; 396 mtx_sleep(wq, &wq->mtx, 0, "-", 0); 397 wq->running = true; 398 } 399 STAILQ_INIT(&local_head); 400 STAILQ_CONCAT(&local_head, &wq->head); 401 STAILQ_INIT(&wq->head); 402 mtx_unlock(&wq->mtx); 403 STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) { 404 mtx_lock(&s->mtx); 405 CURVNET_SET(s->src->so_vnet); 406 so_splice_xfer(s); 407 CURVNET_RESTORE(); 408 } 409 } 410 } 411 412 static void 413 so_splice_dispatch_async(struct so_splice *sp) 414 { 415 struct splice_wq *wq; 416 bool running; 417 418 wq = &splice_wq[sp->wq_index]; 419 mtx_lock(&wq->mtx); 420 STAILQ_INSERT_TAIL(&wq->head, sp, next); 421 running = wq->running; 422 mtx_unlock(&wq->mtx); 423 if (!running) 424 wakeup(wq); 425 } 426 427 void 428 so_splice_dispatch(struct so_splice *sp) 429 { 430 mtx_assert(&sp->mtx, MA_OWNED); 431 432 if (sp->state != SPLICE_IDLE) { 433 mtx_unlock(&sp->mtx); 434 } else { 435 sp->state = SPLICE_QUEUED; 436 mtx_unlock(&sp->mtx); 437 so_splice_dispatch_async(sp); 438 } 439 } 440 441 static int 442 splice_zinit(void *mem, int size __unused, int flags __unused) 443 { 444 struct so_splice *s; 445 446 s = (struct so_splice *)mem; 447 
mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF); 448 return (0); 449 } 450 451 static void 452 splice_zfini(void *mem, int size) 453 { 454 struct so_splice *s; 455 456 s = (struct so_splice *)mem; 457 mtx_destroy(&s->mtx); 458 } 459 460 static int 461 splice_init(void) 462 { 463 struct thread *td; 464 int error, i, state; 465 466 state = atomic_load_acq_int(&splice_init_state); 467 if (__predict_true(state > 0)) 468 return (0); 469 if (state < 0) 470 return (ENXIO); 471 sx_xlock(&splice_init_lock); 472 if (splice_init_state != 0) { 473 sx_xunlock(&splice_init_lock); 474 return (0); 475 } 476 477 if (splice_num_wq == -1) { 478 /* if no user preference, use all cores */ 479 splice_num_wq = mp_ncpus; 480 } else if (splice_num_wq == 0) { 481 /* allow user to disable */ 482 splice_init_state = -1; 483 sx_xunlock(&splice_init_lock); 484 return (ENXIO); 485 } else if (splice_num_wq > mp_ncpus) { 486 splice_num_wq = mp_ncpus; 487 } 488 489 splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL, 490 NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0); 491 492 splice_wq = mallocarray(mp_ncpus, sizeof(*splice_wq), M_TEMP, 493 M_WAITOK | M_ZERO); 494 495 /* 496 * Initialize the workqueues to run the splice work. We create a 497 * work queue for each CPU. 498 */ 499 for (i = 0; i < mp_ncpus; i++) { 500 STAILQ_INIT(&splice_wq[i].head); 501 mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF); 502 } 503 504 /* Start kthreads for each workqueue. */ 505 error = 0; 506 for (i = 0; i < mp_ncpus; i++) { 507 error = kproc_kthread_add(splice_work_thread, &splice_wq[i], 508 &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i); 509 if (error) { 510 printf("Can't add so_splice thread %d error %d\n", 511 i, error); 512 break; 513 } 514 515 /* 516 * It's possible to create loops with SO_SPLICE; ensure that 517 * worker threads aren't able to starve the system too easily. 
518 */ 519 thread_lock(td); 520 sched_prio(td, PUSER); 521 thread_unlock(td); 522 } 523 524 splice_init_state = error != 0 ? -1 : 1; 525 sx_xunlock(&splice_init_lock); 526 527 return (error); 528 } 529 530 /* 531 * Lock a pair of socket's I/O locks for splicing. Avoid blocking while holding 532 * one lock in order to avoid potential deadlocks in case there is some other 533 * code path which acquires more than one I/O lock at a time. 534 */ 535 static void 536 splice_lock_pair(struct socket *so_src, struct socket *so_dst) 537 { 538 int error; 539 540 for (;;) { 541 error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR); 542 KASSERT(error == 0, 543 ("%s: failed to lock send I/O lock: %d", __func__, error)); 544 error = SOCK_IO_RECV_LOCK(so_src, 0); 545 KASSERT(error == 0 || error == EWOULDBLOCK, 546 ("%s: failed to lock recv I/O lock: %d", __func__, error)); 547 if (error == 0) 548 break; 549 SOCK_IO_SEND_UNLOCK(so_dst); 550 551 error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR); 552 KASSERT(error == 0, 553 ("%s: failed to lock recv I/O lock: %d", __func__, error)); 554 error = SOCK_IO_SEND_LOCK(so_dst, 0); 555 KASSERT(error == 0 || error == EWOULDBLOCK, 556 ("%s: failed to lock send I/O lock: %d", __func__, error)); 557 if (error == 0) 558 break; 559 SOCK_IO_RECV_UNLOCK(so_src); 560 } 561 } 562 563 static void 564 splice_unlock_pair(struct socket *so_src, struct socket *so_dst) 565 { 566 SOCK_IO_RECV_UNLOCK(so_src); 567 SOCK_IO_SEND_UNLOCK(so_dst); 568 } 569 570 /* 571 * Move data from the source to the sink. Assumes that both of the relevant 572 * socket I/O locks are held. 
/*
 * Move data from the source to the sink. Assumes that both of the relevant
 * socket I/O locks are held.
 *
 * At most min(max, free space in the sink's send buffer, bytes available in
 * the source's receive buffer) bytes are taken from so_src as an mbuf chain
 * and handed to so_dst.  Both the receive and the send are issued with
 * MSG_DONTWAIT.
 *
 * On success 0 is returned and *lenp holds the byte count computed for this
 * call (0 when no mbufs were obtained).  On failure an errno value is
 * returned and *lenp is left untouched.  EPIPE is reported when nothing
 * could be moved and the source has SBS_CANTRCVMORE set, or when no mbufs
 * were obtained and the sink has SBS_CANTSENDMORE set.
 */
static int
so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max,
    ssize_t *lenp)
{
	struct uio uio;
	struct mbuf *m;
	struct sockbuf *sb_src, *sb_dst;
	ssize_t len;
	long space;
	int error, flags;

	SOCK_IO_RECV_ASSERT_LOCKED(so_src);
	SOCK_IO_SEND_ASSERT_LOCKED(so_dst);

	error = 0;
	m = NULL;
	memset(&uio, 0, sizeof(uio));

	sb_src = &so_src->so_rcv;
	sb_dst = &so_dst->so_snd;

	/* Clamp a negative space value to zero before using it as a bound. */
	space = sbspace(sb_dst);
	if (space < 0)
		space = 0;
	len = MIN(max, MIN(space, sbavail(sb_src)));
	if (len == 0) {
		/* Nothing to move: check whether the source is finished. */
		SOCK_RECVBUF_LOCK(so_src);
		if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0)
			error = EPIPE;
		SOCK_RECVBUF_UNLOCK(so_src);
	} else {
		flags = MSG_DONTWAIT;
		uio.uio_resid = len;
		/*
		 * The stream receive path is used only when the sysctl knob
		 * allows it and the source buffer has no sb_tls_info attached;
		 * otherwise fall back to the generic receive path.
		 */
		if (splice_receive_stream && sb_src->sb_tls_info == NULL) {
			error = soreceive_stream_locked(so_src, sb_src, NULL,
			    &uio, &m, NULL, flags);
		} else {
			error = soreceive_generic_locked(so_src, NULL,
			    &uio, &m, NULL, &flags);
		}
		/* Don't forward (or leak) a chain from a failed receive. */
		if (error != 0 && m != NULL) {
			m_freem(m);
			m = NULL;
		}
	}
	if (m != NULL) {
		/*
		 * NOTE(review): presumably the receive routine leaves in
		 * uio_resid the part of the request it did not dequeue, so
		 * this yields the number of bytes in "m" -- confirm.
		 */
		len -= uio.uio_resid;
		error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL,
		    MSG_DONTWAIT, curthread);
	} else if (error == 0) {
		/* No data and no error: check whether the sink is finished. */
		len = 0;
		SOCK_SENDBUF_LOCK(so_dst);
		if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0)
			error = EPIPE;
		SOCK_SENDBUF_UNLOCK(so_dst);
	}
	/* The output parameter is written only on success. */
	if (error == 0)
		*lenp = len;
	return (error);
}
/*
 * Transfer data from the source to the sink.
 *
 * Runs one transfer step for the splice "sp" and then advances its state.
 * Must be called with sp->mtx held and sp->state either SPLICE_QUEUED or
 * SPLICE_CLOSING; the mutex is released on every path before returning.
 *
 * Resulting state:
 *  - SPLICE_CLOSED (with a wakeup on sp) if a close was requested before or
 *    during the transfer;
 *  - SPLICE_EXCEPTION followed by so_unsplice() if the transfer failed or
 *    the configured byte limit (sp->max > 0) has been reached;
 *  - SPLICE_QUEUED (and re-dispatched) if more data and space are visible;
 *  - SPLICE_IDLE otherwise.
 */
static void
so_splice_xfer(struct so_splice *sp)
{
	struct socket *so_src, *so_dst;
	off_t max;
	ssize_t len;
	int error;

	mtx_assert(&sp->mtx, MA_OWNED);
	KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING,
	    ("so_splice_xfer: invalid state %d", sp->state));
	KASSERT(sp->max != 0, ("so_splice_xfer: max == 0"));

	if (sp->state == SPLICE_CLOSING) {
		/* Userspace asked us to close the splice. */
		goto closing;
	}

	sp->state = SPLICE_RUNNING;
	so_src = sp->src;
	so_dst = sp->dst;
	/*
	 * A positive sp->max is a byte budget: only what is left of it may
	 * be moved.  Any other value means "no limit".
	 */
	max = sp->max > 0 ? sp->max - so_src->so_splice_sent : OFF_MAX;
	if (max < 0)
		max = 0;

	/*
	 * Lock the sockets in order to block userspace from doing anything
	 * sneaky. If an error occurs or one of the sockets can no longer
	 * transfer data, we will automatically unsplice.
	 */
	mtx_unlock(&sp->mtx);
	splice_lock_pair(so_src, so_dst);

	/* "len" is only valid when this returns 0. */
	error = so_splice_xfer_data(so_src, so_dst, max, &len);

	mtx_lock(&sp->mtx);

	/*
	 * Update our stats while still holding the socket locks. This
	 * synchronizes with getsockopt(SO_SPLICE), see the comment there.
	 */
	if (error == 0) {
		KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len));
		so_src->so_splice_sent += len;
	}
	splice_unlock_pair(so_src, so_dst);

	/*
	 * The splice mutex was dropped during the transfer, so the state is
	 * re-examined here rather than assumed to still be SPLICE_RUNNING.
	 */
	switch (sp->state) {
	case SPLICE_CLOSING:
closing:
		/* Also the target of the early "goto closing" above. */
		sp->state = SPLICE_CLOSED;
		wakeup(sp);
		mtx_unlock(&sp->mtx);
		break;
	case SPLICE_RUNNING:
		if (error != 0 ||
		    (sp->max > 0 && so_src->so_splice_sent >= sp->max)) {
			/*
			 * Failure or byte limit reached: tear the splice
			 * down.  A reference on the source socket is held
			 * across the so_unsplice() call.
			 */
			sp->state = SPLICE_EXCEPTION;
			soref(so_src);
			mtx_unlock(&sp->mtx);
			(void)so_unsplice(so_src, false);
			sorele(so_src);
		} else {
			/*
			 * Locklessly check for additional bytes in the source's
			 * receive buffer and queue more work if possible. We
			 * may end up queuing needless work, but that's ok, and
			 * if we race with a thread inserting more data into the
			 * buffer and observe sbavail() == 0, the splice mutex
			 * ensures that splice_push() will queue more work for
			 * us.
			 */
			if (sbavail(&so_src->so_rcv) > 0 &&
			    sbspace(&so_dst->so_snd) > 0) {
				sp->state = SPLICE_QUEUED;
				mtx_unlock(&sp->mtx);
				so_splice_dispatch_async(sp);
			} else {
				sp->state = SPLICE_IDLE;
				mtx_unlock(&sp->mtx);
			}
		}
		break;
	default:
		__assert_unreachable();
	}
}
776 socket_hhook_deregister(i); 777 } 778 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 779 socket_vnet_uninit, NULL); 780 #endif /* SOCKET_HHOOK */ 781 782 /* 783 * Initialise maxsockets. This SYSINIT must be run after 784 * tunable_mbinit(). 785 */ 786 static void 787 init_maxsockets(void *ignored) 788 { 789 790 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 791 maxsockets = imax(maxsockets, maxfiles); 792 } 793 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 794 795 /* 796 * Sysctl to get and set the maximum global sockets limit. Notify protocols 797 * of the change so that they can update their dependent limits as required. 798 */ 799 static int 800 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 801 { 802 int error, newmaxsockets; 803 804 newmaxsockets = maxsockets; 805 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 806 if (error == 0 && req->newptr && newmaxsockets != maxsockets) { 807 if (newmaxsockets > maxsockets && 808 newmaxsockets <= maxfiles) { 809 maxsockets = newmaxsockets; 810 EVENTHANDLER_INVOKE(maxsockets_change); 811 } else 812 error = EINVAL; 813 } 814 return (error); 815 } 816 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, 817 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 818 &maxsockets, 0, sysctl_maxsockets, "IU", 819 "Maximum number of sockets available"); 820 821 /* 822 * Socket operation routines. These routines are called by the routines in 823 * sys_socket.c or from a system process, and implement the semantics of 824 * socket operations by switching out to the protocol specific routines. 825 */ 826 827 /* 828 * Get a socket structure from our zone, and initialize it. Note that it 829 * would probably be better to allocate socket and PCB at the same time, but 830 * I'm not convinced that all the protocols can be easily modified to do 831 * this. 832 * 833 * soalloc() returns a socket with a ref count of 0. 
834 */ 835 static struct socket * 836 soalloc(struct vnet *vnet) 837 { 838 struct socket *so; 839 840 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 841 if (so == NULL) 842 return (NULL); 843 #ifdef MAC 844 if (mac_socket_init(so, M_NOWAIT) != 0) { 845 uma_zfree(socket_zone, so); 846 return (NULL); 847 } 848 #endif 849 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { 850 uma_zfree(socket_zone, so); 851 return (NULL); 852 } 853 854 /* 855 * The socket locking protocol allows to lock 2 sockets at a time, 856 * however, the first one must be a listening socket. WITNESS lacks 857 * a feature to change class of an existing lock, so we use DUPOK. 858 */ 859 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); 860 so->so_rcv.sb_sel = &so->so_rdsel; 861 so->so_snd.sb_sel = &so->so_wrsel; 862 sx_init(&so->so_snd_sx, "so_snd_sx"); 863 sx_init(&so->so_rcv_sx, "so_rcv_sx"); 864 TAILQ_INIT(&so->so_snd.sb_aiojobq); 865 TAILQ_INIT(&so->so_rcv.sb_aiojobq); 866 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); 867 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); 868 #ifdef VIMAGE 869 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 870 __func__, __LINE__, so)); 871 so->so_vnet = vnet; 872 #endif 873 #ifdef SOCKET_HHOOK 874 /* We shouldn't need the so_global_mtx */ 875 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { 876 /* Do we need more comprehensive error returns? */ 877 uma_zfree(socket_zone, so); 878 return (NULL); 879 } 880 #endif 881 mtx_lock(&so_global_mtx); 882 so->so_gencnt = ++so_gencnt; 883 ++numopensockets; 884 #ifdef VIMAGE 885 vnet->vnet_sockcnt++; 886 #endif 887 mtx_unlock(&so_global_mtx); 888 889 return (so); 890 } 891 892 /* 893 * Free the storage associated with a socket at the socket layer, tear down 894 * locks, labels, etc. All protocol state is assumed already to have been 895 * torn down (and possibly never set up) by the caller. 
896 */ 897 void 898 sodealloc(struct socket *so) 899 { 900 901 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 902 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 903 904 mtx_lock(&so_global_mtx); 905 so->so_gencnt = ++so_gencnt; 906 --numopensockets; /* Could be below, but faster here. */ 907 #ifdef VIMAGE 908 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 909 __func__, __LINE__, so)); 910 so->so_vnet->vnet_sockcnt--; 911 #endif 912 mtx_unlock(&so_global_mtx); 913 #ifdef MAC 914 mac_socket_destroy(so); 915 #endif 916 #ifdef SOCKET_HHOOK 917 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); 918 #endif 919 920 khelp_destroy_osd(&so->osd); 921 if (SOLISTENING(so)) { 922 if (so->sol_accept_filter != NULL) 923 accept_filt_setopt(so, NULL); 924 } else { 925 if (so->so_rcv.sb_hiwat) 926 (void)chgsbsize(so->so_cred->cr_uidinfo, 927 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 928 if (so->so_snd.sb_hiwat) 929 (void)chgsbsize(so->so_cred->cr_uidinfo, 930 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 931 sx_destroy(&so->so_snd_sx); 932 sx_destroy(&so->so_rcv_sx); 933 } 934 crfree(so->so_cred); 935 mtx_destroy(&so->so_lock); 936 uma_zfree(socket_zone, so); 937 } 938 939 /* 940 * Shim to accomodate protocols that already do their own socket buffers 941 * management (marked with PR_SOCKBUF) with protocols that yet do not. 942 * 943 * Attach via socket(2) is different from attach via accept(2). In case of 944 * normal socket(2) syscall it is the pr_attach that calls soreserve(), even 945 * for protocols that don't yet do PR_SOCKBUF. In case of accepted connection 946 * it is our shim that calls soreserve() and the hiwat values are taken from 947 * the parent socket. The SCTP's sopeeloff() hands us a non-listening parent 948 * socket. 949 * 950 * This whole shim should go away when all major protocols fully manage their 951 * socket buffers. 
952 */ 953 static int 954 soattach(struct socket *so, int proto, struct thread *td, struct socket *head) 955 { 956 int error; 957 958 VNET_ASSERT(curvnet == so->so_vnet, 959 ("%s: %p != %p", __func__, curvnet, so->so_vnet)); 960 961 if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 962 mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); 963 mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); 964 so->so_snd.sb_mtx = &so->so_snd_mtx; 965 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 966 } 967 if (head == NULL || (error = soreserve(so, 968 SOLISTENING(head) ? head->sol_sbsnd_hiwat : head->so_snd.sb_hiwat, 969 SOLISTENING(head) ? head->sol_sbrcv_hiwat : head->so_rcv.sb_hiwat)) 970 == 0) 971 error = so->so_proto->pr_attach(so, proto, td); 972 if (error != 0 && (so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 973 mtx_destroy(&so->so_snd_mtx); 974 mtx_destroy(&so->so_rcv_mtx); 975 } 976 977 return (error); 978 } 979 980 /* 981 * socreate returns a socket with a ref count of 1 and a file descriptor 982 * reference. The socket should be closed with soclose(). 983 */ 984 int 985 socreate(int dom, struct socket **aso, int type, int proto, 986 struct ucred *cred, struct thread *td) 987 { 988 struct protosw *prp; 989 struct socket *so; 990 int error; 991 992 prp = pffindproto(dom, type, proto); 993 if (prp == NULL) { 994 /* No support for domain. */ 995 if (pffinddomain(dom) == NULL) 996 return (EAFNOSUPPORT); 997 /* No support for socket type. 
*/ 998 if (proto == 0 && type != 0) 999 return (EPROTOTYPE); 1000 return (EPROTONOSUPPORT); 1001 } 1002 1003 MPASS(prp->pr_attach); 1004 1005 if ((prp->pr_flags & PR_CAPATTACH) == 0) { 1006 if (CAP_TRACING(td)) 1007 ktrcapfail(CAPFAIL_PROTO, &proto); 1008 if (IN_CAPABILITY_MODE(td)) 1009 return (ECAPMODE); 1010 } 1011 1012 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) 1013 return (EPROTONOSUPPORT); 1014 1015 so = soalloc(CRED_TO_VNET(cred)); 1016 if (so == NULL) 1017 return (ENOBUFS); 1018 1019 so->so_type = type; 1020 so->so_cred = crhold(cred); 1021 if ((prp->pr_domain->dom_family == PF_INET) || 1022 (prp->pr_domain->dom_family == PF_INET6) || 1023 (prp->pr_domain->dom_family == PF_ROUTE)) 1024 so->so_fibnum = td->td_proc->p_fibnum; 1025 else 1026 so->so_fibnum = 0; 1027 so->so_proto = prp; 1028 #ifdef MAC 1029 mac_socket_create(cred, so); 1030 #endif 1031 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1032 so_rdknl_assert_lock); 1033 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1034 so_wrknl_assert_lock); 1035 CURVNET_SET(so->so_vnet); 1036 error = soattach(so, proto, td, NULL); 1037 CURVNET_RESTORE(); 1038 if (error) { 1039 sodealloc(so); 1040 return (error); 1041 } 1042 soref(so); 1043 *aso = so; 1044 return (0); 1045 } 1046 1047 #ifdef REGRESSION 1048 static int regression_sonewconn_earlytest = 1; 1049 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 1050 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 1051 #endif 1052 1053 static int sooverprio = LOG_DEBUG; 1054 SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW, 1055 &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable"); 1056 1057 static struct timeval overinterval = { 60, 0 }; 1058 SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, 1059 &overinterval, 1060 "Delay in seconds between warnings for listen socket overflows"); 1061 1062 /* 1063 * When an 
attempt at a new connection is noted on a socket which supports 1064 * accept(2), the protocol has two options: 1065 * 1) Call legacy sonewconn() function, which would call protocol attach 1066 * method, same as used for socket(2). 1067 * 2) Call solisten_clone(), do attach that is specific to a cloned connection, 1068 * and then call solisten_enqueue(). 1069 * 1070 * Note: the ref count on the socket is 0 on return. 1071 */ 1072 struct socket * 1073 solisten_clone(struct socket *head) 1074 { 1075 struct sbuf descrsb; 1076 struct socket *so; 1077 int len, overcount; 1078 u_int qlen; 1079 const char localprefix[] = "local:"; 1080 char descrbuf[SUNPATHLEN + sizeof(localprefix)]; 1081 #if defined(INET6) 1082 char addrbuf[INET6_ADDRSTRLEN]; 1083 #elif defined(INET) 1084 char addrbuf[INET_ADDRSTRLEN]; 1085 #endif 1086 bool dolog, over; 1087 1088 SOLISTEN_LOCK(head); 1089 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 1090 #ifdef REGRESSION 1091 if (regression_sonewconn_earlytest && over) { 1092 #else 1093 if (over) { 1094 #endif 1095 head->sol_overcount++; 1096 dolog = (sooverprio >= 0) && 1097 !!ratecheck(&head->sol_lastover, &overinterval); 1098 1099 /* 1100 * If we're going to log, copy the overflow count and queue 1101 * length from the listen socket before dropping the lock. 1102 * Also, reset the overflow count. 1103 */ 1104 if (dolog) { 1105 overcount = head->sol_overcount; 1106 head->sol_overcount = 0; 1107 qlen = head->sol_qlen; 1108 } 1109 SOLISTEN_UNLOCK(head); 1110 1111 if (dolog) { 1112 /* 1113 * Try to print something descriptive about the 1114 * socket for the error message. 
1115 */ 1116 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), 1117 SBUF_FIXEDLEN); 1118 switch (head->so_proto->pr_domain->dom_family) { 1119 #if defined(INET) || defined(INET6) 1120 #ifdef INET 1121 case AF_INET: 1122 #endif 1123 #ifdef INET6 1124 case AF_INET6: 1125 if (head->so_proto->pr_domain->dom_family == 1126 AF_INET6 || 1127 (sotoinpcb(head)->inp_inc.inc_flags & 1128 INC_ISIPV6)) { 1129 ip6_sprintf(addrbuf, 1130 &sotoinpcb(head)->inp_inc.inc6_laddr); 1131 sbuf_printf(&descrsb, "[%s]", addrbuf); 1132 } else 1133 #endif 1134 { 1135 #ifdef INET 1136 inet_ntoa_r( 1137 sotoinpcb(head)->inp_inc.inc_laddr, 1138 addrbuf); 1139 sbuf_cat(&descrsb, addrbuf); 1140 #endif 1141 } 1142 sbuf_printf(&descrsb, ":%hu (proto %u)", 1143 ntohs(sotoinpcb(head)->inp_inc.inc_lport), 1144 head->so_proto->pr_protocol); 1145 break; 1146 #endif /* INET || INET6 */ 1147 case AF_UNIX: 1148 sbuf_cat(&descrsb, localprefix); 1149 if (sotounpcb(head)->unp_addr != NULL) 1150 len = 1151 sotounpcb(head)->unp_addr->sun_len - 1152 offsetof(struct sockaddr_un, 1153 sun_path); 1154 else 1155 len = 0; 1156 if (len > 0) 1157 sbuf_bcat(&descrsb, 1158 sotounpcb(head)->unp_addr->sun_path, 1159 len); 1160 else 1161 sbuf_cat(&descrsb, "(unknown)"); 1162 break; 1163 } 1164 1165 /* 1166 * If we can't print something more specific, at least 1167 * print the domain name. 1168 */ 1169 if (sbuf_finish(&descrsb) != 0 || 1170 sbuf_len(&descrsb) <= 0) { 1171 sbuf_clear(&descrsb); 1172 sbuf_cat(&descrsb, 1173 head->so_proto->pr_domain->dom_name ?: 1174 "unknown"); 1175 sbuf_finish(&descrsb); 1176 } 1177 KASSERT(sbuf_len(&descrsb) > 0, 1178 ("%s: sbuf creation failed", __func__)); 1179 /* 1180 * Preserve the historic listen queue overflow log 1181 * message, that starts with "sonewconn:". It has 1182 * been known to sysadmins for years and also test 1183 * sys/kern/sonewconn_overflow checks for it. 
1184 */ 1185 if (head->so_cred == 0) { 1186 log(LOG_PRI(sooverprio), 1187 "sonewconn: pcb %p (%s): " 1188 "Listen queue overflow: %i already in " 1189 "queue awaiting acceptance (%d " 1190 "occurrences)\n", head->so_pcb, 1191 sbuf_data(&descrsb), 1192 qlen, overcount); 1193 } else { 1194 log(LOG_PRI(sooverprio), 1195 "sonewconn: pcb %p (%s): " 1196 "Listen queue overflow: " 1197 "%i already in queue awaiting acceptance " 1198 "(%d occurrences), euid %d, rgid %d, jail %s\n", 1199 head->so_pcb, sbuf_data(&descrsb), qlen, 1200 overcount, head->so_cred->cr_uid, 1201 head->so_cred->cr_rgid, 1202 head->so_cred->cr_prison ? 1203 head->so_cred->cr_prison->pr_name : 1204 "not_jailed"); 1205 } 1206 sbuf_delete(&descrsb); 1207 1208 overcount = 0; 1209 } 1210 1211 return (NULL); 1212 } 1213 SOLISTEN_UNLOCK(head); 1214 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 1215 __func__, head)); 1216 so = soalloc(head->so_vnet); 1217 if (so == NULL) { 1218 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 1219 "limit reached or out of memory\n", 1220 __func__, head->so_pcb); 1221 return (NULL); 1222 } 1223 so->so_listen = head; 1224 so->so_type = head->so_type; 1225 /* 1226 * POSIX is ambiguous on what options an accept(2)ed socket should 1227 * inherit from the listener. Words "create a new socket" may be 1228 * interpreted as not inheriting anything. Best programming practice 1229 * for application developers is to not rely on such inheritance. 1230 * FreeBSD had historically inherited all so_options excluding 1231 * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options, 1232 * including those completely irrelevant to a new born socket. For 1233 * compatibility with older versions we will inherit a list of 1234 * meaningful options. 1235 * The crucial bit to inherit is SO_ACCEPTFILTER. We need it present 1236 * in the child socket for soisconnected() promoting socket from the 1237 * incomplete queue to complete. 
It will be cleared before the child 1238 * gets available to accept(2). 1239 */ 1240 so->so_options = head->so_options & (SO_ACCEPTFILTER | SO_KEEPALIVE | 1241 SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE); 1242 so->so_linger = head->so_linger; 1243 so->so_state = head->so_state; 1244 so->so_fibnum = head->so_fibnum; 1245 so->so_proto = head->so_proto; 1246 so->so_cred = crhold(head->so_cred); 1247 #ifdef SOCKET_HHOOK 1248 if (V_socket_hhh[HHOOK_SOCKET_NEWCONN]->hhh_nhooks > 0) { 1249 if (hhook_run_socket(so, head, HHOOK_SOCKET_NEWCONN)) { 1250 sodealloc(so); 1251 log(LOG_DEBUG, "%s: hhook run failed\n", __func__); 1252 return (NULL); 1253 } 1254 } 1255 #endif 1256 #ifdef MAC 1257 mac_socket_newconn(head, so); 1258 #endif 1259 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1260 so_rdknl_assert_lock); 1261 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1262 so_wrknl_assert_lock); 1263 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; 1264 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 1265 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 1266 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 1267 so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; 1268 so->so_snd.sb_flags = head->sol_sbsnd_flags & 1269 (SB_AUTOSIZE | SB_AUTOLOWAT); 1270 1271 return (so); 1272 } 1273 1274 /* Connstatus may be 0 or SS_ISCONNECTED. */ 1275 struct socket * 1276 sonewconn(struct socket *head, int connstatus) 1277 { 1278 struct socket *so; 1279 1280 if ((so = solisten_clone(head)) == NULL) 1281 return (NULL); 1282 1283 if (soattach(so, 0, NULL, head) != 0) { 1284 sodealloc(so); 1285 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", 1286 __func__, head->so_pcb); 1287 return (NULL); 1288 } 1289 1290 (void)solisten_enqueue(so, connstatus); 1291 1292 return (so); 1293 } 1294 1295 /* 1296 * Enqueue socket cloned by solisten_clone() to the listen queue of the 1297 * listener it has been cloned from. 
1298 * 1299 * Return 'true' if socket landed on complete queue, otherwise 'false'. 1300 */ 1301 bool 1302 solisten_enqueue(struct socket *so, int connstatus) 1303 { 1304 struct socket *head = so->so_listen; 1305 1306 MPASS(refcount_load(&so->so_count) == 0); 1307 refcount_init(&so->so_count, 1); 1308 1309 SOLISTEN_LOCK(head); 1310 if (head->sol_accept_filter != NULL) 1311 connstatus = 0; 1312 so->so_state |= connstatus; 1313 soref(head); /* A socket on (in)complete queue refs head. */ 1314 if (connstatus) { 1315 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 1316 so->so_qstate = SQ_COMP; 1317 head->sol_qlen++; 1318 solisten_wakeup(head); /* unlocks */ 1319 return (true); 1320 } else { 1321 /* 1322 * Keep removing sockets from the head until there's room for 1323 * us to insert on the tail. In pre-locking revisions, this 1324 * was a simple if(), but as we could be racing with other 1325 * threads and soabort() requires dropping locks, we must 1326 * loop waiting for the condition to be true. 1327 */ 1328 while (head->sol_incqlen > head->sol_qlimit) { 1329 struct socket *sp; 1330 1331 sp = TAILQ_FIRST(&head->sol_incomp); 1332 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 1333 head->sol_incqlen--; 1334 SOCK_LOCK(sp); 1335 sp->so_qstate = SQ_NONE; 1336 sp->so_listen = NULL; 1337 SOCK_UNLOCK(sp); 1338 sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ 1339 soabort(sp); 1340 SOLISTEN_LOCK(head); 1341 } 1342 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 1343 so->so_qstate = SQ_INCOMP; 1344 head->sol_incqlen++; 1345 SOLISTEN_UNLOCK(head); 1346 return (false); 1347 } 1348 } 1349 1350 #if defined(SCTP) || defined(SCTP_SUPPORT) 1351 /* 1352 * Socket part of sctp_peeloff(). Create a new socket for an 1353 * association. The new socket is returned with a reference. 1354 * 1355 * XXXGL: reduce copy-paste with solisten_clone(). 
1356 */ 1357 struct socket * 1358 sopeeloff(struct socket *head, struct protosw *so_proto) 1359 { 1360 struct socket *so; 1361 1362 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 1363 __func__, __LINE__, head)); 1364 KASSERT(head->so_type == SOCK_SEQPACKET, 1365 ("%s: unexpecte so_type: %d", __func__, head->so_type)); 1366 so = soalloc(head->so_vnet); 1367 if (so == NULL) { 1368 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 1369 "limit reached or out of memory\n", 1370 __func__, head->so_pcb); 1371 return (NULL); 1372 } 1373 so->so_type = SOCK_STREAM; 1374 so->so_options = head->so_options; 1375 so->so_linger = head->so_linger; 1376 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 1377 so->so_fibnum = head->so_fibnum; 1378 so->so_proto = so_proto; 1379 so->so_cred = crhold(head->so_cred); 1380 #ifdef MAC 1381 mac_socket_newconn(head, so); 1382 #endif 1383 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 1384 so_rdknl_assert_lock); 1385 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 1386 so_wrknl_assert_lock); 1387 if (soattach(so, 0, NULL, head)) { 1388 sodealloc(so); 1389 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", 1390 __func__, head->so_pcb); 1391 return (NULL); 1392 } 1393 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 1394 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 1395 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 1396 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 1397 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 1398 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 1399 1400 soref(so); 1401 1402 return (so); 1403 } 1404 #endif /* SCTP */ 1405 1406 int 1407 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 1408 { 1409 int error; 1410 1411 CURVNET_SET(so->so_vnet); 1412 error = so->so_proto->pr_bind(so, nam, td); 1413 CURVNET_RESTORE(); 1414 return (error); 1415 } 1416 1417 int 1418 sobindat(int fd, struct socket *so, 
struct sockaddr *nam, struct thread *td) 1419 { 1420 int error; 1421 1422 CURVNET_SET(so->so_vnet); 1423 error = so->so_proto->pr_bindat(fd, so, nam, td); 1424 CURVNET_RESTORE(); 1425 return (error); 1426 } 1427 1428 /* 1429 * solisten() transitions a socket from a non-listening state to a listening 1430 * state, but can also be used to update the listen queue depth on an 1431 * existing listen socket. The protocol will call back into the sockets 1432 * layer using solisten_proto_check() and solisten_proto() to check and set 1433 * socket-layer listen state. Call backs are used so that the protocol can 1434 * acquire both protocol and socket layer locks in whatever order is required 1435 * by the protocol. 1436 * 1437 * Protocol implementors are advised to hold the socket lock across the 1438 * socket-layer test and set to avoid races at the socket layer. 1439 */ 1440 int 1441 solisten(struct socket *so, int backlog, struct thread *td) 1442 { 1443 int error; 1444 1445 CURVNET_SET(so->so_vnet); 1446 error = so->so_proto->pr_listen(so, backlog, td); 1447 CURVNET_RESTORE(); 1448 return (error); 1449 } 1450 1451 /* 1452 * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in 1453 * order to interlock with socket I/O. 1454 */ 1455 int 1456 solisten_proto_check(struct socket *so) 1457 { 1458 SOCK_LOCK_ASSERT(so); 1459 1460 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1461 SS_ISDISCONNECTING)) != 0) 1462 return (EINVAL); 1463 1464 /* 1465 * Sleeping is not permitted here, so simply fail if userspace is 1466 * attempting to transmit or receive on the socket. This kind of 1467 * transient failure is not ideal, but it should occur only if userspace 1468 * is misusing the socket interfaces. 
1469 */ 1470 if (!sx_try_xlock(&so->so_snd_sx)) 1471 return (EAGAIN); 1472 if (!sx_try_xlock(&so->so_rcv_sx)) { 1473 sx_xunlock(&so->so_snd_sx); 1474 return (EAGAIN); 1475 } 1476 mtx_lock(&so->so_snd_mtx); 1477 mtx_lock(&so->so_rcv_mtx); 1478 1479 /* Interlock with soo_aio_queue() and KTLS. */ 1480 if (!SOLISTENING(so)) { 1481 bool ktls; 1482 1483 #ifdef KERN_TLS 1484 ktls = so->so_snd.sb_tls_info != NULL || 1485 so->so_rcv.sb_tls_info != NULL; 1486 #else 1487 ktls = false; 1488 #endif 1489 if (ktls || 1490 (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || 1491 (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { 1492 solisten_proto_abort(so); 1493 return (EINVAL); 1494 } 1495 } 1496 1497 return (0); 1498 } 1499 1500 /* 1501 * Undo the setup done by solisten_proto_check(). 1502 */ 1503 void 1504 solisten_proto_abort(struct socket *so) 1505 { 1506 mtx_unlock(&so->so_snd_mtx); 1507 mtx_unlock(&so->so_rcv_mtx); 1508 sx_xunlock(&so->so_snd_sx); 1509 sx_xunlock(&so->so_rcv_sx); 1510 } 1511 1512 void 1513 solisten_proto(struct socket *so, int backlog) 1514 { 1515 int sbrcv_lowat, sbsnd_lowat; 1516 u_int sbrcv_hiwat, sbsnd_hiwat; 1517 short sbrcv_flags, sbsnd_flags; 1518 sbintime_t sbrcv_timeo, sbsnd_timeo; 1519 1520 SOCK_LOCK_ASSERT(so); 1521 KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1522 SS_ISDISCONNECTING)) == 0, 1523 ("%s: bad socket state %p", __func__, so)); 1524 1525 if (SOLISTENING(so)) 1526 goto listening; 1527 1528 /* 1529 * Change this socket to listening state. 
1530 */ 1531 sbrcv_lowat = so->so_rcv.sb_lowat; 1532 sbsnd_lowat = so->so_snd.sb_lowat; 1533 sbrcv_hiwat = so->so_rcv.sb_hiwat; 1534 sbsnd_hiwat = so->so_snd.sb_hiwat; 1535 sbrcv_flags = so->so_rcv.sb_flags; 1536 sbsnd_flags = so->so_snd.sb_flags; 1537 sbrcv_timeo = so->so_rcv.sb_timeo; 1538 sbsnd_timeo = so->so_snd.sb_timeo; 1539 1540 #ifdef MAC 1541 mac_socketpeer_label_free(so->so_peerlabel); 1542 #endif 1543 1544 if (!(so->so_proto->pr_flags & PR_SOCKBUF)) { 1545 sbdestroy(so, SO_SND); 1546 sbdestroy(so, SO_RCV); 1547 } 1548 1549 #ifdef INVARIANTS 1550 bzero(&so->so_rcv, 1551 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 1552 #endif 1553 1554 so->sol_sbrcv_lowat = sbrcv_lowat; 1555 so->sol_sbsnd_lowat = sbsnd_lowat; 1556 so->sol_sbrcv_hiwat = sbrcv_hiwat; 1557 so->sol_sbsnd_hiwat = sbsnd_hiwat; 1558 so->sol_sbrcv_flags = sbrcv_flags; 1559 so->sol_sbsnd_flags = sbsnd_flags; 1560 so->sol_sbrcv_timeo = sbrcv_timeo; 1561 so->sol_sbsnd_timeo = sbsnd_timeo; 1562 1563 so->sol_qlen = so->sol_incqlen = 0; 1564 TAILQ_INIT(&so->sol_incomp); 1565 TAILQ_INIT(&so->sol_comp); 1566 1567 so->sol_accept_filter = NULL; 1568 so->sol_accept_filter_arg = NULL; 1569 so->sol_accept_filter_str = NULL; 1570 1571 so->sol_upcall = NULL; 1572 so->sol_upcallarg = NULL; 1573 1574 so->so_options |= SO_ACCEPTCONN; 1575 1576 listening: 1577 if (backlog < 0 || backlog > V_somaxconn) 1578 backlog = V_somaxconn; 1579 so->sol_qlimit = backlog; 1580 1581 mtx_unlock(&so->so_snd_mtx); 1582 mtx_unlock(&so->so_rcv_mtx); 1583 sx_xunlock(&so->so_snd_sx); 1584 sx_xunlock(&so->so_rcv_sx); 1585 } 1586 1587 /* 1588 * Wakeup listeners/subsystems once we have a complete connection. 1589 * Enters with lock, returns unlocked. 
1590 */ 1591 void 1592 solisten_wakeup(struct socket *sol) 1593 { 1594 1595 if (sol->sol_upcall != NULL) 1596 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 1597 else { 1598 selwakeuppri(&sol->so_rdsel, PSOCK); 1599 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 1600 } 1601 SOLISTEN_UNLOCK(sol); 1602 wakeup_one(&sol->sol_comp); 1603 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) 1604 pgsigio(&sol->so_sigio, SIGIO, 0); 1605 } 1606 1607 /* 1608 * Return single connection off a listening socket queue. Main consumer of 1609 * the function is kern_accept4(). Some modules, that do their own accept 1610 * management also use the function. The socket reference held by the 1611 * listen queue is handed to the caller. 1612 * 1613 * Listening socket must be locked on entry and is returned unlocked on 1614 * return. 1615 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 1616 */ 1617 int 1618 solisten_dequeue(struct socket *head, struct socket **ret, int flags) 1619 { 1620 struct socket *so; 1621 int error; 1622 1623 SOLISTEN_LOCK_ASSERT(head); 1624 1625 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && 1626 head->so_error == 0) { 1627 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, 1628 "accept", 0); 1629 if (error != 0) { 1630 SOLISTEN_UNLOCK(head); 1631 return (error); 1632 } 1633 } 1634 if (head->so_error) { 1635 error = head->so_error; 1636 head->so_error = 0; 1637 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) 1638 error = EWOULDBLOCK; 1639 else 1640 error = 0; 1641 if (error) { 1642 SOLISTEN_UNLOCK(head); 1643 return (error); 1644 } 1645 so = TAILQ_FIRST(&head->sol_comp); 1646 SOCK_LOCK(so); 1647 KASSERT(so->so_qstate == SQ_COMP, 1648 ("%s: so %p not SQ_COMP", __func__, so)); 1649 head->sol_qlen--; 1650 so->so_qstate = SQ_NONE; 1651 so->so_listen = NULL; 1652 TAILQ_REMOVE(&head->sol_comp, so, so_list); 1653 if (flags & ACCEPT4_INHERIT) 1654 so->so_state |= (head->so_state & 
SS_NBIO); 1655 else 1656 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 1657 SOCK_UNLOCK(so); 1658 sorele_locked(head); 1659 1660 *ret = so; 1661 return (0); 1662 } 1663 1664 static struct so_splice * 1665 so_splice_alloc(off_t max) 1666 { 1667 struct so_splice *sp; 1668 1669 sp = uma_zalloc(splice_zone, M_WAITOK); 1670 sp->src = NULL; 1671 sp->dst = NULL; 1672 sp->max = max > 0 ? max : -1; 1673 sp->wq_index = atomic_fetchadd_32(&splice_index, 1) % splice_num_wq; 1674 sp->state = SPLICE_INIT; 1675 TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout, 1676 sp); 1677 return (sp); 1678 } 1679 1680 static void 1681 so_splice_free(struct so_splice *sp) 1682 { 1683 KASSERT(sp->state == SPLICE_CLOSED, 1684 ("so_splice_free: sp %p not closed", sp)); 1685 uma_zfree(splice_zone, sp); 1686 } 1687 1688 static void 1689 so_splice_timeout(void *arg, int pending __unused) 1690 { 1691 struct so_splice *sp; 1692 1693 sp = arg; 1694 (void)so_unsplice(sp->src, true); 1695 } 1696 1697 /* 1698 * Splice the output from so to the input of so2. 1699 */ 1700 static int 1701 so_splice(struct socket *so, struct socket *so2, struct splice *splice) 1702 { 1703 struct so_splice *sp; 1704 int error; 1705 1706 if (splice->sp_max < 0) 1707 return (EINVAL); 1708 /* Handle only TCP for now; TODO: other streaming protos */ 1709 if (so->so_proto->pr_protocol != IPPROTO_TCP || 1710 so2->so_proto->pr_protocol != IPPROTO_TCP) 1711 return (EPROTONOSUPPORT); 1712 if (so->so_vnet != so2->so_vnet) 1713 return (EINVAL); 1714 1715 /* so_splice_xfer() assumes that we're using these implementations. 
*/ 1716 KASSERT(so->so_proto->pr_sosend == sosend_generic, 1717 ("so_splice: sosend not sosend_generic")); 1718 KASSERT(so2->so_proto->pr_soreceive == soreceive_generic || 1719 so2->so_proto->pr_soreceive == soreceive_stream, 1720 ("so_splice: soreceive not soreceive_generic/stream")); 1721 1722 sp = so_splice_alloc(splice->sp_max); 1723 so->so_splice_sent = 0; 1724 sp->src = so; 1725 sp->dst = so2; 1726 1727 error = 0; 1728 SOCK_LOCK(so); 1729 if (SOLISTENING(so)) 1730 error = EINVAL; 1731 else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) 1732 error = ENOTCONN; 1733 else if (so->so_splice != NULL) 1734 error = EBUSY; 1735 if (error != 0) { 1736 SOCK_UNLOCK(so); 1737 uma_zfree(splice_zone, sp); 1738 return (error); 1739 } 1740 SOCK_RECVBUF_LOCK(so); 1741 if (so->so_rcv.sb_tls_info != NULL) { 1742 SOCK_RECVBUF_UNLOCK(so); 1743 SOCK_UNLOCK(so); 1744 uma_zfree(splice_zone, sp); 1745 return (EINVAL); 1746 } 1747 so->so_rcv.sb_flags |= SB_SPLICED; 1748 so->so_splice = sp; 1749 soref(so); 1750 SOCK_RECVBUF_UNLOCK(so); 1751 SOCK_UNLOCK(so); 1752 1753 error = 0; 1754 SOCK_LOCK(so2); 1755 if (SOLISTENING(so2)) 1756 error = EINVAL; 1757 else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) 1758 error = ENOTCONN; 1759 else if (so2->so_splice_back != NULL) 1760 error = EBUSY; 1761 if (error != 0) { 1762 SOCK_UNLOCK(so2); 1763 mtx_lock(&sp->mtx); 1764 sp->dst = NULL; 1765 sp->state = SPLICE_EXCEPTION; 1766 mtx_unlock(&sp->mtx); 1767 so_unsplice(so, false); 1768 return (error); 1769 } 1770 SOCK_SENDBUF_LOCK(so2); 1771 if (so->so_snd.sb_tls_info != NULL) { 1772 SOCK_SENDBUF_UNLOCK(so2); 1773 SOCK_UNLOCK(so2); 1774 mtx_lock(&sp->mtx); 1775 sp->dst = NULL; 1776 sp->state = SPLICE_EXCEPTION; 1777 mtx_unlock(&sp->mtx); 1778 so_unsplice(so, false); 1779 return (EINVAL); 1780 } 1781 so2->so_snd.sb_flags |= SB_SPLICED; 1782 so2->so_splice_back = sp; 1783 soref(so2); 1784 mtx_lock(&sp->mtx); 1785 SOCK_SENDBUF_UNLOCK(so2); 1786 SOCK_UNLOCK(so2); 1787 1788 if 
(splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) { 1789 taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout, 1790 tvtosbt(splice->sp_idle), 0, C_PREL(4)); 1791 } 1792 1793 /* 1794 * Transfer any data already present in the socket buffer. 1795 */ 1796 KASSERT(sp->state == SPLICE_INIT, 1797 ("so_splice: splice %p state %d", sp, sp->state)); 1798 sp->state = SPLICE_QUEUED; 1799 so_splice_xfer(sp); 1800 return (0); 1801 } 1802 1803 static int 1804 so_unsplice(struct socket *so, bool timeout) 1805 { 1806 struct socket *so2; 1807 struct so_splice *sp; 1808 bool drain, so2rele; 1809 1810 /* 1811 * First unset SB_SPLICED and hide the splice structure so that 1812 * wakeup routines will stop enqueuing work. This also ensures that 1813 * a only a single thread will proceed with the unsplice. 1814 */ 1815 SOCK_LOCK(so); 1816 if (SOLISTENING(so)) { 1817 SOCK_UNLOCK(so); 1818 return (EINVAL); 1819 } 1820 SOCK_RECVBUF_LOCK(so); 1821 if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) { 1822 SOCK_RECVBUF_UNLOCK(so); 1823 SOCK_UNLOCK(so); 1824 return (ENOTCONN); 1825 } 1826 sp = so->so_splice; 1827 mtx_lock(&sp->mtx); 1828 if (sp->state == SPLICE_INIT) { 1829 /* 1830 * A splice is in the middle of being set up. 
1831 */ 1832 mtx_unlock(&sp->mtx); 1833 SOCK_RECVBUF_UNLOCK(so); 1834 SOCK_UNLOCK(so); 1835 return (ENOTCONN); 1836 } 1837 mtx_unlock(&sp->mtx); 1838 so->so_rcv.sb_flags &= ~SB_SPLICED; 1839 so->so_splice = NULL; 1840 SOCK_RECVBUF_UNLOCK(so); 1841 SOCK_UNLOCK(so); 1842 1843 so2 = sp->dst; 1844 if (so2 != NULL) { 1845 SOCK_LOCK(so2); 1846 KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__)); 1847 SOCK_SENDBUF_LOCK(so2); 1848 KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0, 1849 ("%s: so2 is not spliced", __func__)); 1850 KASSERT(so2->so_splice_back == sp, 1851 ("%s: so_splice_back != sp", __func__)); 1852 so2->so_snd.sb_flags &= ~SB_SPLICED; 1853 so2rele = so2->so_splice_back != NULL; 1854 so2->so_splice_back = NULL; 1855 SOCK_SENDBUF_UNLOCK(so2); 1856 SOCK_UNLOCK(so2); 1857 } 1858 1859 /* 1860 * No new work is being enqueued. The worker thread might be 1861 * splicing data right now, in which case we want to wait for it to 1862 * finish before proceeding. 1863 */ 1864 mtx_lock(&sp->mtx); 1865 switch (sp->state) { 1866 case SPLICE_QUEUED: 1867 case SPLICE_RUNNING: 1868 sp->state = SPLICE_CLOSING; 1869 while (sp->state == SPLICE_CLOSING) 1870 msleep(sp, &sp->mtx, PSOCK, "unsplice", 0); 1871 break; 1872 case SPLICE_INIT: 1873 case SPLICE_IDLE: 1874 case SPLICE_EXCEPTION: 1875 sp->state = SPLICE_CLOSED; 1876 break; 1877 default: 1878 __assert_unreachable(); 1879 } 1880 if (!timeout) { 1881 drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout, 1882 NULL) != 0; 1883 } else { 1884 drain = false; 1885 } 1886 mtx_unlock(&sp->mtx); 1887 if (drain) 1888 taskqueue_drain_timeout(taskqueue_thread, &sp->timeout); 1889 1890 /* 1891 * Now we hold the sole reference to the splice structure. 1892 * Clean up: signal userspace and release socket references. 
1893 */ 1894 sorwakeup(so); 1895 CURVNET_SET(so->so_vnet); 1896 sorele(so); 1897 if (so2 != NULL) { 1898 sowwakeup(so2); 1899 if (so2rele) 1900 sorele(so2); 1901 } 1902 CURVNET_RESTORE(); 1903 so_splice_free(sp); 1904 return (0); 1905 } 1906 1907 /* 1908 * Free socket upon release of the very last reference. 1909 */ 1910 static void 1911 sofree(struct socket *so) 1912 { 1913 struct protosw *pr = so->so_proto; 1914 1915 SOCK_LOCK_ASSERT(so); 1916 KASSERT(refcount_load(&so->so_count) == 0, 1917 ("%s: so %p has references", __func__, so)); 1918 KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, 1919 ("%s: so %p is on listen queue", __func__, so)); 1920 KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0, 1921 ("%s: so %p rcvbuf is spliced", __func__, so)); 1922 KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0, 1923 ("%s: so %p sndbuf is spliced", __func__, so)); 1924 KASSERT(so->so_splice == NULL && so->so_splice_back == NULL, 1925 ("%s: so %p has spliced data", __func__, so)); 1926 1927 SOCK_UNLOCK(so); 1928 1929 if (so->so_dtor != NULL) 1930 so->so_dtor(so); 1931 1932 VNET_SO_ASSERT(so); 1933 if (pr->pr_detach != NULL) 1934 pr->pr_detach(so); 1935 1936 if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { 1937 /* 1938 * From this point on, we assume that no other references to 1939 * this socket exist anywhere else in the stack. Therefore, 1940 * no locks need to be acquired or held. 
1941 */ 1942 #ifdef INVARIANTS 1943 SOCK_SENDBUF_LOCK(so); 1944 SOCK_RECVBUF_LOCK(so); 1945 #endif 1946 sbdestroy(so, SO_SND); 1947 sbdestroy(so, SO_RCV); 1948 #ifdef INVARIANTS 1949 SOCK_SENDBUF_UNLOCK(so); 1950 SOCK_RECVBUF_UNLOCK(so); 1951 #endif 1952 mtx_destroy(&so->so_snd_mtx); 1953 mtx_destroy(&so->so_rcv_mtx); 1954 } 1955 seldrain(&so->so_rdsel); 1956 seldrain(&so->so_wrsel); 1957 knlist_destroy(&so->so_rdsel.si_note); 1958 knlist_destroy(&so->so_wrsel.si_note); 1959 sodealloc(so); 1960 } 1961 1962 /* 1963 * Release a reference on a socket while holding the socket lock. 1964 * Unlocks the socket lock before returning. 1965 */ 1966 void 1967 sorele_locked(struct socket *so) 1968 { 1969 SOCK_LOCK_ASSERT(so); 1970 if (refcount_release(&so->so_count)) 1971 sofree(so); 1972 else 1973 SOCK_UNLOCK(so); 1974 } 1975 1976 /* 1977 * Close a socket on last file table reference removal. Initiate disconnect 1978 * if connected. Free socket when disconnect complete. 1979 * 1980 * This function will sorele() the socket. Note that soclose() may be called 1981 * prior to the ref count reaching zero. The actual socket structure will 1982 * not be freed until the ref count reaches zero. 
1983 */ 1984 int 1985 soclose(struct socket *so) 1986 { 1987 struct accept_queue lqueue; 1988 int error = 0; 1989 bool listening, last __diagused; 1990 1991 CURVNET_SET(so->so_vnet); 1992 funsetown(&so->so_sigio); 1993 if (so->so_state & SS_ISCONNECTED) { 1994 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1995 error = sodisconnect(so); 1996 if (error) { 1997 if (error == ENOTCONN) 1998 error = 0; 1999 goto drop; 2000 } 2001 } 2002 2003 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { 2004 if ((so->so_state & SS_ISDISCONNECTING) && 2005 (so->so_state & SS_NBIO)) 2006 goto drop; 2007 while (so->so_state & SS_ISCONNECTED) { 2008 error = tsleep(&so->so_timeo, 2009 PSOCK | PCATCH, "soclos", 2010 so->so_linger * hz); 2011 if (error) 2012 break; 2013 } 2014 } 2015 } 2016 2017 drop: 2018 if (so->so_proto->pr_close != NULL) 2019 so->so_proto->pr_close(so); 2020 2021 SOCK_LOCK(so); 2022 if ((listening = SOLISTENING(so))) { 2023 struct socket *sp; 2024 2025 TAILQ_INIT(&lqueue); 2026 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 2027 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 2028 2029 so->sol_qlen = so->sol_incqlen = 0; 2030 2031 TAILQ_FOREACH(sp, &lqueue, so_list) { 2032 SOCK_LOCK(sp); 2033 sp->so_qstate = SQ_NONE; 2034 sp->so_listen = NULL; 2035 SOCK_UNLOCK(sp); 2036 last = refcount_release(&so->so_count); 2037 KASSERT(!last, ("%s: released last reference for %p", 2038 __func__, so)); 2039 } 2040 } 2041 sorele_locked(so); 2042 if (listening) { 2043 struct socket *sp, *tsp; 2044 2045 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) 2046 soabort(sp); 2047 } 2048 CURVNET_RESTORE(); 2049 return (error); 2050 } 2051 2052 /* 2053 * soabort() is used to abruptly tear down a connection, such as when a 2054 * resource limit is reached (listen queue depth exceeded), or if a listen 2055 * socket is closed while there are sockets waiting to be accepted. 
2056 * 2057 * This interface is tricky, because it is called on an unreferenced socket, 2058 * and must be called only by a thread that has actually removed the socket 2059 * from the listen queue it was on. Likely this thread holds the last 2060 * reference on the socket and soabort() will proceed with sofree(). But 2061 * it might be not the last, as the sockets on the listen queues are seen 2062 * from the protocol side. 2063 * 2064 * This interface will call into the protocol code, so must not be called 2065 * with any socket locks held. Protocols do call it while holding their own 2066 * recursible protocol mutexes, but this is something that should be subject 2067 * to review in the future. 2068 * 2069 * Usually socket should have a single reference left, but this is not a 2070 * requirement. In the past, when we have had named references for file 2071 * descriptor and protocol, we asserted that none of them are being held. 2072 */ 2073 void 2074 soabort(struct socket *so) 2075 { 2076 2077 VNET_SO_ASSERT(so); 2078 2079 if (so->so_proto->pr_abort != NULL) 2080 so->so_proto->pr_abort(so); 2081 SOCK_LOCK(so); 2082 sorele_locked(so); 2083 } 2084 2085 int 2086 soaccept(struct socket *so, struct sockaddr *sa) 2087 { 2088 #ifdef INVARIANTS 2089 u_char len = sa->sa_len; 2090 #endif 2091 int error; 2092 2093 CURVNET_SET(so->so_vnet); 2094 error = so->so_proto->pr_accept(so, sa); 2095 KASSERT(sa->sa_len <= len, 2096 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2097 CURVNET_RESTORE(); 2098 return (error); 2099 } 2100 2101 int 2102 sopeeraddr(struct socket *so, struct sockaddr *sa) 2103 { 2104 #ifdef INVARIANTS 2105 u_char len = sa->sa_len; 2106 #endif 2107 int error; 2108 2109 CURVNET_ASSERT_SET(); 2110 2111 error = so->so_proto->pr_peeraddr(so, sa); 2112 KASSERT(sa->sa_len <= len, 2113 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2114 2115 return (error); 2116 } 2117 2118 int 2119 sosockaddr(struct socket *so, struct sockaddr *sa) 
2120 { 2121 #ifdef INVARIANTS 2122 u_char len = sa->sa_len; 2123 #endif 2124 int error; 2125 2126 CURVNET_SET(so->so_vnet); 2127 error = so->so_proto->pr_sockaddr(so, sa); 2128 KASSERT(sa->sa_len <= len, 2129 ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); 2130 CURVNET_RESTORE(); 2131 2132 return (error); 2133 } 2134 2135 int 2136 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 2137 { 2138 2139 return (soconnectat(AT_FDCWD, so, nam, td)); 2140 } 2141 2142 int 2143 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 2144 { 2145 int error; 2146 2147 CURVNET_SET(so->so_vnet); 2148 2149 /* 2150 * If protocol is connection-based, can only connect once. 2151 * Otherwise, if connected, try to disconnect first. This allows 2152 * user to disconnect by connecting to, e.g., a null address. 2153 * 2154 * Note, this check is racy and may need to be re-evaluated at the 2155 * protocol layer. 2156 */ 2157 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 2158 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 2159 (error = sodisconnect(so)))) { 2160 error = EISCONN; 2161 } else { 2162 /* 2163 * Prevent accumulated error from previous connection from 2164 * biting us. 
2165 */ 2166 so->so_error = 0; 2167 if (fd == AT_FDCWD) { 2168 error = so->so_proto->pr_connect(so, nam, td); 2169 } else { 2170 error = so->so_proto->pr_connectat(fd, so, nam, td); 2171 } 2172 } 2173 CURVNET_RESTORE(); 2174 2175 return (error); 2176 } 2177 2178 int 2179 soconnect2(struct socket *so1, struct socket *so2) 2180 { 2181 int error; 2182 2183 CURVNET_SET(so1->so_vnet); 2184 error = so1->so_proto->pr_connect2(so1, so2); 2185 CURVNET_RESTORE(); 2186 return (error); 2187 } 2188 2189 int 2190 sodisconnect(struct socket *so) 2191 { 2192 int error; 2193 2194 if ((so->so_state & SS_ISCONNECTED) == 0) 2195 return (ENOTCONN); 2196 if (so->so_state & SS_ISDISCONNECTING) 2197 return (EALREADY); 2198 VNET_SO_ASSERT(so); 2199 error = so->so_proto->pr_disconnect(so); 2200 return (error); 2201 } 2202 2203 int 2204 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 2205 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2206 { 2207 long space; 2208 ssize_t resid; 2209 int clen = 0, error, dontroute; 2210 2211 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 2212 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 2213 ("sosend_dgram: !PR_ATOMIC")); 2214 2215 if (uio != NULL) 2216 resid = uio->uio_resid; 2217 else 2218 resid = top->m_pkthdr.len; 2219 /* 2220 * In theory resid should be unsigned. However, space must be 2221 * signed, as it might be less than 0 if we over-committed, and we 2222 * must use a signed comparison of space and resid. On the other 2223 * hand, a negative resid causes us to loop sending 0-length 2224 * segments to the protocol. 
2225 */ 2226 if (resid < 0) { 2227 error = EINVAL; 2228 goto out; 2229 } 2230 2231 dontroute = 2232 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 2233 if (td != NULL) 2234 td->td_ru.ru_msgsnd++; 2235 if (control != NULL) 2236 clen = control->m_len; 2237 2238 SOCKBUF_LOCK(&so->so_snd); 2239 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2240 SOCKBUF_UNLOCK(&so->so_snd); 2241 error = EPIPE; 2242 goto out; 2243 } 2244 if (so->so_error) { 2245 error = so->so_error; 2246 so->so_error = 0; 2247 SOCKBUF_UNLOCK(&so->so_snd); 2248 goto out; 2249 } 2250 if ((so->so_state & SS_ISCONNECTED) == 0) { 2251 /* 2252 * `sendto' and `sendmsg' is allowed on a connection-based 2253 * socket if it supports implied connect. Return ENOTCONN if 2254 * not connected and no address is supplied. 2255 */ 2256 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2257 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2258 if (!(resid == 0 && clen != 0)) { 2259 SOCKBUF_UNLOCK(&so->so_snd); 2260 error = ENOTCONN; 2261 goto out; 2262 } 2263 } else if (addr == NULL) { 2264 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2265 error = ENOTCONN; 2266 else 2267 error = EDESTADDRREQ; 2268 SOCKBUF_UNLOCK(&so->so_snd); 2269 goto out; 2270 } 2271 } 2272 2273 /* 2274 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 2275 * problem and need fixing. 2276 */ 2277 space = sbspace(&so->so_snd); 2278 if (flags & MSG_OOB) 2279 space += 1024; 2280 space -= clen; 2281 SOCKBUF_UNLOCK(&so->so_snd); 2282 if (resid > space) { 2283 error = EMSGSIZE; 2284 goto out; 2285 } 2286 if (uio == NULL) { 2287 resid = 0; 2288 if (flags & MSG_EOR) 2289 top->m_flags |= M_EOR; 2290 } else { 2291 /* 2292 * Copy the data from userland into a mbuf chain. 2293 * If no data is to be copied in, a single empty mbuf 2294 * is returned. 2295 */ 2296 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 2297 (M_PKTHDR | ((flags & MSG_EOR) ? 
M_EOR : 0))); 2298 if (top == NULL) { 2299 error = EFAULT; /* only possible error */ 2300 goto out; 2301 } 2302 space -= resid - uio->uio_resid; 2303 resid = uio->uio_resid; 2304 } 2305 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 2306 /* 2307 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 2308 * than with. 2309 */ 2310 if (dontroute) { 2311 SOCK_LOCK(so); 2312 so->so_options |= SO_DONTROUTE; 2313 SOCK_UNLOCK(so); 2314 } 2315 /* 2316 * XXX all the SBS_CANTSENDMORE checks previously done could be out 2317 * of date. We could have received a reset packet in an interrupt or 2318 * maybe we slept while doing page faults in uiomove() etc. We could 2319 * probably recheck again inside the locking protection here, but 2320 * there are probably other places that this also happens. We must 2321 * rethink this. 2322 */ 2323 VNET_SO_ASSERT(so); 2324 error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : 2325 /* 2326 * If the user set MSG_EOF, the protocol understands this flag and 2327 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 2328 */ 2329 ((flags & MSG_EOF) && 2330 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2331 (resid <= 0)) ? 2332 PRUS_EOF : 2333 /* If there is more to send set PRUS_MORETOCOME */ 2334 (flags & MSG_MORETOCOME) || 2335 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 2336 top, addr, control, td); 2337 if (dontroute) { 2338 SOCK_LOCK(so); 2339 so->so_options &= ~SO_DONTROUTE; 2340 SOCK_UNLOCK(so); 2341 } 2342 clen = 0; 2343 control = NULL; 2344 top = NULL; 2345 out: 2346 if (top != NULL) 2347 m_freem(top); 2348 if (control != NULL) 2349 m_freem(control); 2350 return (error); 2351 } 2352 2353 /* 2354 * Send on a socket. If send must go all at once and message is larger than 2355 * send buffering, then hard error. Lock against other senders. If must go 2356 * all at once and not enough room now, then inform user that this would 2357 * block and do nothing. 
Otherwise, if nonblocking, send as much as 2358 * possible. The data to be sent is described by "uio" if nonzero, otherwise 2359 * by the mbuf chain "top" (which must be null if uio is not). Data provided 2360 * in mbuf chain must be small enough to send all at once. 2361 * 2362 * Returns nonzero on error, timeout or signal; callers must check for short 2363 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 2364 * on return. 2365 */ 2366 static int 2367 sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, 2368 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2369 { 2370 long space; 2371 ssize_t resid; 2372 int clen = 0, error, dontroute; 2373 int atomic = sosendallatonce(so) || top; 2374 int pr_send_flag; 2375 #ifdef KERN_TLS 2376 struct ktls_session *tls; 2377 int tls_enq_cnt, tls_send_flag; 2378 uint8_t tls_rtype; 2379 2380 tls = NULL; 2381 tls_rtype = TLS_RLTYPE_APP; 2382 #endif 2383 2384 SOCK_IO_SEND_ASSERT_LOCKED(so); 2385 2386 if (uio != NULL) 2387 resid = uio->uio_resid; 2388 else if ((top->m_flags & M_PKTHDR) != 0) 2389 resid = top->m_pkthdr.len; 2390 else 2391 resid = m_length(top, NULL); 2392 /* 2393 * In theory resid should be unsigned. However, space must be 2394 * signed, as it might be less than 0 if we over-committed, and we 2395 * must use a signed comparison of space and resid. On the other 2396 * hand, a negative resid causes us to loop sending 0-length 2397 * segments to the protocol. 2398 * 2399 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 2400 * type sockets since that's an error. 
2401 */ 2402 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 2403 error = EINVAL; 2404 goto out; 2405 } 2406 2407 dontroute = 2408 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 2409 (so->so_proto->pr_flags & PR_ATOMIC); 2410 if (td != NULL) 2411 td->td_ru.ru_msgsnd++; 2412 if (control != NULL) 2413 clen = control->m_len; 2414 2415 #ifdef KERN_TLS 2416 tls_send_flag = 0; 2417 tls = ktls_hold(so->so_snd.sb_tls_info); 2418 if (tls != NULL) { 2419 if (tls->mode == TCP_TLS_MODE_SW) 2420 tls_send_flag = PRUS_NOTREADY; 2421 2422 if (control != NULL) { 2423 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 2424 2425 if (clen >= sizeof(*cm) && 2426 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 2427 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 2428 clen = 0; 2429 m_freem(control); 2430 control = NULL; 2431 atomic = 1; 2432 } 2433 } 2434 2435 if (resid == 0 && !ktls_permit_empty_frames(tls)) { 2436 error = EINVAL; 2437 goto out; 2438 } 2439 } 2440 #endif 2441 2442 restart: 2443 do { 2444 SOCKBUF_LOCK(&so->so_snd); 2445 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2446 SOCKBUF_UNLOCK(&so->so_snd); 2447 error = EPIPE; 2448 goto out; 2449 } 2450 if (so->so_error) { 2451 error = so->so_error; 2452 so->so_error = 0; 2453 SOCKBUF_UNLOCK(&so->so_snd); 2454 goto out; 2455 } 2456 if ((so->so_state & SS_ISCONNECTED) == 0) { 2457 /* 2458 * `sendto' and `sendmsg' is allowed on a connection- 2459 * based socket if it supports implied connect. 2460 * Return ENOTCONN if not connected and no address is 2461 * supplied. 
2462 */ 2463 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 2464 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 2465 if (!(resid == 0 && clen != 0)) { 2466 SOCKBUF_UNLOCK(&so->so_snd); 2467 error = ENOTCONN; 2468 goto out; 2469 } 2470 } else if (addr == NULL) { 2471 SOCKBUF_UNLOCK(&so->so_snd); 2472 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 2473 error = ENOTCONN; 2474 else 2475 error = EDESTADDRREQ; 2476 goto out; 2477 } 2478 } 2479 space = sbspace(&so->so_snd); 2480 if (flags & MSG_OOB) 2481 space += 1024; 2482 if ((atomic && resid > so->so_snd.sb_hiwat) || 2483 clen > so->so_snd.sb_hiwat) { 2484 SOCKBUF_UNLOCK(&so->so_snd); 2485 error = EMSGSIZE; 2486 goto out; 2487 } 2488 if (space < resid + clen && 2489 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 2490 if ((so->so_state & SS_NBIO) || 2491 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 2492 SOCKBUF_UNLOCK(&so->so_snd); 2493 error = EWOULDBLOCK; 2494 goto out; 2495 } 2496 error = sbwait(so, SO_SND); 2497 SOCKBUF_UNLOCK(&so->so_snd); 2498 if (error) 2499 goto out; 2500 goto restart; 2501 } 2502 SOCKBUF_UNLOCK(&so->so_snd); 2503 space -= clen; 2504 do { 2505 if (uio == NULL) { 2506 resid = 0; 2507 if (flags & MSG_EOR) 2508 top->m_flags |= M_EOR; 2509 #ifdef KERN_TLS 2510 if (tls != NULL) { 2511 ktls_frame(top, tls, &tls_enq_cnt, 2512 tls_rtype); 2513 tls_rtype = TLS_RLTYPE_APP; 2514 } 2515 #endif 2516 } else { 2517 /* 2518 * Copy the data from userland into a mbuf 2519 * chain. If resid is 0, which can happen 2520 * only if we have control to send, then 2521 * a single empty mbuf is returned. This 2522 * is a workaround to prevent protocol send 2523 * methods to panic. 2524 */ 2525 #ifdef KERN_TLS 2526 if (tls != NULL) { 2527 top = m_uiotombuf(uio, M_WAITOK, space, 2528 tls->params.max_frame_len, 2529 M_EXTPG | 2530 ((flags & MSG_EOR) ? 
M_EOR : 0)); 2531 if (top != NULL) { 2532 ktls_frame(top, tls, 2533 &tls_enq_cnt, tls_rtype); 2534 } 2535 tls_rtype = TLS_RLTYPE_APP; 2536 } else 2537 #endif 2538 top = m_uiotombuf(uio, M_WAITOK, space, 2539 (atomic ? max_hdr : 0), 2540 (atomic ? M_PKTHDR : 0) | 2541 ((flags & MSG_EOR) ? M_EOR : 0)); 2542 if (top == NULL) { 2543 error = EFAULT; /* only possible error */ 2544 goto out; 2545 } 2546 space -= resid - uio->uio_resid; 2547 resid = uio->uio_resid; 2548 } 2549 if (dontroute) { 2550 SOCK_LOCK(so); 2551 so->so_options |= SO_DONTROUTE; 2552 SOCK_UNLOCK(so); 2553 } 2554 /* 2555 * XXX all the SBS_CANTSENDMORE checks previously 2556 * done could be out of date. We could have received 2557 * a reset packet in an interrupt or maybe we slept 2558 * while doing page faults in uiomove() etc. We 2559 * could probably recheck again inside the locking 2560 * protection here, but there are probably other 2561 * places that this also happens. We must rethink 2562 * this. 2563 */ 2564 VNET_SO_ASSERT(so); 2565 2566 pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : 2567 /* 2568 * If the user set MSG_EOF, the protocol understands 2569 * this flag and nothing left to send then use 2570 * PRU_SEND_EOF instead of PRU_SEND. 2571 */ 2572 ((flags & MSG_EOF) && 2573 (so->so_proto->pr_flags & PR_IMPLOPCL) && 2574 (resid <= 0)) ? 2575 PRUS_EOF : 2576 /* If there is more to send set PRUS_MORETOCOME. */ 2577 (flags & MSG_MORETOCOME) || 2578 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0; 2579 2580 #ifdef KERN_TLS 2581 pr_send_flag |= tls_send_flag; 2582 #endif 2583 2584 error = so->so_proto->pr_send(so, pr_send_flag, top, 2585 addr, control, td); 2586 2587 if (dontroute) { 2588 SOCK_LOCK(so); 2589 so->so_options &= ~SO_DONTROUTE; 2590 SOCK_UNLOCK(so); 2591 } 2592 2593 #ifdef KERN_TLS 2594 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 2595 if (error != 0) { 2596 m_freem(top); 2597 top = NULL; 2598 } else { 2599 soref(so); 2600 ktls_enqueue(top, so, tls_enq_cnt); 2601 } 2602 } 2603 #endif 2604 clen = 0; 2605 control = NULL; 2606 top = NULL; 2607 if (error) 2608 goto out; 2609 } while (resid && space > 0); 2610 } while (resid); 2611 2612 out: 2613 #ifdef KERN_TLS 2614 if (tls != NULL) 2615 ktls_free(tls); 2616 #endif 2617 if (top != NULL) 2618 m_freem(top); 2619 if (control != NULL) 2620 m_freem(control); 2621 return (error); 2622 } 2623 2624 int 2625 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 2626 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2627 { 2628 int error; 2629 2630 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 2631 if (error) 2632 return (error); 2633 error = sosend_generic_locked(so, addr, uio, top, control, flags, td); 2634 SOCK_IO_SEND_UNLOCK(so); 2635 return (error); 2636 } 2637 2638 /* 2639 * Send to a socket from a kernel thread. 2640 * 2641 * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. 2642 * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs 2643 * to be set at all. This function should just boil down to a static inline 2644 * calling the protocol method. 
2645 */ 2646 int 2647 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2648 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 2649 { 2650 int error; 2651 2652 CURVNET_SET(so->so_vnet); 2653 error = so->so_proto->pr_sosend(so, addr, uio, 2654 top, control, flags, td); 2655 CURVNET_RESTORE(); 2656 return (error); 2657 } 2658 2659 /* 2660 * send(2), write(2) or aio_write(2) on a socket. 2661 */ 2662 int 2663 sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, 2664 struct mbuf *control, int flags, struct proc *userproc) 2665 { 2666 struct thread *td; 2667 ssize_t len; 2668 int error; 2669 2670 td = uio->uio_td; 2671 len = uio->uio_resid; 2672 CURVNET_SET(so->so_vnet); 2673 error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, 2674 td); 2675 CURVNET_RESTORE(); 2676 if (error != 0) { 2677 /* 2678 * Clear transient errors for stream protocols if they made 2679 * some progress. Make exclusion for aio(4) that would 2680 * schedule a new write in case of EWOULDBLOCK and clear 2681 * error itself. See soaio_process_job(). 2682 */ 2683 if (uio->uio_resid != len && 2684 (so->so_proto->pr_flags & PR_ATOMIC) == 0 && 2685 userproc == NULL && 2686 (error == ERESTART || error == EINTR || 2687 error == EWOULDBLOCK)) 2688 error = 0; 2689 /* Generation of SIGPIPE can be controlled per socket. */ 2690 if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && 2691 (flags & MSG_NOSIGNAL) == 0) { 2692 if (userproc != NULL) { 2693 /* aio(4) job */ 2694 PROC_LOCK(userproc); 2695 kern_psignal(userproc, SIGPIPE); 2696 PROC_UNLOCK(userproc); 2697 } else { 2698 PROC_LOCK(td->td_proc); 2699 tdsignal(td, SIGPIPE); 2700 PROC_UNLOCK(td->td_proc); 2701 } 2702 } 2703 } 2704 return (error); 2705 } 2706 2707 /* 2708 * The part of soreceive() that implements reading non-inline out-of-band 2709 * data from a socket. For more complete comments, see soreceive(), from 2710 * which this code originated. 
2711 * 2712 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 2713 * unable to return an mbuf chain to the caller. 2714 */ 2715 static int 2716 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 2717 { 2718 struct protosw *pr = so->so_proto; 2719 struct mbuf *m; 2720 int error; 2721 2722 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 2723 VNET_SO_ASSERT(so); 2724 2725 m = m_get(M_WAITOK, MT_DATA); 2726 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 2727 if (error) 2728 goto bad; 2729 do { 2730 error = uiomove(mtod(m, void *), 2731 (int) min(uio->uio_resid, m->m_len), uio); 2732 m = m_free(m); 2733 } while (uio->uio_resid && error == 0 && m); 2734 bad: 2735 if (m != NULL) 2736 m_freem(m); 2737 return (error); 2738 } 2739 2740 /* 2741 * Following replacement or removal of the first mbuf on the first mbuf chain 2742 * of a socket buffer, push necessary state changes back into the socket 2743 * buffer so that other consumers see the values consistently. 'nextrecord' 2744 * is the callers locally stored value of the original value of 2745 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 2746 * NOTE: 'nextrecord' may be NULL. 2747 */ 2748 static __inline void 2749 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 2750 { 2751 2752 SOCKBUF_LOCK_ASSERT(sb); 2753 /* 2754 * First, update for the new value of nextrecord. If necessary, make 2755 * it the first record. 2756 */ 2757 if (sb->sb_mb != NULL) 2758 sb->sb_mb->m_nextpkt = nextrecord; 2759 else 2760 sb->sb_mb = nextrecord; 2761 2762 /* 2763 * Now update any dependent socket buffer fields to reflect the new 2764 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 2765 * addition of a second clause that takes care of the case where 2766 * sb_mb has been updated, but remains the last record. 
2767 */ 2768 if (sb->sb_mb == NULL) { 2769 sb->sb_mbtail = NULL; 2770 sb->sb_lastrecord = NULL; 2771 } else if (sb->sb_mb->m_nextpkt == NULL) 2772 sb->sb_lastrecord = sb->sb_mb; 2773 } 2774 2775 /* 2776 * Implement receive operations on a socket. We depend on the way that 2777 * records are added to the sockbuf by sbappend. In particular, each record 2778 * (mbufs linked through m_next) must begin with an address if the protocol 2779 * so specifies, followed by an optional mbuf or mbufs containing ancillary 2780 * data, and then zero or more mbufs of data. In order to allow parallelism 2781 * between network receive and copying to user space, as well as avoid 2782 * sleeping with a mutex held, we release the socket buffer mutex during the 2783 * user space copy. Although the sockbuf is locked, new data may still be 2784 * appended, and thus we must maintain consistency of the sockbuf during that 2785 * time. 2786 * 2787 * The caller may receive the data as a single mbuf chain by supplying an 2788 * mbuf **mp for use in returning the chain. The uio is then used only for 2789 * the count in uio_resid. 2790 */ 2791 static int 2792 soreceive_generic_locked(struct socket *so, struct sockaddr **psa, 2793 struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) 2794 { 2795 struct mbuf *m; 2796 int flags, error, offset; 2797 ssize_t len; 2798 struct protosw *pr = so->so_proto; 2799 struct mbuf *nextrecord; 2800 int moff, type = 0; 2801 ssize_t orig_resid = uio->uio_resid; 2802 bool report_real_len = false; 2803 2804 SOCK_IO_RECV_ASSERT_LOCKED(so); 2805 2806 error = 0; 2807 if (flagsp != NULL) { 2808 report_real_len = *flagsp & MSG_TRUNC; 2809 *flagsp &= ~MSG_TRUNC; 2810 flags = *flagsp &~ MSG_EOR; 2811 } else 2812 flags = 0; 2813 2814 restart: 2815 SOCKBUF_LOCK(&so->so_rcv); 2816 m = so->so_rcv.sb_mb; 2817 /* 2818 * If we have less data than requested, block awaiting more (subject 2819 * to any timeout) if: 2820 * 1. 
the current count is less than the low water mark, or 2821 * 2. MSG_DONTWAIT is not set 2822 */ 2823 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 2824 sbavail(&so->so_rcv) < uio->uio_resid) && 2825 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 2826 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 2827 KASSERT(m != NULL || !sbavail(&so->so_rcv), 2828 ("receive: m == %p sbavail == %u", 2829 m, sbavail(&so->so_rcv))); 2830 if (so->so_error || so->so_rerror) { 2831 if (m != NULL) 2832 goto dontblock; 2833 if (so->so_error) 2834 error = so->so_error; 2835 else 2836 error = so->so_rerror; 2837 if ((flags & MSG_PEEK) == 0) { 2838 if (so->so_error) 2839 so->so_error = 0; 2840 else 2841 so->so_rerror = 0; 2842 } 2843 SOCKBUF_UNLOCK(&so->so_rcv); 2844 goto release; 2845 } 2846 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2847 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2848 if (m != NULL) 2849 goto dontblock; 2850 #ifdef KERN_TLS 2851 else if (so->so_rcv.sb_tlsdcc == 0 && 2852 so->so_rcv.sb_tlscc == 0) { 2853 #else 2854 else { 2855 #endif 2856 SOCKBUF_UNLOCK(&so->so_rcv); 2857 goto release; 2858 } 2859 } 2860 for (; m != NULL; m = m->m_next) 2861 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2862 m = so->so_rcv.sb_mb; 2863 goto dontblock; 2864 } 2865 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2866 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2867 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2868 SOCKBUF_UNLOCK(&so->so_rcv); 2869 error = ENOTCONN; 2870 goto release; 2871 } 2872 if (uio->uio_resid == 0 && !report_real_len) { 2873 SOCKBUF_UNLOCK(&so->so_rcv); 2874 goto release; 2875 } 2876 if ((so->so_state & SS_NBIO) || 2877 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2878 SOCKBUF_UNLOCK(&so->so_rcv); 2879 error = EWOULDBLOCK; 2880 goto release; 2881 } 2882 SBLASTRECORDCHK(&so->so_rcv); 2883 SBLASTMBUFCHK(&so->so_rcv); 2884 error = sbwait(so, SO_RCV); 2885 SOCKBUF_UNLOCK(&so->so_rcv); 2886 if (error) 2887 goto release; 2888 goto restart; 2889 } 
2890 dontblock: 2891 /* 2892 * From this point onward, we maintain 'nextrecord' as a cache of the 2893 * pointer to the next record in the socket buffer. We must keep the 2894 * various socket buffer pointers and local stack versions of the 2895 * pointers in sync, pushing out modifications before dropping the 2896 * socket buffer mutex, and re-reading them when picking it up. 2897 * 2898 * Otherwise, we will race with the network stack appending new data 2899 * or records onto the socket buffer by using inconsistent/stale 2900 * versions of the field, possibly resulting in socket buffer 2901 * corruption. 2902 * 2903 * By holding the high-level sblock(), we prevent simultaneous 2904 * readers from pulling off the front of the socket buffer. 2905 */ 2906 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2907 if (uio->uio_td) 2908 uio->uio_td->td_ru.ru_msgrcv++; 2909 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2910 SBLASTRECORDCHK(&so->so_rcv); 2911 SBLASTMBUFCHK(&so->so_rcv); 2912 nextrecord = m->m_nextpkt; 2913 if (pr->pr_flags & PR_ADDR) { 2914 KASSERT(m->m_type == MT_SONAME, 2915 ("m->m_type == %d", m->m_type)); 2916 orig_resid = 0; 2917 if (psa != NULL) 2918 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2919 M_NOWAIT); 2920 if (flags & MSG_PEEK) { 2921 m = m->m_next; 2922 } else { 2923 sbfree(&so->so_rcv, m); 2924 so->so_rcv.sb_mb = m_free(m); 2925 m = so->so_rcv.sb_mb; 2926 sockbuf_pushsync(&so->so_rcv, nextrecord); 2927 } 2928 } 2929 2930 /* 2931 * Process one or more MT_CONTROL mbufs present before any data mbufs 2932 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 2933 * just copy the data; if !MSG_PEEK, we call into the protocol to 2934 * perform externalization (or freeing if controlp == NULL). 
2935 */ 2936 if (m != NULL && m->m_type == MT_CONTROL) { 2937 struct mbuf *cm = NULL, *cmn; 2938 struct mbuf **cme = &cm; 2939 #ifdef KERN_TLS 2940 struct cmsghdr *cmsg; 2941 struct tls_get_record tgr; 2942 2943 /* 2944 * For MSG_TLSAPPDATA, check for an alert record. 2945 * If found, return ENXIO without removing 2946 * it from the receive queue. This allows a subsequent 2947 * call without MSG_TLSAPPDATA to receive it. 2948 * Note that, for TLS, there should only be a single 2949 * control mbuf with the TLS_GET_RECORD message in it. 2950 */ 2951 if (flags & MSG_TLSAPPDATA) { 2952 cmsg = mtod(m, struct cmsghdr *); 2953 if (cmsg->cmsg_type == TLS_GET_RECORD && 2954 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2955 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2956 if (__predict_false(tgr.tls_type == 2957 TLS_RLTYPE_ALERT)) { 2958 SOCKBUF_UNLOCK(&so->so_rcv); 2959 error = ENXIO; 2960 goto release; 2961 } 2962 } 2963 } 2964 #endif 2965 2966 do { 2967 if (flags & MSG_PEEK) { 2968 if (controlp != NULL) { 2969 *controlp = m_copym(m, 0, m->m_len, 2970 M_NOWAIT); 2971 controlp = &(*controlp)->m_next; 2972 } 2973 m = m->m_next; 2974 } else { 2975 sbfree(&so->so_rcv, m); 2976 so->so_rcv.sb_mb = m->m_next; 2977 m->m_next = NULL; 2978 *cme = m; 2979 cme = &(*cme)->m_next; 2980 m = so->so_rcv.sb_mb; 2981 } 2982 } while (m != NULL && m->m_type == MT_CONTROL); 2983 if ((flags & MSG_PEEK) == 0) 2984 sockbuf_pushsync(&so->so_rcv, nextrecord); 2985 while (cm != NULL) { 2986 cmn = cm->m_next; 2987 cm->m_next = NULL; 2988 if (controlp != NULL) 2989 *controlp = cm; 2990 else 2991 m_freem(cm); 2992 if (controlp != NULL) { 2993 while (*controlp != NULL) 2994 controlp = &(*controlp)->m_next; 2995 } 2996 cm = cmn; 2997 } 2998 if (m != NULL) 2999 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 3000 else 3001 nextrecord = so->so_rcv.sb_mb; 3002 orig_resid = 0; 3003 } 3004 if (m != NULL) { 3005 if ((flags & MSG_PEEK) == 0) { 3006 KASSERT(m->m_nextpkt == nextrecord, 3007 ("soreceive: post-control, 
nextrecord !sync")); 3008 if (nextrecord == NULL) { 3009 KASSERT(so->so_rcv.sb_mb == m, 3010 ("soreceive: post-control, sb_mb!=m")); 3011 KASSERT(so->so_rcv.sb_lastrecord == m, 3012 ("soreceive: post-control, lastrecord!=m")); 3013 } 3014 } 3015 type = m->m_type; 3016 if (type == MT_OOBDATA) 3017 flags |= MSG_OOB; 3018 } else { 3019 if ((flags & MSG_PEEK) == 0) { 3020 KASSERT(so->so_rcv.sb_mb == nextrecord, 3021 ("soreceive: sb_mb != nextrecord")); 3022 if (so->so_rcv.sb_mb == NULL) { 3023 KASSERT(so->so_rcv.sb_lastrecord == NULL, 3024 ("soreceive: sb_lastercord != NULL")); 3025 } 3026 } 3027 } 3028 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3029 SBLASTRECORDCHK(&so->so_rcv); 3030 SBLASTMBUFCHK(&so->so_rcv); 3031 3032 /* 3033 * Now continue to read any data mbufs off of the head of the socket 3034 * buffer until the read request is satisfied. Note that 'type' is 3035 * used to store the type of any mbuf reads that have happened so far 3036 * such that soreceive() can stop reading if the type changes, which 3037 * causes soreceive() to return only one of regular data and inline 3038 * out-of-band data in a single socket receive operation. 3039 */ 3040 moff = 0; 3041 offset = 0; 3042 while (m != NULL && !(m->m_flags & M_NOTREADY) && uio->uio_resid > 0 && 3043 error == 0) { 3044 /* 3045 * If the type of mbuf has changed since the last mbuf 3046 * examined ('type'), end the receive operation. 3047 */ 3048 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3049 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 3050 if (type != m->m_type) 3051 break; 3052 } else if (type == MT_OOBDATA) 3053 break; 3054 else 3055 KASSERT(m->m_type == MT_DATA, 3056 ("m->m_type == %d", m->m_type)); 3057 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 3058 len = uio->uio_resid; 3059 if (so->so_oobmark && len > so->so_oobmark - offset) 3060 len = so->so_oobmark - offset; 3061 if (len > m->m_len - moff) 3062 len = m->m_len - moff; 3063 /* 3064 * If mp is set, just pass back the mbufs. 
Otherwise copy 3065 * them out via the uio, then free. Sockbuf must be 3066 * consistent here (points to current mbuf, it points to next 3067 * record) when we drop priority; we must note any additions 3068 * to the sockbuf when we block interrupts again. 3069 */ 3070 if (mp == NULL) { 3071 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3072 SBLASTRECORDCHK(&so->so_rcv); 3073 SBLASTMBUFCHK(&so->so_rcv); 3074 SOCKBUF_UNLOCK(&so->so_rcv); 3075 if ((m->m_flags & M_EXTPG) != 0) 3076 error = m_unmapped_uiomove(m, moff, uio, 3077 (int)len); 3078 else 3079 error = uiomove(mtod(m, char *) + moff, 3080 (int)len, uio); 3081 SOCKBUF_LOCK(&so->so_rcv); 3082 if (error) { 3083 /* 3084 * The MT_SONAME mbuf has already been removed 3085 * from the record, so it is necessary to 3086 * remove the data mbufs, if any, to preserve 3087 * the invariant in the case of PR_ADDR that 3088 * requires MT_SONAME mbufs at the head of 3089 * each record. 3090 */ 3091 if (pr->pr_flags & PR_ATOMIC && 3092 ((flags & MSG_PEEK) == 0)) 3093 (void)sbdroprecord_locked(&so->so_rcv); 3094 SOCKBUF_UNLOCK(&so->so_rcv); 3095 goto release; 3096 } 3097 } else 3098 uio->uio_resid -= len; 3099 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3100 if (len == m->m_len - moff) { 3101 if (m->m_flags & M_EOR) 3102 flags |= MSG_EOR; 3103 if (flags & MSG_PEEK) { 3104 m = m->m_next; 3105 moff = 0; 3106 } else { 3107 nextrecord = m->m_nextpkt; 3108 sbfree(&so->so_rcv, m); 3109 if (mp != NULL) { 3110 m->m_nextpkt = NULL; 3111 *mp = m; 3112 mp = &m->m_next; 3113 so->so_rcv.sb_mb = m = m->m_next; 3114 *mp = NULL; 3115 } else { 3116 so->so_rcv.sb_mb = m_free(m); 3117 m = so->so_rcv.sb_mb; 3118 } 3119 sockbuf_pushsync(&so->so_rcv, nextrecord); 3120 SBLASTRECORDCHK(&so->so_rcv); 3121 SBLASTMBUFCHK(&so->so_rcv); 3122 } 3123 } else { 3124 if (flags & MSG_PEEK) 3125 moff += len; 3126 else { 3127 if (mp != NULL) { 3128 if (flags & MSG_DONTWAIT) { 3129 *mp = m_copym(m, 0, len, 3130 M_NOWAIT); 3131 if (*mp == NULL) { 3132 /* 3133 * m_copym() couldn't 3134 * 
allocate an mbuf. 3135 * Adjust uio_resid back 3136 * (it was adjusted 3137 * down by len bytes, 3138 * which we didn't end 3139 * up "copying" over). 3140 */ 3141 uio->uio_resid += len; 3142 break; 3143 } 3144 } else { 3145 SOCKBUF_UNLOCK(&so->so_rcv); 3146 *mp = m_copym(m, 0, len, 3147 M_WAITOK); 3148 SOCKBUF_LOCK(&so->so_rcv); 3149 } 3150 } 3151 sbcut_locked(&so->so_rcv, len); 3152 } 3153 } 3154 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3155 if (so->so_oobmark) { 3156 if ((flags & MSG_PEEK) == 0) { 3157 so->so_oobmark -= len; 3158 if (so->so_oobmark == 0) { 3159 so->so_rcv.sb_state |= SBS_RCVATMARK; 3160 break; 3161 } 3162 } else { 3163 offset += len; 3164 if (offset == so->so_oobmark) 3165 break; 3166 } 3167 } 3168 if (flags & MSG_EOR) 3169 break; 3170 /* 3171 * If the MSG_WAITALL flag is set (for non-atomic socket), we 3172 * must not quit until "uio->uio_resid == 0" or an error 3173 * termination. If a signal/timeout occurs, return with a 3174 * short count but without error. Keep sockbuf locked 3175 * against other readers. 3176 */ 3177 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 3178 !sosendallatonce(so) && nextrecord == NULL) { 3179 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3180 if (so->so_error || so->so_rerror || 3181 so->so_rcv.sb_state & SBS_CANTRCVMORE) 3182 break; 3183 /* 3184 * Notify the protocol that some data has been 3185 * drained before blocking. 3186 */ 3187 if (pr->pr_flags & PR_WANTRCVD) { 3188 SOCKBUF_UNLOCK(&so->so_rcv); 3189 VNET_SO_ASSERT(so); 3190 pr->pr_rcvd(so, flags); 3191 SOCKBUF_LOCK(&so->so_rcv); 3192 if (__predict_false(so->so_rcv.sb_mb == NULL && 3193 (so->so_error || so->so_rerror || 3194 so->so_rcv.sb_state & SBS_CANTRCVMORE))) 3195 break; 3196 } 3197 SBLASTRECORDCHK(&so->so_rcv); 3198 SBLASTMBUFCHK(&so->so_rcv); 3199 /* 3200 * We could receive some data while was notifying 3201 * the protocol. Skip blocking in this case. 
3202 */ 3203 if (so->so_rcv.sb_mb == NULL) { 3204 error = sbwait(so, SO_RCV); 3205 if (error) { 3206 SOCKBUF_UNLOCK(&so->so_rcv); 3207 goto release; 3208 } 3209 } 3210 m = so->so_rcv.sb_mb; 3211 if (m != NULL) 3212 nextrecord = m->m_nextpkt; 3213 } 3214 } 3215 3216 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3217 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 3218 if (report_real_len) 3219 uio->uio_resid -= m_length(m, NULL) - moff; 3220 flags |= MSG_TRUNC; 3221 if ((flags & MSG_PEEK) == 0) 3222 (void) sbdroprecord_locked(&so->so_rcv); 3223 } 3224 if ((flags & MSG_PEEK) == 0) { 3225 if (m == NULL) { 3226 /* 3227 * First part is an inline SB_EMPTY_FIXUP(). Second 3228 * part makes sure sb_lastrecord is up-to-date if 3229 * there is still data in the socket buffer. 3230 */ 3231 so->so_rcv.sb_mb = nextrecord; 3232 if (so->so_rcv.sb_mb == NULL) { 3233 so->so_rcv.sb_mbtail = NULL; 3234 so->so_rcv.sb_lastrecord = NULL; 3235 } else if (nextrecord->m_nextpkt == NULL) 3236 so->so_rcv.sb_lastrecord = nextrecord; 3237 } 3238 SBLASTRECORDCHK(&so->so_rcv); 3239 SBLASTMBUFCHK(&so->so_rcv); 3240 /* 3241 * If soreceive() is being done from the socket callback, 3242 * then don't need to generate ACK to peer to update window, 3243 * since ACK will be generated on return to TCP. 
3244 */ 3245 if (!(flags & MSG_SOCALLBCK) && 3246 (pr->pr_flags & PR_WANTRCVD)) { 3247 SOCKBUF_UNLOCK(&so->so_rcv); 3248 VNET_SO_ASSERT(so); 3249 pr->pr_rcvd(so, flags); 3250 SOCKBUF_LOCK(&so->so_rcv); 3251 } 3252 } 3253 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3254 if (orig_resid == uio->uio_resid && orig_resid && 3255 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 3256 SOCKBUF_UNLOCK(&so->so_rcv); 3257 goto restart; 3258 } 3259 SOCKBUF_UNLOCK(&so->so_rcv); 3260 3261 if (flagsp != NULL) 3262 *flagsp |= flags; 3263 release: 3264 return (error); 3265 } 3266 3267 int 3268 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 3269 struct mbuf **mp, struct mbuf **controlp, int *flagsp) 3270 { 3271 int error, flags; 3272 3273 if (psa != NULL) 3274 *psa = NULL; 3275 if (controlp != NULL) 3276 *controlp = NULL; 3277 if (flagsp != NULL) { 3278 flags = *flagsp; 3279 if ((flags & MSG_OOB) != 0) 3280 return (soreceive_rcvoob(so, uio, flags)); 3281 } else { 3282 flags = 0; 3283 } 3284 if (mp != NULL) 3285 *mp = NULL; 3286 3287 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 3288 if (error) 3289 return (error); 3290 error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp); 3291 SOCK_IO_RECV_UNLOCK(so); 3292 return (error); 3293 } 3294 3295 /* 3296 * Optimized version of soreceive() for stream (TCP) sockets. 3297 */ 3298 static int 3299 soreceive_stream_locked(struct socket *so, struct sockbuf *sb, 3300 struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, 3301 struct mbuf **controlp, int flags) 3302 { 3303 int len = 0, error = 0, oresid; 3304 struct mbuf *m, *n = NULL; 3305 3306 SOCK_IO_RECV_ASSERT_LOCKED(so); 3307 3308 /* Easy one, no space to copyout anything. */ 3309 if (uio->uio_resid == 0) 3310 return (EINVAL); 3311 oresid = uio->uio_resid; 3312 3313 SOCKBUF_LOCK(sb); 3314 /* We will never ever get anything unless we are or were connected. 
*/ 3315 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 3316 error = ENOTCONN; 3317 goto out; 3318 } 3319 3320 restart: 3321 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3322 3323 /* Abort if socket has reported problems. */ 3324 if (so->so_error) { 3325 if (sbavail(sb) > 0) 3326 goto deliver; 3327 if (oresid > uio->uio_resid) 3328 goto out; 3329 error = so->so_error; 3330 if (!(flags & MSG_PEEK)) 3331 so->so_error = 0; 3332 goto out; 3333 } 3334 3335 /* Door is closed. Deliver what is left, if any. */ 3336 if (sb->sb_state & SBS_CANTRCVMORE) { 3337 if (sbavail(sb) > 0) 3338 goto deliver; 3339 else 3340 goto out; 3341 } 3342 3343 /* Socket buffer is empty and we shall not block. */ 3344 if (sbavail(sb) == 0 && 3345 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 3346 error = EAGAIN; 3347 goto out; 3348 } 3349 3350 /* Socket buffer got some data that we shall deliver now. */ 3351 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 3352 ((so->so_state & SS_NBIO) || 3353 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 3354 sbavail(sb) >= sb->sb_lowat || 3355 sbavail(sb) >= uio->uio_resid || 3356 sbavail(sb) >= sb->sb_hiwat) ) { 3357 goto deliver; 3358 } 3359 3360 /* On MSG_WAITALL we must wait until all data or error arrives. */ 3361 if ((flags & MSG_WAITALL) && 3362 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 3363 goto deliver; 3364 3365 /* 3366 * Wait and block until (more) data comes in. 3367 * NB: Drops the sockbuf lock during wait. 3368 */ 3369 error = sbwait(so, SO_RCV); 3370 if (error) 3371 goto out; 3372 goto restart; 3373 3374 deliver: 3375 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3376 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 3377 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 3378 3379 /* Statistics. */ 3380 if (uio->uio_td) 3381 uio->uio_td->td_ru.ru_msgrcv++; 3382 3383 /* Fill uio until full or current end of socket buffer is reached. 
*/ 3384 len = min(uio->uio_resid, sbavail(sb)); 3385 if (mp0 != NULL) { 3386 /* Dequeue as many mbufs as possible. */ 3387 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 3388 if (*mp0 == NULL) 3389 *mp0 = sb->sb_mb; 3390 else 3391 m_cat(*mp0, sb->sb_mb); 3392 for (m = sb->sb_mb; 3393 m != NULL && m->m_len <= len; 3394 m = m->m_next) { 3395 KASSERT(!(m->m_flags & M_NOTREADY), 3396 ("%s: m %p not available", __func__, m)); 3397 len -= m->m_len; 3398 uio->uio_resid -= m->m_len; 3399 sbfree(sb, m); 3400 n = m; 3401 } 3402 n->m_next = NULL; 3403 sb->sb_mb = m; 3404 sb->sb_lastrecord = sb->sb_mb; 3405 if (sb->sb_mb == NULL) 3406 SB_EMPTY_FIXUP(sb); 3407 } 3408 /* Copy the remainder. */ 3409 if (len > 0) { 3410 KASSERT(sb->sb_mb != NULL, 3411 ("%s: len > 0 && sb->sb_mb empty", __func__)); 3412 3413 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 3414 if (m == NULL) 3415 len = 0; /* Don't flush data from sockbuf. */ 3416 else 3417 uio->uio_resid -= len; 3418 if (*mp0 != NULL) 3419 m_cat(*mp0, m); 3420 else 3421 *mp0 = m; 3422 if (*mp0 == NULL) { 3423 error = ENOBUFS; 3424 goto out; 3425 } 3426 } 3427 } else { 3428 /* NB: Must unlock socket buffer as uiomove may sleep. */ 3429 SOCKBUF_UNLOCK(sb); 3430 error = m_mbuftouio(uio, sb->sb_mb, len); 3431 SOCKBUF_LOCK(sb); 3432 if (error) 3433 goto out; 3434 } 3435 SBLASTRECORDCHK(sb); 3436 SBLASTMBUFCHK(sb); 3437 3438 /* 3439 * Remove the delivered data from the socket buffer unless we 3440 * were only peeking. 3441 */ 3442 if (!(flags & MSG_PEEK)) { 3443 if (len > 0) 3444 sbdrop_locked(sb, len); 3445 3446 /* Notify protocol that we drained some data. */ 3447 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 3448 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 3449 !(flags & MSG_SOCALLBCK))) { 3450 SOCKBUF_UNLOCK(sb); 3451 VNET_SO_ASSERT(so); 3452 so->so_proto->pr_rcvd(so, flags); 3453 SOCKBUF_LOCK(sb); 3454 } 3455 } 3456 3457 /* 3458 * For MSG_WAITALL we may have to loop again and wait for 3459 * more data to come in. 
3460 */ 3461 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 3462 goto restart; 3463 out: 3464 SBLASTRECORDCHK(sb); 3465 SBLASTMBUFCHK(sb); 3466 SOCKBUF_UNLOCK(sb); 3467 return (error); 3468 } 3469 3470 int 3471 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 3472 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3473 { 3474 struct sockbuf *sb; 3475 int error, flags; 3476 3477 sb = &so->so_rcv; 3478 3479 /* We only do stream sockets. */ 3480 if (so->so_type != SOCK_STREAM) 3481 return (EINVAL); 3482 if (psa != NULL) 3483 *psa = NULL; 3484 if (flagsp != NULL) 3485 flags = *flagsp & ~MSG_EOR; 3486 else 3487 flags = 0; 3488 if (controlp != NULL) 3489 *controlp = NULL; 3490 if (flags & MSG_OOB) 3491 return (soreceive_rcvoob(so, uio, flags)); 3492 if (mp0 != NULL) 3493 *mp0 = NULL; 3494 3495 #ifdef KERN_TLS 3496 /* 3497 * KTLS store TLS records as records with a control message to 3498 * describe the framing. 3499 * 3500 * We check once here before acquiring locks to optimize the 3501 * common case. 3502 */ 3503 if (sb->sb_tls_info != NULL) 3504 return (soreceive_generic(so, psa, uio, mp0, controlp, 3505 flagsp)); 3506 #endif 3507 3508 /* 3509 * Prevent other threads from reading from the socket. This lock may be 3510 * dropped in order to sleep waiting for data to arrive. 3511 */ 3512 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 3513 if (error) 3514 return (error); 3515 #ifdef KERN_TLS 3516 if (__predict_false(sb->sb_tls_info != NULL)) { 3517 SOCK_IO_RECV_UNLOCK(so); 3518 return (soreceive_generic(so, psa, uio, mp0, controlp, 3519 flagsp)); 3520 } 3521 #endif 3522 error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags); 3523 SOCK_IO_RECV_UNLOCK(so); 3524 return (error); 3525 } 3526 3527 /* 3528 * Optimized version of soreceive() for simple datagram cases from userspace. 
3529 * Unlike in the stream case, we're able to drop a datagram if copyout() 3530 * fails, and because we handle datagrams atomically, we don't need to use a 3531 * sleep lock to prevent I/O interlacing. 3532 */ 3533 int 3534 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 3535 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3536 { 3537 struct mbuf *m, *m2; 3538 int flags, error; 3539 ssize_t len; 3540 struct protosw *pr = so->so_proto; 3541 struct mbuf *nextrecord; 3542 3543 if (psa != NULL) 3544 *psa = NULL; 3545 if (controlp != NULL) 3546 *controlp = NULL; 3547 if (flagsp != NULL) 3548 flags = *flagsp &~ MSG_EOR; 3549 else 3550 flags = 0; 3551 3552 /* 3553 * For any complicated cases, fall back to the full 3554 * soreceive_generic(). 3555 */ 3556 if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) 3557 return (soreceive_generic(so, psa, uio, mp0, controlp, 3558 flagsp)); 3559 3560 /* 3561 * Enforce restrictions on use. 3562 */ 3563 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 3564 ("soreceive_dgram: wantrcvd")); 3565 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 3566 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 3567 ("soreceive_dgram: SBS_RCVATMARK")); 3568 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 3569 ("soreceive_dgram: P_CONNREQUIRED")); 3570 3571 /* 3572 * Loop blocking while waiting for a datagram. 
3573 */ 3574 SOCKBUF_LOCK(&so->so_rcv); 3575 while ((m = so->so_rcv.sb_mb) == NULL) { 3576 KASSERT(sbavail(&so->so_rcv) == 0, 3577 ("soreceive_dgram: sb_mb NULL but sbavail %u", 3578 sbavail(&so->so_rcv))); 3579 if (so->so_error) { 3580 error = so->so_error; 3581 so->so_error = 0; 3582 SOCKBUF_UNLOCK(&so->so_rcv); 3583 return (error); 3584 } 3585 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 3586 uio->uio_resid == 0) { 3587 SOCKBUF_UNLOCK(&so->so_rcv); 3588 return (0); 3589 } 3590 if ((so->so_state & SS_NBIO) || 3591 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 3592 SOCKBUF_UNLOCK(&so->so_rcv); 3593 return (EWOULDBLOCK); 3594 } 3595 SBLASTRECORDCHK(&so->so_rcv); 3596 SBLASTMBUFCHK(&so->so_rcv); 3597 error = sbwait(so, SO_RCV); 3598 if (error) { 3599 SOCKBUF_UNLOCK(&so->so_rcv); 3600 return (error); 3601 } 3602 } 3603 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3604 3605 if (uio->uio_td) 3606 uio->uio_td->td_ru.ru_msgrcv++; 3607 SBLASTRECORDCHK(&so->so_rcv); 3608 SBLASTMBUFCHK(&so->so_rcv); 3609 nextrecord = m->m_nextpkt; 3610 if (nextrecord == NULL) { 3611 KASSERT(so->so_rcv.sb_lastrecord == m, 3612 ("soreceive_dgram: lastrecord != m")); 3613 } 3614 3615 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 3616 ("soreceive_dgram: m_nextpkt != nextrecord")); 3617 3618 /* 3619 * Pull 'm' and its chain off the front of the packet queue. 3620 */ 3621 so->so_rcv.sb_mb = NULL; 3622 sockbuf_pushsync(&so->so_rcv, nextrecord); 3623 3624 /* 3625 * Walk 'm's chain and free that many bytes from the socket buffer. 3626 */ 3627 for (m2 = m; m2 != NULL; m2 = m2->m_next) 3628 sbfree(&so->so_rcv, m2); 3629 3630 /* 3631 * Do a few last checks before we let go of the lock. 
3632 */ 3633 SBLASTRECORDCHK(&so->so_rcv); 3634 SBLASTMBUFCHK(&so->so_rcv); 3635 SOCKBUF_UNLOCK(&so->so_rcv); 3636 3637 if (pr->pr_flags & PR_ADDR) { 3638 KASSERT(m->m_type == MT_SONAME, 3639 ("m->m_type == %d", m->m_type)); 3640 if (psa != NULL) 3641 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 3642 M_WAITOK); 3643 m = m_free(m); 3644 } 3645 KASSERT(m, ("%s: no data or control after soname", __func__)); 3646 3647 /* 3648 * Packet to copyout() is now in 'm' and it is disconnected from the 3649 * queue. 3650 * 3651 * Process one or more MT_CONTROL mbufs present before any data mbufs 3652 * in the first mbuf chain on the socket buffer. We call into the 3653 * protocol to perform externalization (or freeing if controlp == 3654 * NULL). In some cases there can be only MT_CONTROL mbufs without 3655 * MT_DATA mbufs. 3656 */ 3657 if (m->m_type == MT_CONTROL) { 3658 struct mbuf *cm = NULL, *cmn; 3659 struct mbuf **cme = &cm; 3660 3661 do { 3662 m2 = m->m_next; 3663 m->m_next = NULL; 3664 *cme = m; 3665 cme = &(*cme)->m_next; 3666 m = m2; 3667 } while (m != NULL && m->m_type == MT_CONTROL); 3668 while (cm != NULL) { 3669 cmn = cm->m_next; 3670 cm->m_next = NULL; 3671 if (controlp != NULL) 3672 *controlp = cm; 3673 else 3674 m_freem(cm); 3675 if (controlp != NULL) { 3676 while (*controlp != NULL) 3677 controlp = &(*controlp)->m_next; 3678 } 3679 cm = cmn; 3680 } 3681 } 3682 KASSERT(m == NULL || m->m_type == MT_DATA, 3683 ("soreceive_dgram: !data")); 3684 while (m != NULL && uio->uio_resid > 0) { 3685 len = uio->uio_resid; 3686 if (len > m->m_len) 3687 len = m->m_len; 3688 error = uiomove(mtod(m, char *), (int)len, uio); 3689 if (error) { 3690 m_freem(m); 3691 return (error); 3692 } 3693 if (len == m->m_len) 3694 m = m_free(m); 3695 else { 3696 m->m_data += len; 3697 m->m_len -= len; 3698 } 3699 } 3700 if (m != NULL) { 3701 flags |= MSG_TRUNC; 3702 m_freem(m); 3703 } 3704 if (flagsp != NULL) 3705 *flagsp |= flags; 3706 return (0); 3707 } 3708 3709 int 3710 
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 3711 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3712 { 3713 int error; 3714 3715 CURVNET_SET(so->so_vnet); 3716 error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); 3717 CURVNET_RESTORE(); 3718 return (error); 3719 } 3720 3721 int 3722 soshutdown(struct socket *so, enum shutdown_how how) 3723 { 3724 int error; 3725 3726 CURVNET_SET(so->so_vnet); 3727 error = so->so_proto->pr_shutdown(so, how); 3728 CURVNET_RESTORE(); 3729 3730 return (error); 3731 } 3732 3733 /* 3734 * Used by several pr_shutdown implementations that use generic socket buffers. 3735 */ 3736 void 3737 sorflush(struct socket *so) 3738 { 3739 int error; 3740 3741 VNET_SO_ASSERT(so); 3742 3743 /* 3744 * Dislodge threads currently blocked in receive and wait to acquire 3745 * a lock against other simultaneous readers before clearing the 3746 * socket buffer. Don't let our acquire be interrupted by a signal 3747 * despite any existing socket disposition on interruptable waiting. 3748 * 3749 * The SOCK_IO_RECV_LOCK() is important here as there some pr_soreceive 3750 * methods that read the top of the socket buffer without acquisition 3751 * of the socket buffer mutex, assuming that top of the buffer 3752 * exclusively belongs to the read(2) syscall. This is handy when 3753 * performing MSG_PEEK. 3754 */ 3755 socantrcvmore(so); 3756 3757 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); 3758 if (error != 0) { 3759 KASSERT(SOLISTENING(so), 3760 ("%s: soiolock(%p) failed", __func__, so)); 3761 return; 3762 } 3763 3764 sbrelease(so, SO_RCV); 3765 SOCK_IO_RECV_UNLOCK(so); 3766 3767 } 3768 3769 int 3770 sosetfib(struct socket *so, int fibnum) 3771 { 3772 if (fibnum < 0 || fibnum >= rt_numfibs) 3773 return (EINVAL); 3774 3775 SOCK_LOCK(so); 3776 so->so_fibnum = fibnum; 3777 SOCK_UNLOCK(so); 3778 3779 return (0); 3780 } 3781 3782 #ifdef SOCKET_HHOOK 3783 /* 3784 * Wrapper for Socket established helper hook. 
3785 * Parameters: socket, context of the hook point, hook id. 3786 */ 3787 static inline int 3788 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 3789 { 3790 struct socket_hhook_data hhook_data = { 3791 .so = so, 3792 .hctx = hctx, 3793 .m = NULL, 3794 .status = 0 3795 }; 3796 3797 CURVNET_SET(so->so_vnet); 3798 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 3799 CURVNET_RESTORE(); 3800 3801 /* Ugly but needed, since hhooks return void for now */ 3802 return (hhook_data.status); 3803 } 3804 #endif 3805 3806 /* 3807 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 3808 * additional variant to handle the case where the option value needs to be 3809 * some kind of integer, but not a specific size. In addition to their use 3810 * here, these functions are also called by the protocol-level pr_ctloutput() 3811 * routines. 3812 */ 3813 int 3814 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 3815 { 3816 size_t valsize; 3817 3818 /* 3819 * If the user gives us more than we wanted, we ignore it, but if we 3820 * don't get the minimum length the caller wants, we return EINVAL. 3821 * On success, sopt->sopt_valsize is set to however much we actually 3822 * retrieved. 3823 */ 3824 if ((valsize = sopt->sopt_valsize) < minlen) 3825 return EINVAL; 3826 if (valsize > len) 3827 sopt->sopt_valsize = valsize = len; 3828 3829 if (sopt->sopt_td != NULL) 3830 return (copyin(sopt->sopt_val, buf, valsize)); 3831 3832 bcopy(sopt->sopt_val, buf, valsize); 3833 return (0); 3834 } 3835 3836 /* 3837 * Kernel version of setsockopt(2). 
3838 * 3839 * XXX: optlen is size_t, not socklen_t 3840 */ 3841 int 3842 so_setsockopt(struct socket *so, int level, int optname, void *optval, 3843 size_t optlen) 3844 { 3845 struct sockopt sopt; 3846 3847 sopt.sopt_level = level; 3848 sopt.sopt_name = optname; 3849 sopt.sopt_dir = SOPT_SET; 3850 sopt.sopt_val = optval; 3851 sopt.sopt_valsize = optlen; 3852 sopt.sopt_td = NULL; 3853 return (sosetopt(so, &sopt)); 3854 } 3855 3856 int 3857 sosetopt(struct socket *so, struct sockopt *sopt) 3858 { 3859 int error, optval; 3860 struct linger l; 3861 struct timeval tv; 3862 sbintime_t val, *valp; 3863 uint32_t val32; 3864 #ifdef MAC 3865 struct mac extmac; 3866 #endif 3867 3868 CURVNET_SET(so->so_vnet); 3869 error = 0; 3870 if (sopt->sopt_level != SOL_SOCKET) { 3871 error = so->so_proto->pr_ctloutput(so, sopt); 3872 } else { 3873 switch (sopt->sopt_name) { 3874 case SO_ACCEPTFILTER: 3875 error = accept_filt_setopt(so, sopt); 3876 if (error) 3877 goto bad; 3878 break; 3879 3880 case SO_LINGER: 3881 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 3882 if (error) 3883 goto bad; 3884 if (l.l_linger < 0 || 3885 l.l_linger > USHRT_MAX || 3886 l.l_linger > (INT_MAX / hz)) { 3887 error = EDOM; 3888 goto bad; 3889 } 3890 SOCK_LOCK(so); 3891 so->so_linger = l.l_linger; 3892 if (l.l_onoff) 3893 so->so_options |= SO_LINGER; 3894 else 3895 so->so_options &= ~SO_LINGER; 3896 SOCK_UNLOCK(so); 3897 break; 3898 3899 case SO_DEBUG: 3900 case SO_KEEPALIVE: 3901 case SO_DONTROUTE: 3902 case SO_USELOOPBACK: 3903 case SO_BROADCAST: 3904 case SO_REUSEADDR: 3905 case SO_REUSEPORT: 3906 case SO_REUSEPORT_LB: 3907 case SO_OOBINLINE: 3908 case SO_TIMESTAMP: 3909 case SO_BINTIME: 3910 case SO_NOSIGPIPE: 3911 case SO_NO_DDP: 3912 case SO_NO_OFFLOAD: 3913 case SO_RERROR: 3914 error = sooptcopyin(sopt, &optval, sizeof optval, 3915 sizeof optval); 3916 if (error) 3917 goto bad; 3918 SOCK_LOCK(so); 3919 if (optval) 3920 so->so_options |= sopt->sopt_name; 3921 else 3922 so->so_options &= 
~sopt->sopt_name; 3923 SOCK_UNLOCK(so); 3924 break; 3925 3926 case SO_SETFIB: 3927 error = so->so_proto->pr_ctloutput(so, sopt); 3928 break; 3929 3930 case SO_USER_COOKIE: 3931 error = sooptcopyin(sopt, &val32, sizeof val32, 3932 sizeof val32); 3933 if (error) 3934 goto bad; 3935 so->so_user_cookie = val32; 3936 break; 3937 3938 case SO_SNDBUF: 3939 case SO_RCVBUF: 3940 case SO_SNDLOWAT: 3941 case SO_RCVLOWAT: 3942 error = so->so_proto->pr_setsbopt(so, sopt); 3943 if (error) 3944 goto bad; 3945 break; 3946 3947 case SO_SNDTIMEO: 3948 case SO_RCVTIMEO: 3949 #ifdef COMPAT_FREEBSD32 3950 if (SV_CURPROC_FLAG(SV_ILP32)) { 3951 struct timeval32 tv32; 3952 3953 error = sooptcopyin(sopt, &tv32, sizeof tv32, 3954 sizeof tv32); 3955 CP(tv32, tv, tv_sec); 3956 CP(tv32, tv, tv_usec); 3957 } else 3958 #endif 3959 error = sooptcopyin(sopt, &tv, sizeof tv, 3960 sizeof tv); 3961 if (error) 3962 goto bad; 3963 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 3964 tv.tv_usec >= 1000000) { 3965 error = EDOM; 3966 goto bad; 3967 } 3968 if (tv.tv_sec > INT32_MAX) 3969 val = SBT_MAX; 3970 else 3971 val = tvtosbt(tv); 3972 SOCK_LOCK(so); 3973 valp = sopt->sopt_name == SO_SNDTIMEO ? 3974 (SOLISTENING(so) ? &so->sol_sbsnd_timeo : 3975 &so->so_snd.sb_timeo) : 3976 (SOLISTENING(so) ? 
&so->sol_sbrcv_timeo : 3977 &so->so_rcv.sb_timeo); 3978 *valp = val; 3979 SOCK_UNLOCK(so); 3980 break; 3981 3982 case SO_LABEL: 3983 #ifdef MAC 3984 error = sooptcopyin(sopt, &extmac, sizeof extmac, 3985 sizeof extmac); 3986 if (error) 3987 goto bad; 3988 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 3989 so, &extmac); 3990 #else 3991 error = EOPNOTSUPP; 3992 #endif 3993 break; 3994 3995 case SO_TS_CLOCK: 3996 error = sooptcopyin(sopt, &optval, sizeof optval, 3997 sizeof optval); 3998 if (error) 3999 goto bad; 4000 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 4001 error = EINVAL; 4002 goto bad; 4003 } 4004 so->so_ts_clock = optval; 4005 break; 4006 4007 case SO_MAX_PACING_RATE: 4008 error = sooptcopyin(sopt, &val32, sizeof(val32), 4009 sizeof(val32)); 4010 if (error) 4011 goto bad; 4012 so->so_max_pacing_rate = val32; 4013 break; 4014 4015 case SO_SPLICE: { 4016 struct splice splice; 4017 4018 #ifdef COMPAT_FREEBSD32 4019 if (SV_CURPROC_FLAG(SV_ILP32)) { 4020 struct splice32 splice32; 4021 4022 error = sooptcopyin(sopt, &splice32, 4023 sizeof(splice32), sizeof(splice32)); 4024 if (error == 0) { 4025 splice.sp_fd = splice32.sp_fd; 4026 splice.sp_max = splice32.sp_max; 4027 CP(splice32.sp_idle, splice.sp_idle, 4028 tv_sec); 4029 CP(splice32.sp_idle, splice.sp_idle, 4030 tv_usec); 4031 } 4032 } else 4033 #endif 4034 { 4035 error = sooptcopyin(sopt, &splice, 4036 sizeof(splice), sizeof(splice)); 4037 } 4038 if (error) 4039 goto bad; 4040 #ifdef KTRACE 4041 if (KTRPOINT(curthread, KTR_STRUCT)) 4042 ktrsplice(&splice); 4043 #endif 4044 4045 error = splice_init(); 4046 if (error != 0) 4047 goto bad; 4048 4049 if (splice.sp_fd >= 0) { 4050 struct file *fp; 4051 struct socket *so2; 4052 4053 if (!cap_rights_contains(sopt->sopt_rights, 4054 &cap_recv_rights)) { 4055 error = ENOTCAPABLE; 4056 goto bad; 4057 } 4058 error = getsock(sopt->sopt_td, splice.sp_fd, 4059 &cap_send_rights, &fp); 4060 if (error != 0) 4061 goto bad; 4062 so2 = fp->f_data; 4063 4064 error = 
so_splice(so, so2, &splice); 4065 fdrop(fp, sopt->sopt_td); 4066 } else { 4067 error = so_unsplice(so, false); 4068 } 4069 break; 4070 } 4071 default: 4072 #ifdef SOCKET_HHOOK 4073 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 4074 error = hhook_run_socket(so, sopt, 4075 HHOOK_SOCKET_OPT); 4076 else 4077 #endif 4078 error = ENOPROTOOPT; 4079 break; 4080 } 4081 if (error == 0) 4082 (void)so->so_proto->pr_ctloutput(so, sopt); 4083 } 4084 bad: 4085 CURVNET_RESTORE(); 4086 return (error); 4087 } 4088 4089 /* 4090 * Helper routine for getsockopt. 4091 */ 4092 int 4093 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 4094 { 4095 int error; 4096 size_t valsize; 4097 4098 error = 0; 4099 4100 /* 4101 * Documented get behavior is that we always return a value, possibly 4102 * truncated to fit in the user's buffer. Traditional behavior is 4103 * that we always tell the user precisely how much we copied, rather 4104 * than something useful like the total amount we had available for 4105 * her. Note that this interface is not idempotent; the entire 4106 * answer must be generated ahead of time. 
4107 */ 4108 valsize = min(len, sopt->sopt_valsize); 4109 sopt->sopt_valsize = valsize; 4110 if (sopt->sopt_val != NULL) { 4111 if (sopt->sopt_td != NULL) 4112 error = copyout(buf, sopt->sopt_val, valsize); 4113 else 4114 bcopy(buf, sopt->sopt_val, valsize); 4115 } 4116 return (error); 4117 } 4118 4119 int 4120 sogetopt(struct socket *so, struct sockopt *sopt) 4121 { 4122 int error, optval; 4123 struct linger l; 4124 struct timeval tv; 4125 #ifdef MAC 4126 struct mac extmac; 4127 #endif 4128 4129 CURVNET_SET(so->so_vnet); 4130 error = 0; 4131 if (sopt->sopt_level != SOL_SOCKET) { 4132 error = so->so_proto->pr_ctloutput(so, sopt); 4133 CURVNET_RESTORE(); 4134 return (error); 4135 } else { 4136 switch (sopt->sopt_name) { 4137 case SO_ACCEPTFILTER: 4138 error = accept_filt_getopt(so, sopt); 4139 break; 4140 4141 case SO_LINGER: 4142 SOCK_LOCK(so); 4143 l.l_onoff = so->so_options & SO_LINGER; 4144 l.l_linger = so->so_linger; 4145 SOCK_UNLOCK(so); 4146 error = sooptcopyout(sopt, &l, sizeof l); 4147 break; 4148 4149 case SO_USELOOPBACK: 4150 case SO_DONTROUTE: 4151 case SO_DEBUG: 4152 case SO_KEEPALIVE: 4153 case SO_REUSEADDR: 4154 case SO_REUSEPORT: 4155 case SO_REUSEPORT_LB: 4156 case SO_BROADCAST: 4157 case SO_OOBINLINE: 4158 case SO_ACCEPTCONN: 4159 case SO_TIMESTAMP: 4160 case SO_BINTIME: 4161 case SO_NOSIGPIPE: 4162 case SO_NO_DDP: 4163 case SO_NO_OFFLOAD: 4164 case SO_RERROR: 4165 optval = so->so_options & sopt->sopt_name; 4166 integer: 4167 error = sooptcopyout(sopt, &optval, sizeof optval); 4168 break; 4169 4170 case SO_FIB: 4171 SOCK_LOCK(so); 4172 optval = so->so_fibnum; 4173 SOCK_UNLOCK(so); 4174 goto integer; 4175 4176 case SO_DOMAIN: 4177 optval = so->so_proto->pr_domain->dom_family; 4178 goto integer; 4179 4180 case SO_TYPE: 4181 optval = so->so_type; 4182 goto integer; 4183 4184 case SO_PROTOCOL: 4185 optval = so->so_proto->pr_protocol; 4186 goto integer; 4187 4188 case SO_ERROR: 4189 SOCK_LOCK(so); 4190 if (so->so_error) { 4191 optval = so->so_error; 
4192 so->so_error = 0; 4193 } else { 4194 optval = so->so_rerror; 4195 so->so_rerror = 0; 4196 } 4197 SOCK_UNLOCK(so); 4198 goto integer; 4199 4200 case SO_SNDBUF: 4201 SOCK_LOCK(so); 4202 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 4203 so->so_snd.sb_hiwat; 4204 SOCK_UNLOCK(so); 4205 goto integer; 4206 4207 case SO_RCVBUF: 4208 SOCK_LOCK(so); 4209 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 4210 so->so_rcv.sb_hiwat; 4211 SOCK_UNLOCK(so); 4212 goto integer; 4213 4214 case SO_SNDLOWAT: 4215 SOCK_LOCK(so); 4216 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 4217 so->so_snd.sb_lowat; 4218 SOCK_UNLOCK(so); 4219 goto integer; 4220 4221 case SO_RCVLOWAT: 4222 SOCK_LOCK(so); 4223 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 4224 so->so_rcv.sb_lowat; 4225 SOCK_UNLOCK(so); 4226 goto integer; 4227 4228 case SO_SNDTIMEO: 4229 case SO_RCVTIMEO: 4230 SOCK_LOCK(so); 4231 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 4232 (SOLISTENING(so) ? so->sol_sbsnd_timeo : 4233 so->so_snd.sb_timeo) : 4234 (SOLISTENING(so) ? so->sol_sbrcv_timeo : 4235 so->so_rcv.sb_timeo)); 4236 SOCK_UNLOCK(so); 4237 #ifdef COMPAT_FREEBSD32 4238 if (SV_CURPROC_FLAG(SV_ILP32)) { 4239 struct timeval32 tv32; 4240 4241 CP(tv, tv32, tv_sec); 4242 CP(tv, tv32, tv_usec); 4243 error = sooptcopyout(sopt, &tv32, sizeof tv32); 4244 } else 4245 #endif 4246 error = sooptcopyout(sopt, &tv, sizeof tv); 4247 break; 4248 4249 case SO_LABEL: 4250 #ifdef MAC 4251 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 4252 sizeof(extmac)); 4253 if (error) 4254 goto bad; 4255 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 4256 so, &extmac); 4257 if (error) 4258 goto bad; 4259 /* Don't copy out extmac, it is unchanged. 
*/ 4260 #else 4261 error = EOPNOTSUPP; 4262 #endif 4263 break; 4264 4265 case SO_PEERLABEL: 4266 #ifdef MAC 4267 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 4268 sizeof(extmac)); 4269 if (error) 4270 goto bad; 4271 error = mac_getsockopt_peerlabel( 4272 sopt->sopt_td->td_ucred, so, &extmac); 4273 if (error) 4274 goto bad; 4275 /* Don't copy out extmac, it is unchanged. */ 4276 #else 4277 error = EOPNOTSUPP; 4278 #endif 4279 break; 4280 4281 case SO_LISTENQLIMIT: 4282 SOCK_LOCK(so); 4283 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 4284 SOCK_UNLOCK(so); 4285 goto integer; 4286 4287 case SO_LISTENQLEN: 4288 SOCK_LOCK(so); 4289 optval = SOLISTENING(so) ? so->sol_qlen : 0; 4290 SOCK_UNLOCK(so); 4291 goto integer; 4292 4293 case SO_LISTENINCQLEN: 4294 SOCK_LOCK(so); 4295 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 4296 SOCK_UNLOCK(so); 4297 goto integer; 4298 4299 case SO_TS_CLOCK: 4300 optval = so->so_ts_clock; 4301 goto integer; 4302 4303 case SO_MAX_PACING_RATE: 4304 optval = so->so_max_pacing_rate; 4305 goto integer; 4306 4307 case SO_SPLICE: { 4308 off_t n; 4309 4310 /* 4311 * Acquire the I/O lock to serialize with 4312 * so_splice_xfer(). This is not required for 4313 * correctness, but makes testing simpler: once a byte 4314 * has been transmitted to the sink and observed (e.g., 4315 * by reading from the socket to which the sink is 4316 * connected), a subsequent getsockopt(SO_SPLICE) will 4317 * return an up-to-date value. 
4318 */ 4319 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT); 4320 if (error != 0) 4321 goto bad; 4322 SOCK_LOCK(so); 4323 if (SOLISTENING(so)) { 4324 n = 0; 4325 } else { 4326 n = so->so_splice_sent; 4327 } 4328 SOCK_UNLOCK(so); 4329 SOCK_IO_RECV_UNLOCK(so); 4330 error = sooptcopyout(sopt, &n, sizeof(n)); 4331 break; 4332 } 4333 4334 default: 4335 #ifdef SOCKET_HHOOK 4336 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 4337 error = hhook_run_socket(so, sopt, 4338 HHOOK_SOCKET_OPT); 4339 else 4340 #endif 4341 error = ENOPROTOOPT; 4342 break; 4343 } 4344 } 4345 bad: 4346 CURVNET_RESTORE(); 4347 return (error); 4348 } 4349 4350 int 4351 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 4352 { 4353 struct mbuf *m, *m_prev; 4354 int sopt_size = sopt->sopt_valsize; 4355 4356 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 4357 if (m == NULL) 4358 return ENOBUFS; 4359 if (sopt_size > MLEN) { 4360 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 4361 if ((m->m_flags & M_EXT) == 0) { 4362 m_free(m); 4363 return ENOBUFS; 4364 } 4365 m->m_len = min(MCLBYTES, sopt_size); 4366 } else { 4367 m->m_len = min(MLEN, sopt_size); 4368 } 4369 sopt_size -= m->m_len; 4370 *mp = m; 4371 m_prev = m; 4372 4373 while (sopt_size) { 4374 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 4375 if (m == NULL) { 4376 m_freem(*mp); 4377 return ENOBUFS; 4378 } 4379 if (sopt_size > MLEN) { 4380 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 4381 M_NOWAIT); 4382 if ((m->m_flags & M_EXT) == 0) { 4383 m_freem(m); 4384 m_freem(*mp); 4385 return ENOBUFS; 4386 } 4387 m->m_len = min(MCLBYTES, sopt_size); 4388 } else { 4389 m->m_len = min(MLEN, sopt_size); 4390 } 4391 sopt_size -= m->m_len; 4392 m_prev->m_next = m; 4393 m_prev = m; 4394 } 4395 return (0); 4396 } 4397 4398 int 4399 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 4400 { 4401 struct mbuf *m0 = m; 4402 4403 if (sopt->sopt_val == NULL) 4404 return (0); 4405 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 4406 if (sopt->sopt_td != NULL) { 4407 int error; 4408 4409 error = copyin(sopt->sopt_val, mtod(m, char *), 4410 m->m_len); 4411 if (error != 0) { 4412 m_freem(m0); 4413 return(error); 4414 } 4415 } else 4416 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 4417 sopt->sopt_valsize -= m->m_len; 4418 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 4419 m = m->m_next; 4420 } 4421 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 4422 panic("ip6_sooptmcopyin"); 4423 return (0); 4424 } 4425 4426 int 4427 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 4428 { 4429 struct mbuf *m0 = m; 4430 size_t valsize = 0; 4431 4432 if (sopt->sopt_val == NULL) 4433 return (0); 4434 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 4435 if (sopt->sopt_td != NULL) { 4436 int error; 4437 4438 error = copyout(mtod(m, char *), sopt->sopt_val, 4439 m->m_len); 4440 if (error != 0) { 4441 m_freem(m0); 4442 return(error); 4443 } 4444 } else 4445 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 4446 sopt->sopt_valsize -= m->m_len; 4447 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 4448 valsize += m->m_len; 4449 m = m->m_next; 4450 } 4451 if (m != NULL) { 4452 /* enough soopt buffer should be given from user-land */ 4453 m_freem(m0); 4454 return(EINVAL); 4455 } 4456 sopt->sopt_valsize = valsize; 4457 return (0); 4458 } 4459 4460 /* 4461 * sohasoutofband(): protocol notifies socket layer of the arrival of new 4462 * 
out-of-band data, which will then notify socket consumers. 4463 */ 4464 void 4465 sohasoutofband(struct socket *so) 4466 { 4467 4468 if (so->so_sigio != NULL) 4469 pgsigio(&so->so_sigio, SIGURG, 0); 4470 selwakeuppri(&so->so_rdsel, PSOCK); 4471 } 4472 4473 int 4474 sopoll_generic(struct socket *so, int events, struct thread *td) 4475 { 4476 int revents; 4477 4478 SOCK_LOCK(so); 4479 if (SOLISTENING(so)) { 4480 if (!(events & (POLLIN | POLLRDNORM))) 4481 revents = 0; 4482 else if (!TAILQ_EMPTY(&so->sol_comp)) 4483 revents = events & (POLLIN | POLLRDNORM); 4484 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 4485 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 4486 else { 4487 selrecord(td, &so->so_rdsel); 4488 revents = 0; 4489 } 4490 } else { 4491 revents = 0; 4492 SOCK_SENDBUF_LOCK(so); 4493 SOCK_RECVBUF_LOCK(so); 4494 if (events & (POLLIN | POLLRDNORM)) 4495 if (soreadabledata(so) && !isspliced(so)) 4496 revents |= events & (POLLIN | POLLRDNORM); 4497 if (events & (POLLOUT | POLLWRNORM)) 4498 if (sowriteable(so) && !issplicedback(so)) 4499 revents |= events & (POLLOUT | POLLWRNORM); 4500 if (events & (POLLPRI | POLLRDBAND)) 4501 if (so->so_oobmark || 4502 (so->so_rcv.sb_state & SBS_RCVATMARK)) 4503 revents |= events & (POLLPRI | POLLRDBAND); 4504 if ((events & POLLINIGNEOF) == 0) { 4505 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 4506 revents |= events & (POLLIN | POLLRDNORM); 4507 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 4508 revents |= POLLHUP; 4509 } 4510 } 4511 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4512 revents |= events & POLLRDHUP; 4513 if (revents == 0) { 4514 if (events & 4515 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { 4516 selrecord(td, &so->so_rdsel); 4517 so->so_rcv.sb_flags |= SB_SEL; 4518 } 4519 if (events & (POLLOUT | POLLWRNORM)) { 4520 selrecord(td, &so->so_wrsel); 4521 so->so_snd.sb_flags |= SB_SEL; 4522 } 4523 } 4524 SOCK_RECVBUF_UNLOCK(so); 4525 SOCK_SENDBUF_UNLOCK(so); 4526 } 4527 SOCK_UNLOCK(so); 4528 
return (revents); 4529 } 4530 4531 int 4532 sokqfilter_generic(struct socket *so, struct knote *kn) 4533 { 4534 struct sockbuf *sb; 4535 sb_which which; 4536 struct knlist *knl; 4537 4538 switch (kn->kn_filter) { 4539 case EVFILT_READ: 4540 kn->kn_fop = &soread_filtops; 4541 knl = &so->so_rdsel.si_note; 4542 sb = &so->so_rcv; 4543 which = SO_RCV; 4544 break; 4545 case EVFILT_WRITE: 4546 kn->kn_fop = &sowrite_filtops; 4547 knl = &so->so_wrsel.si_note; 4548 sb = &so->so_snd; 4549 which = SO_SND; 4550 break; 4551 case EVFILT_EMPTY: 4552 kn->kn_fop = &soempty_filtops; 4553 knl = &so->so_wrsel.si_note; 4554 sb = &so->so_snd; 4555 which = SO_SND; 4556 break; 4557 default: 4558 return (EINVAL); 4559 } 4560 4561 SOCK_LOCK(so); 4562 if (SOLISTENING(so)) { 4563 knlist_add(knl, kn, 1); 4564 } else { 4565 SOCK_BUF_LOCK(so, which); 4566 knlist_add(knl, kn, 1); 4567 sb->sb_flags |= SB_KNOTE; 4568 if ((kn->kn_sfflags & NOTE_LOWAT) && 4569 (sb->sb_flags & SB_AUTOLOWAT)) 4570 sb->sb_flags &= ~SB_AUTOLOWAT; 4571 SOCK_BUF_UNLOCK(so, which); 4572 } 4573 SOCK_UNLOCK(so); 4574 return (0); 4575 } 4576 4577 static void 4578 filt_sordetach(struct knote *kn) 4579 { 4580 struct socket *so = kn->kn_fp->f_data; 4581 4582 so_rdknl_lock(so); 4583 knlist_remove(&so->so_rdsel.si_note, kn, 1); 4584 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 4585 so->so_rcv.sb_flags &= ~SB_KNOTE; 4586 so_rdknl_unlock(so); 4587 } 4588 4589 /*ARGSUSED*/ 4590 static int 4591 filt_soread(struct knote *kn, long hint) 4592 { 4593 struct socket *so; 4594 4595 so = kn->kn_fp->f_data; 4596 4597 if (SOLISTENING(so)) { 4598 SOCK_LOCK_ASSERT(so); 4599 kn->kn_data = so->sol_qlen; 4600 if (so->so_error) { 4601 kn->kn_flags |= EV_EOF; 4602 kn->kn_fflags = so->so_error; 4603 return (1); 4604 } 4605 return (!TAILQ_EMPTY(&so->sol_comp)); 4606 } 4607 4608 if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) 4609 return (0); 4610 4611 SOCK_RECVBUF_LOCK_ASSERT(so); 4612 4613 kn->kn_data = sbavail(&so->so_rcv) - 
so->so_rcv.sb_ctl; 4614 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 4615 kn->kn_flags |= EV_EOF; 4616 kn->kn_fflags = so->so_error; 4617 return (1); 4618 } else if (so->so_error || so->so_rerror) 4619 return (1); 4620 4621 if (kn->kn_sfflags & NOTE_LOWAT) { 4622 if (kn->kn_data >= kn->kn_sdata) 4623 return (1); 4624 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 4625 return (1); 4626 4627 #ifdef SOCKET_HHOOK 4628 /* This hook returning non-zero indicates an event, not error */ 4629 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 4630 #else 4631 return (0); 4632 #endif 4633 } 4634 4635 static void 4636 filt_sowdetach(struct knote *kn) 4637 { 4638 struct socket *so = kn->kn_fp->f_data; 4639 4640 so_wrknl_lock(so); 4641 knlist_remove(&so->so_wrsel.si_note, kn, 1); 4642 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 4643 so->so_snd.sb_flags &= ~SB_KNOTE; 4644 so_wrknl_unlock(so); 4645 } 4646 4647 /*ARGSUSED*/ 4648 static int 4649 filt_sowrite(struct knote *kn, long hint) 4650 { 4651 struct socket *so; 4652 4653 so = kn->kn_fp->f_data; 4654 4655 if (SOLISTENING(so)) 4656 return (0); 4657 4658 SOCK_SENDBUF_LOCK_ASSERT(so); 4659 kn->kn_data = sbspace(&so->so_snd); 4660 4661 #ifdef SOCKET_HHOOK 4662 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 4663 #endif 4664 4665 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 4666 kn->kn_flags |= EV_EOF; 4667 kn->kn_fflags = so->so_error; 4668 return (1); 4669 } else if (so->so_error) /* temporary udp error */ 4670 return (1); 4671 else if (((so->so_state & SS_ISCONNECTED) == 0) && 4672 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 4673 return (0); 4674 else if (kn->kn_sfflags & NOTE_LOWAT) 4675 return (kn->kn_data >= kn->kn_sdata); 4676 else 4677 return (kn->kn_data >= so->so_snd.sb_lowat); 4678 } 4679 4680 static int 4681 filt_soempty(struct knote *kn, long hint) 4682 { 4683 struct socket *so; 4684 4685 so = kn->kn_fp->f_data; 4686 4687 if (SOLISTENING(so)) 4688 return (1); 4689 4690 
SOCK_SENDBUF_LOCK_ASSERT(so); 4691 kn->kn_data = sbused(&so->so_snd); 4692 4693 if (kn->kn_data == 0) 4694 return (1); 4695 else 4696 return (0); 4697 } 4698 4699 int 4700 socheckuid(struct socket *so, uid_t uid) 4701 { 4702 4703 if (so == NULL) 4704 return (EPERM); 4705 if (so->so_cred->cr_uid != uid) 4706 return (EPERM); 4707 return (0); 4708 } 4709 4710 /* 4711 * These functions are used by protocols to notify the socket layer (and its 4712 * consumers) of state changes in the sockets driven by protocol-side events. 4713 */ 4714 4715 /* 4716 * Procedures to manipulate state flags of socket and do appropriate wakeups. 4717 * 4718 * Normal sequence from the active (originating) side is that 4719 * soisconnecting() is called during processing of connect() call, resulting 4720 * in an eventual call to soisconnected() if/when the connection is 4721 * established. When the connection is torn down soisdisconnecting() is 4722 * called during processing of disconnect() call, and soisdisconnected() is 4723 * called when the connection to the peer is totally severed. The semantics 4724 * of these routines are such that connectionless protocols can call 4725 * soisconnected() and soisdisconnected() only, bypassing the in-progress 4726 * calls when setting up a ``connection'' takes no time. 4727 * 4728 * From the passive side, a socket is created with two queues of sockets: 4729 * so_incomp for connections in progress and so_comp for connections already 4730 * made and awaiting user acceptance. As a protocol is preparing incoming 4731 * connections, it creates a socket structure queued on so_incomp by calling 4732 * sonewconn(). When the connection is established, soisconnected() is 4733 * called, and transfers the socket structure to so_comp, making it available 4734 * to accept(). 4735 * 4736 * If a socket is closed with sockets on either so_incomp or so_comp, these 4737 * sockets are dropped. 
4738 * 4739 * If higher-level protocols are implemented in the kernel, the wakeups done 4740 * here will sometimes cause software-interrupt process scheduling. 4741 */ 4742 void 4743 soisconnecting(struct socket *so) 4744 { 4745 4746 SOCK_LOCK(so); 4747 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 4748 so->so_state |= SS_ISCONNECTING; 4749 SOCK_UNLOCK(so); 4750 } 4751 4752 void 4753 soisconnected(struct socket *so) 4754 { 4755 bool last __diagused; 4756 4757 SOCK_LOCK(so); 4758 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); 4759 so->so_state |= SS_ISCONNECTED; 4760 4761 if (so->so_qstate == SQ_INCOMP) { 4762 struct socket *head = so->so_listen; 4763 int ret; 4764 4765 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 4766 /* 4767 * Promoting a socket from incomplete queue to complete, we 4768 * need to go through reverse order of locking. We first do 4769 * trylock, and if that doesn't succeed, we go the hard way 4770 * leaving a reference and rechecking consistency after proper 4771 * locking. 4772 */ 4773 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 4774 soref(head); 4775 SOCK_UNLOCK(so); 4776 SOLISTEN_LOCK(head); 4777 SOCK_LOCK(so); 4778 if (__predict_false(head != so->so_listen)) { 4779 /* 4780 * The socket went off the listen queue, 4781 * should be lost race to close(2) of sol. 4782 * The socket is about to soabort(). 
4783 */ 4784 SOCK_UNLOCK(so); 4785 sorele_locked(head); 4786 return; 4787 } 4788 last = refcount_release(&head->so_count); 4789 KASSERT(!last, ("%s: released last reference for %p", 4790 __func__, head)); 4791 } 4792 again: 4793 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 4794 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 4795 head->sol_incqlen--; 4796 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 4797 head->sol_qlen++; 4798 so->so_qstate = SQ_COMP; 4799 SOCK_UNLOCK(so); 4800 solisten_wakeup(head); /* unlocks */ 4801 } else { 4802 SOCK_RECVBUF_LOCK(so); 4803 soupcall_set(so, SO_RCV, 4804 head->sol_accept_filter->accf_callback, 4805 head->sol_accept_filter_arg); 4806 so->so_options &= ~SO_ACCEPTFILTER; 4807 ret = head->sol_accept_filter->accf_callback(so, 4808 head->sol_accept_filter_arg, M_NOWAIT); 4809 if (ret == SU_ISCONNECTED) { 4810 soupcall_clear(so, SO_RCV); 4811 SOCK_RECVBUF_UNLOCK(so); 4812 goto again; 4813 } 4814 SOCK_RECVBUF_UNLOCK(so); 4815 SOCK_UNLOCK(so); 4816 SOLISTEN_UNLOCK(head); 4817 } 4818 return; 4819 } 4820 SOCK_UNLOCK(so); 4821 wakeup(&so->so_timeo); 4822 sorwakeup(so); 4823 sowwakeup(so); 4824 } 4825 4826 void 4827 soisdisconnecting(struct socket *so) 4828 { 4829 4830 SOCK_LOCK(so); 4831 so->so_state &= ~SS_ISCONNECTING; 4832 so->so_state |= SS_ISDISCONNECTING; 4833 4834 if (!SOLISTENING(so)) { 4835 SOCK_RECVBUF_LOCK(so); 4836 socantrcvmore_locked(so); 4837 SOCK_SENDBUF_LOCK(so); 4838 socantsendmore_locked(so); 4839 } 4840 SOCK_UNLOCK(so); 4841 wakeup(&so->so_timeo); 4842 } 4843 4844 void 4845 soisdisconnected(struct socket *so) 4846 { 4847 4848 SOCK_LOCK(so); 4849 4850 /* 4851 * There is at least one reader of so_state that does not 4852 * acquire socket lock, namely soreceive_generic(). Ensure 4853 * that it never sees all flags that track connection status 4854 * cleared, by ordering the update with a barrier semantic of 4855 * our release thread fence. 
4856 */ 4857 so->so_state |= SS_ISDISCONNECTED; 4858 atomic_thread_fence_rel(); 4859 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 4860 4861 if (!SOLISTENING(so)) { 4862 SOCK_UNLOCK(so); 4863 SOCK_RECVBUF_LOCK(so); 4864 socantrcvmore_locked(so); 4865 SOCK_SENDBUF_LOCK(so); 4866 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 4867 socantsendmore_locked(so); 4868 } else 4869 SOCK_UNLOCK(so); 4870 wakeup(&so->so_timeo); 4871 } 4872 4873 int 4874 soiolock(struct socket *so, struct sx *sx, int flags) 4875 { 4876 int error; 4877 4878 KASSERT((flags & SBL_VALID) == flags, 4879 ("soiolock: invalid flags %#x", flags)); 4880 4881 if ((flags & SBL_WAIT) != 0) { 4882 if ((flags & SBL_NOINTR) != 0) { 4883 sx_xlock(sx); 4884 } else { 4885 error = sx_xlock_sig(sx); 4886 if (error != 0) 4887 return (error); 4888 } 4889 } else if (!sx_try_xlock(sx)) { 4890 return (EWOULDBLOCK); 4891 } 4892 4893 if (__predict_false(SOLISTENING(so))) { 4894 sx_xunlock(sx); 4895 return (ENOTCONN); 4896 } 4897 return (0); 4898 } 4899 4900 void 4901 soiounlock(struct sx *sx) 4902 { 4903 sx_xunlock(sx); 4904 } 4905 4906 /* 4907 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 4908 */ 4909 struct sockaddr * 4910 sodupsockaddr(const struct sockaddr *sa, int mflags) 4911 { 4912 struct sockaddr *sa2; 4913 4914 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 4915 if (sa2) 4916 bcopy(sa, sa2, sa->sa_len); 4917 return sa2; 4918 } 4919 4920 /* 4921 * Register per-socket destructor. 4922 */ 4923 void 4924 sodtor_set(struct socket *so, so_dtor_t *func) 4925 { 4926 4927 SOCK_LOCK_ASSERT(so); 4928 so->so_dtor = func; 4929 } 4930 4931 /* 4932 * Register per-socket buffer upcalls. 
4933 */ 4934 void 4935 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) 4936 { 4937 struct sockbuf *sb; 4938 4939 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4940 4941 switch (which) { 4942 case SO_RCV: 4943 sb = &so->so_rcv; 4944 break; 4945 case SO_SND: 4946 sb = &so->so_snd; 4947 break; 4948 } 4949 SOCK_BUF_LOCK_ASSERT(so, which); 4950 sb->sb_upcall = func; 4951 sb->sb_upcallarg = arg; 4952 sb->sb_flags |= SB_UPCALL; 4953 } 4954 4955 void 4956 soupcall_clear(struct socket *so, sb_which which) 4957 { 4958 struct sockbuf *sb; 4959 4960 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4961 4962 switch (which) { 4963 case SO_RCV: 4964 sb = &so->so_rcv; 4965 break; 4966 case SO_SND: 4967 sb = &so->so_snd; 4968 break; 4969 } 4970 SOCK_BUF_LOCK_ASSERT(so, which); 4971 KASSERT(sb->sb_upcall != NULL, 4972 ("%s: so %p no upcall to clear", __func__, so)); 4973 sb->sb_upcall = NULL; 4974 sb->sb_upcallarg = NULL; 4975 sb->sb_flags &= ~SB_UPCALL; 4976 } 4977 4978 void 4979 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) 4980 { 4981 4982 SOLISTEN_LOCK_ASSERT(so); 4983 so->sol_upcall = func; 4984 so->sol_upcallarg = arg; 4985 } 4986 4987 static void 4988 so_rdknl_lock(void *arg) 4989 { 4990 struct socket *so = arg; 4991 4992 retry: 4993 if (SOLISTENING(so)) { 4994 SOLISTEN_LOCK(so); 4995 } else { 4996 SOCK_RECVBUF_LOCK(so); 4997 if (__predict_false(SOLISTENING(so))) { 4998 SOCK_RECVBUF_UNLOCK(so); 4999 goto retry; 5000 } 5001 } 5002 } 5003 5004 static void 5005 so_rdknl_unlock(void *arg) 5006 { 5007 struct socket *so = arg; 5008 5009 if (SOLISTENING(so)) 5010 SOLISTEN_UNLOCK(so); 5011 else 5012 SOCK_RECVBUF_UNLOCK(so); 5013 } 5014 5015 static void 5016 so_rdknl_assert_lock(void *arg, int what) 5017 { 5018 struct socket *so = arg; 5019 5020 if (what == LA_LOCKED) { 5021 if (SOLISTENING(so)) 5022 SOLISTEN_LOCK_ASSERT(so); 5023 else 5024 SOCK_RECVBUF_LOCK_ASSERT(so); 5025 } else { 5026 if 
(SOLISTENING(so)) 5027 SOLISTEN_UNLOCK_ASSERT(so); 5028 else 5029 SOCK_RECVBUF_UNLOCK_ASSERT(so); 5030 } 5031 } 5032 5033 static void 5034 so_wrknl_lock(void *arg) 5035 { 5036 struct socket *so = arg; 5037 5038 retry: 5039 if (SOLISTENING(so)) { 5040 SOLISTEN_LOCK(so); 5041 } else { 5042 SOCK_SENDBUF_LOCK(so); 5043 if (__predict_false(SOLISTENING(so))) { 5044 SOCK_SENDBUF_UNLOCK(so); 5045 goto retry; 5046 } 5047 } 5048 } 5049 5050 static void 5051 so_wrknl_unlock(void *arg) 5052 { 5053 struct socket *so = arg; 5054 5055 if (SOLISTENING(so)) 5056 SOLISTEN_UNLOCK(so); 5057 else 5058 SOCK_SENDBUF_UNLOCK(so); 5059 } 5060 5061 static void 5062 so_wrknl_assert_lock(void *arg, int what) 5063 { 5064 struct socket *so = arg; 5065 5066 if (what == LA_LOCKED) { 5067 if (SOLISTENING(so)) 5068 SOLISTEN_LOCK_ASSERT(so); 5069 else 5070 SOCK_SENDBUF_LOCK_ASSERT(so); 5071 } else { 5072 if (SOLISTENING(so)) 5073 SOLISTEN_UNLOCK_ASSERT(so); 5074 else 5075 SOCK_SENDBUF_UNLOCK_ASSERT(so); 5076 } 5077 } 5078 5079 /* 5080 * Create an external-format (``xsocket'') structure using the information in 5081 * the kernel-format socket structure pointed to by so. This is done to 5082 * reduce the spew of irrelevant information over this interface, to isolate 5083 * user code from changes in the kernel structure, and potentially to provide 5084 * information-hiding if we decide that some of this information should be 5085 * hidden from users. 
5086 */ 5087 void 5088 sotoxsocket(struct socket *so, struct xsocket *xso) 5089 { 5090 5091 bzero(xso, sizeof(*xso)); 5092 xso->xso_len = sizeof *xso; 5093 xso->xso_so = (uintptr_t)so; 5094 xso->so_type = so->so_type; 5095 xso->so_options = so->so_options; 5096 xso->so_linger = so->so_linger; 5097 xso->so_state = so->so_state; 5098 xso->so_pcb = (uintptr_t)so->so_pcb; 5099 xso->xso_protocol = so->so_proto->pr_protocol; 5100 xso->xso_family = so->so_proto->pr_domain->dom_family; 5101 xso->so_timeo = so->so_timeo; 5102 xso->so_error = so->so_error; 5103 xso->so_uid = so->so_cred->cr_uid; 5104 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 5105 SOCK_LOCK(so); 5106 xso->so_fibnum = so->so_fibnum; 5107 if (SOLISTENING(so)) { 5108 xso->so_qlen = so->sol_qlen; 5109 xso->so_incqlen = so->sol_incqlen; 5110 xso->so_qlimit = so->sol_qlimit; 5111 xso->so_oobmark = 0; 5112 } else { 5113 xso->so_state |= so->so_qstate; 5114 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; 5115 xso->so_oobmark = so->so_oobmark; 5116 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 5117 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 5118 if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) 5119 xso->so_splice_so = (uintptr_t)so->so_splice->dst; 5120 } 5121 SOCK_UNLOCK(so); 5122 } 5123 5124 int 5125 so_options_get(const struct socket *so) 5126 { 5127 5128 return (so->so_options); 5129 } 5130 5131 void 5132 so_options_set(struct socket *so, int val) 5133 { 5134 5135 so->so_options = val; 5136 } 5137 5138 int 5139 so_error_get(const struct socket *so) 5140 { 5141 5142 return (so->so_error); 5143 } 5144 5145 void 5146 so_error_set(struct socket *so, int val) 5147 { 5148 5149 so->so_error = val; 5150 } 5151