1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 2004 The FreeBSD Foundation 7 * Copyright (c) 2004-2008 Robert N. M. Watson 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 35 */ 36 37 /* 38 * Comments on the socket life cycle: 39 * 40 * soalloc() sets up socket layer state for a socket, called only by 41 * socreate() and sonewconn(). Socket layer private. 42 * 43 * sodealloc() tears down socket layer state for a socket, called only by 44 * sofree() and sonewconn(). Socket layer private. 45 * 46 * pru_attach() associates protocol layer state with an allocated socket; 47 * called only once, may fail, aborting socket allocation. This is called 48 * from socreate() and sonewconn(). Socket layer private. 49 * 50 * pru_detach() disassociates protocol layer state from an attached socket, 51 * and will be called exactly once for sockets in which pru_attach() has 52 * been successfully called. If pru_attach() returned an error, 53 * pru_detach() will not be called. Socket layer private. 54 * 55 * pru_abort() and pru_close() notify the protocol layer that the last 56 * consumer of a socket is starting to tear down the socket, and that the 57 * protocol should terminate the connection. Historically, pru_abort() also 58 * detached protocol state from the socket state, but this is no longer the 59 * case. 60 * 61 * socreate() creates a socket and attaches protocol state. This is a public 62 * interface that may be used by socket layer consumers to create new 63 * sockets. 64 * 65 * sonewconn() creates a socket and attaches protocol state. This is a 66 * public interface that may be used by protocols to create new sockets when 67 * a new connection is received and will be available for accept() on a 68 * listen socket. 69 * 70 * soclose() destroys a socket after possibly waiting for it to disconnect. 71 * This is a public interface that socket consumers should use to close and 72 * release a socket when done with it. 73 * 74 * soabort() destroys a socket without waiting for it to disconnect (used 75 * only for incoming connections that are already partially or fully 76 * connected). 
This is used internally by the socket layer when clearing 77 * listen socket queues (due to overflow or close on the listen socket), but 78 * is also a public interface protocols may use to abort connections in 79 * their incomplete listen queues should they no longer be required. Sockets 80 * placed in completed connection listen queues should not be aborted for 81 * reasons described in the comment above the soclose() implementation. This 82 * is not a general purpose close routine, and except in the specific 83 * circumstances described here, should not be used. 84 * 85 * sofree() will free a socket and its protocol state if all references on 86 * the socket have been released, and is the public interface to attempt to 87 * free a socket when a reference is removed. This is a socket layer private 88 * interface. 89 * 90 * NOTE: In addition to socreate() and soclose(), which provide a single 91 * socket reference to the consumer to be managed as required, there are two 92 * calls to explicitly manage socket references, soref(), and sorele(). 93 * Currently, these are generally required only when transitioning a socket 94 * from a listen queue to a file descriptor, in order to prevent garbage 95 * collection of the socket at an untimely moment. For a number of reasons, 96 * these interfaces are not preferred, and should be avoided. 97 * 98 * NOTE: With regard to VNETs the general rule is that callers do not set 99 * curvnet. Exceptions to this rule include soabort(), sodisconnect(), 100 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() 101 * and sorflush(), which are usually called from a pre-set VNET context. 102 * sopoll() currently does not need a VNET context to be set. 
103 */ 104 105 #include <sys/cdefs.h> 106 __FBSDID("$FreeBSD$"); 107 108 #include "opt_inet.h" 109 #include "opt_inet6.h" 110 #include "opt_kern_tls.h" 111 #include "opt_sctp.h" 112 113 #include <sys/param.h> 114 #include <sys/systm.h> 115 #include <sys/capsicum.h> 116 #include <sys/fcntl.h> 117 #include <sys/limits.h> 118 #include <sys/lock.h> 119 #include <sys/mac.h> 120 #include <sys/malloc.h> 121 #include <sys/mbuf.h> 122 #include <sys/mutex.h> 123 #include <sys/domain.h> 124 #include <sys/file.h> /* for struct knote */ 125 #include <sys/hhook.h> 126 #include <sys/kernel.h> 127 #include <sys/khelp.h> 128 #include <sys/ktls.h> 129 #include <sys/event.h> 130 #include <sys/eventhandler.h> 131 #include <sys/poll.h> 132 #include <sys/proc.h> 133 #include <sys/protosw.h> 134 #include <sys/sbuf.h> 135 #include <sys/socket.h> 136 #include <sys/socketvar.h> 137 #include <sys/resourcevar.h> 138 #include <net/route.h> 139 #include <sys/signalvar.h> 140 #include <sys/stat.h> 141 #include <sys/sx.h> 142 #include <sys/sysctl.h> 143 #include <sys/taskqueue.h> 144 #include <sys/uio.h> 145 #include <sys/un.h> 146 #include <sys/unpcb.h> 147 #include <sys/jail.h> 148 #include <sys/syslog.h> 149 #include <netinet/in.h> 150 #include <netinet/in_pcb.h> 151 #include <netinet/tcp.h> 152 153 #include <net/vnet.h> 154 155 #include <security/mac/mac_framework.h> 156 157 #include <vm/uma.h> 158 159 #ifdef COMPAT_FREEBSD32 160 #include <sys/mount.h> 161 #include <sys/sysent.h> 162 #include <compat/freebsd32/freebsd32.h> 163 #endif 164 165 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 166 int flags); 167 static void so_rdknl_lock(void *); 168 static void so_rdknl_unlock(void *); 169 static void so_rdknl_assert_lock(void *, int); 170 static void so_wrknl_lock(void *); 171 static void so_wrknl_unlock(void *); 172 static void so_wrknl_assert_lock(void *, int); 173 174 static void filt_sordetach(struct knote *kn); 175 static int filt_soread(struct knote *kn, long hint); 176 
static void filt_sowdetach(struct knote *kn); 177 static int filt_sowrite(struct knote *kn, long hint); 178 static int filt_soempty(struct knote *kn, long hint); 179 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); 180 fo_kqfilter_t soo_kqfilter; 181 182 static struct filterops soread_filtops = { 183 .f_isfd = 1, 184 .f_detach = filt_sordetach, 185 .f_event = filt_soread, 186 }; 187 static struct filterops sowrite_filtops = { 188 .f_isfd = 1, 189 .f_detach = filt_sowdetach, 190 .f_event = filt_sowrite, 191 }; 192 static struct filterops soempty_filtops = { 193 .f_isfd = 1, 194 .f_detach = filt_sowdetach, 195 .f_event = filt_soempty, 196 }; 197 198 so_gen_t so_gencnt; /* generation count for sockets */ 199 200 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 201 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 202 203 #define VNET_SO_ASSERT(so) \ 204 VNET_ASSERT(curvnet != NULL, \ 205 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 206 207 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 208 #define V_socket_hhh VNET(socket_hhh) 209 210 /* 211 * Limit on the number of connections in the listen queue waiting 212 * for accept(2). 213 * NB: The original sysctl somaxconn is still available but hidden 214 * to prevent confusion about the actual purpose of this number. 215 */ 216 static u_int somaxconn = SOMAXCONN; 217 218 static int 219 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 220 { 221 int error; 222 int val; 223 224 val = somaxconn; 225 error = sysctl_handle_int(oidp, &val, 0, req); 226 if (error || !req->newptr ) 227 return (error); 228 229 /* 230 * The purpose of the UINT_MAX / 3 limit, is so that the formula 231 * 3 * so_qlimit / 2 232 * below, will not overflow. 
233 */ 234 235 if (val < 1 || val > UINT_MAX / 3) 236 return (EINVAL); 237 238 somaxconn = val; 239 return (0); 240 } 241 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, 242 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), 243 sysctl_somaxconn, "I", 244 "Maximum listen socket pending connection accept queue size"); 245 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 246 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, 247 sizeof(int), sysctl_somaxconn, "I", 248 "Maximum listen socket pending connection accept queue size (compat)"); 249 250 static int numopensockets; 251 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 252 &numopensockets, 0, "Number of open sockets"); 253 254 /* 255 * accept_mtx locks down per-socket fields relating to accept queues. See 256 * socketvar.h for an annotation of the protected fields of struct socket. 257 */ 258 struct mtx accept_mtx; 259 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 260 261 /* 262 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 263 * so_gencnt field. 264 */ 265 static struct mtx so_global_mtx; 266 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 267 268 /* 269 * General IPC sysctl name space, used by sockets and a variety of other IPC 270 * types. 271 */ 272 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 273 "IPC"); 274 275 /* 276 * Initialize the socket subsystem and set up the socket 277 * memory allocator. 
278 */ 279 static uma_zone_t socket_zone; 280 int maxsockets; 281 282 static void 283 socket_zone_change(void *tag) 284 { 285 286 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 287 } 288 289 static void 290 socket_hhook_register(int subtype) 291 { 292 293 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 294 &V_socket_hhh[subtype], 295 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 296 printf("%s: WARNING: unable to register hook\n", __func__); 297 } 298 299 static void 300 socket_hhook_deregister(int subtype) 301 { 302 303 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 304 printf("%s: WARNING: unable to deregister hook\n", __func__); 305 } 306 307 static void 308 socket_init(void *tag) 309 { 310 311 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 312 NULL, NULL, UMA_ALIGN_PTR, 0); 313 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 314 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 315 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 316 EVENTHANDLER_PRI_FIRST); 317 } 318 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 319 320 static void 321 socket_vnet_init(const void *unused __unused) 322 { 323 int i; 324 325 /* We expect a contiguous range */ 326 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 327 socket_hhook_register(i); 328 } 329 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 330 socket_vnet_init, NULL); 331 332 static void 333 socket_vnet_uninit(const void *unused __unused) 334 { 335 int i; 336 337 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 338 socket_hhook_deregister(i); 339 } 340 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 341 socket_vnet_uninit, NULL); 342 343 /* 344 * Initialise maxsockets. This SYSINIT must be run after 345 * tunable_mbinit(). 
346 */ 347 static void 348 init_maxsockets(void *ignored) 349 { 350 351 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 352 maxsockets = imax(maxsockets, maxfiles); 353 } 354 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 355 356 /* 357 * Sysctl to get and set the maximum global sockets limit. Notify protocols 358 * of the change so that they can update their dependent limits as required. 359 */ 360 static int 361 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 362 { 363 int error, newmaxsockets; 364 365 newmaxsockets = maxsockets; 366 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 367 if (error == 0 && req->newptr && newmaxsockets != maxsockets) { 368 if (newmaxsockets > maxsockets && 369 newmaxsockets <= maxfiles) { 370 maxsockets = newmaxsockets; 371 EVENTHANDLER_INVOKE(maxsockets_change); 372 } else 373 error = EINVAL; 374 } 375 return (error); 376 } 377 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, 378 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0, 379 sysctl_maxsockets, "IU", 380 "Maximum number of sockets available"); 381 382 /* 383 * Socket operation routines. These routines are called by the routines in 384 * sys_socket.c or from a system process, and implement the semantics of 385 * socket operations by switching out to the protocol specific routines. 386 */ 387 388 /* 389 * Get a socket structure from our zone, and initialize it. Note that it 390 * would probably be better to allocate socket and PCB at the same time, but 391 * I'm not convinced that all the protocols can be easily modified to do 392 * this. 393 * 394 * soalloc() returns a socket with a ref count of 0. 
395 */ 396 static struct socket * 397 soalloc(struct vnet *vnet) 398 { 399 struct socket *so; 400 401 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 402 if (so == NULL) 403 return (NULL); 404 #ifdef MAC 405 if (mac_socket_init(so, M_NOWAIT) != 0) { 406 uma_zfree(socket_zone, so); 407 return (NULL); 408 } 409 #endif 410 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { 411 uma_zfree(socket_zone, so); 412 return (NULL); 413 } 414 415 /* 416 * The socket locking protocol allows to lock 2 sockets at a time, 417 * however, the first one must be a listening socket. WITNESS lacks 418 * a feature to change class of an existing lock, so we use DUPOK. 419 */ 420 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); 421 mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); 422 mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); 423 so->so_rcv.sb_sel = &so->so_rdsel; 424 so->so_snd.sb_sel = &so->so_wrsel; 425 sx_init(&so->so_snd_sx, "so_snd_sx"); 426 sx_init(&so->so_rcv_sx, "so_rcv_sx"); 427 TAILQ_INIT(&so->so_snd.sb_aiojobq); 428 TAILQ_INIT(&so->so_rcv.sb_aiojobq); 429 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); 430 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); 431 #ifdef VIMAGE 432 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 433 __func__, __LINE__, so)); 434 so->so_vnet = vnet; 435 #endif 436 /* We shouldn't need the so_global_mtx */ 437 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { 438 /* Do we need more comprehensive error returns? */ 439 uma_zfree(socket_zone, so); 440 return (NULL); 441 } 442 mtx_lock(&so_global_mtx); 443 so->so_gencnt = ++so_gencnt; 444 ++numopensockets; 445 #ifdef VIMAGE 446 vnet->vnet_sockcnt++; 447 #endif 448 mtx_unlock(&so_global_mtx); 449 450 return (so); 451 } 452 453 /* 454 * Free the storage associated with a socket at the socket layer, tear down 455 * locks, labels, etc. All protocol state is assumed already to have been 456 * torn down (and possibly never set up) by the caller. 
457 */ 458 void 459 sodealloc(struct socket *so) 460 { 461 462 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 463 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 464 465 mtx_lock(&so_global_mtx); 466 so->so_gencnt = ++so_gencnt; 467 --numopensockets; /* Could be below, but faster here. */ 468 #ifdef VIMAGE 469 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 470 __func__, __LINE__, so)); 471 so->so_vnet->vnet_sockcnt--; 472 #endif 473 mtx_unlock(&so_global_mtx); 474 #ifdef MAC 475 mac_socket_destroy(so); 476 #endif 477 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); 478 479 khelp_destroy_osd(&so->osd); 480 if (SOLISTENING(so)) { 481 if (so->sol_accept_filter != NULL) 482 accept_filt_setopt(so, NULL); 483 } else { 484 if (so->so_rcv.sb_hiwat) 485 (void)chgsbsize(so->so_cred->cr_uidinfo, 486 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 487 if (so->so_snd.sb_hiwat) 488 (void)chgsbsize(so->so_cred->cr_uidinfo, 489 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 490 sx_destroy(&so->so_snd_sx); 491 sx_destroy(&so->so_rcv_sx); 492 mtx_destroy(&so->so_snd_mtx); 493 mtx_destroy(&so->so_rcv_mtx); 494 } 495 crfree(so->so_cred); 496 mtx_destroy(&so->so_lock); 497 uma_zfree(socket_zone, so); 498 } 499 500 /* 501 * socreate returns a socket with a ref count of 1 and a file descriptor 502 * reference. The socket should be closed with soclose(). 503 */ 504 int 505 socreate(int dom, struct socket **aso, int type, int proto, 506 struct ucred *cred, struct thread *td) 507 { 508 struct protosw *prp; 509 struct socket *so; 510 int error; 511 512 /* 513 * XXX: divert(4) historically abused PF_INET. Keep this compatibility 514 * shim until all applications have been updated. 
515 */ 516 if (__predict_false(dom == PF_INET && type == SOCK_RAW && 517 proto == IPPROTO_DIVERT)) { 518 dom = PF_DIVERT; 519 printf("%s uses obsolete way to create divert(4) socket\n", 520 td->td_proc->p_comm); 521 } 522 523 prp = pffindproto(dom, type, proto); 524 if (prp == NULL) { 525 /* No support for domain. */ 526 if (pffinddomain(dom) == NULL) 527 return (EAFNOSUPPORT); 528 /* No support for socket type. */ 529 if (proto == 0 && type != 0) 530 return (EPROTOTYPE); 531 return (EPROTONOSUPPORT); 532 } 533 534 MPASS(prp->pr_attach); 535 536 if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0) 537 return (ECAPMODE); 538 539 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) 540 return (EPROTONOSUPPORT); 541 542 so = soalloc(CRED_TO_VNET(cred)); 543 if (so == NULL) 544 return (ENOBUFS); 545 546 so->so_type = type; 547 so->so_cred = crhold(cred); 548 if ((prp->pr_domain->dom_family == PF_INET) || 549 (prp->pr_domain->dom_family == PF_INET6) || 550 (prp->pr_domain->dom_family == PF_ROUTE)) 551 so->so_fibnum = td->td_proc->p_fibnum; 552 else 553 so->so_fibnum = 0; 554 so->so_proto = prp; 555 #ifdef MAC 556 mac_socket_create(cred, so); 557 #endif 558 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 559 so_rdknl_assert_lock); 560 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 561 so_wrknl_assert_lock); 562 if ((prp->pr_flags & PR_SOCKBUF) == 0) { 563 so->so_snd.sb_mtx = &so->so_snd_mtx; 564 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 565 } 566 /* 567 * Auto-sizing of socket buffers is managed by the protocols and 568 * the appropriate flags must be set in the pru_attach function. 
569 */ 570 CURVNET_SET(so->so_vnet); 571 error = prp->pr_attach(so, proto, td); 572 CURVNET_RESTORE(); 573 if (error) { 574 sodealloc(so); 575 return (error); 576 } 577 soref(so); 578 *aso = so; 579 return (0); 580 } 581 582 #ifdef REGRESSION 583 static int regression_sonewconn_earlytest = 1; 584 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 585 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 586 #endif 587 588 static struct timeval overinterval = { 60, 0 }; 589 SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, 590 &overinterval, 591 "Delay in seconds between warnings for listen socket overflows"); 592 593 /* 594 * When an attempt at a new connection is noted on a socket which supports 595 * accept(2), the protocol has two options: 596 * 1) Call legacy sonewconn() function, which would call protocol attach 597 * method, same as used for socket(2). 598 * 2) Call solisten_clone(), do attach that is specific to a cloned connection, 599 * and then call solisten_enqueue(). 600 * 601 * Note: the ref count on the socket is 0 on return. 602 */ 603 struct socket * 604 solisten_clone(struct socket *head) 605 { 606 struct sbuf descrsb; 607 struct socket *so; 608 int len, overcount; 609 u_int qlen; 610 const char localprefix[] = "local:"; 611 char descrbuf[SUNPATHLEN + sizeof(localprefix)]; 612 #if defined(INET6) 613 char addrbuf[INET6_ADDRSTRLEN]; 614 #elif defined(INET) 615 char addrbuf[INET_ADDRSTRLEN]; 616 #endif 617 bool dolog, over; 618 619 SOLISTEN_LOCK(head); 620 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 621 #ifdef REGRESSION 622 if (regression_sonewconn_earlytest && over) { 623 #else 624 if (over) { 625 #endif 626 head->sol_overcount++; 627 dolog = !!ratecheck(&head->sol_lastover, &overinterval); 628 629 /* 630 * If we're going to log, copy the overflow count and queue 631 * length from the listen socket before dropping the lock. 632 * Also, reset the overflow count. 
633 */ 634 if (dolog) { 635 overcount = head->sol_overcount; 636 head->sol_overcount = 0; 637 qlen = head->sol_qlen; 638 } 639 SOLISTEN_UNLOCK(head); 640 641 if (dolog) { 642 /* 643 * Try to print something descriptive about the 644 * socket for the error message. 645 */ 646 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), 647 SBUF_FIXEDLEN); 648 switch (head->so_proto->pr_domain->dom_family) { 649 #if defined(INET) || defined(INET6) 650 #ifdef INET 651 case AF_INET: 652 #endif 653 #ifdef INET6 654 case AF_INET6: 655 if (head->so_proto->pr_domain->dom_family == 656 AF_INET6 || 657 (sotoinpcb(head)->inp_inc.inc_flags & 658 INC_ISIPV6)) { 659 ip6_sprintf(addrbuf, 660 &sotoinpcb(head)->inp_inc.inc6_laddr); 661 sbuf_printf(&descrsb, "[%s]", addrbuf); 662 } else 663 #endif 664 { 665 #ifdef INET 666 inet_ntoa_r( 667 sotoinpcb(head)->inp_inc.inc_laddr, 668 addrbuf); 669 sbuf_cat(&descrsb, addrbuf); 670 #endif 671 } 672 sbuf_printf(&descrsb, ":%hu (proto %u)", 673 ntohs(sotoinpcb(head)->inp_inc.inc_lport), 674 head->so_proto->pr_protocol); 675 break; 676 #endif /* INET || INET6 */ 677 case AF_UNIX: 678 sbuf_cat(&descrsb, localprefix); 679 if (sotounpcb(head)->unp_addr != NULL) 680 len = 681 sotounpcb(head)->unp_addr->sun_len - 682 offsetof(struct sockaddr_un, 683 sun_path); 684 else 685 len = 0; 686 if (len > 0) 687 sbuf_bcat(&descrsb, 688 sotounpcb(head)->unp_addr->sun_path, 689 len); 690 else 691 sbuf_cat(&descrsb, "(unknown)"); 692 break; 693 } 694 695 /* 696 * If we can't print something more specific, at least 697 * print the domain name. 698 */ 699 if (sbuf_finish(&descrsb) != 0 || 700 sbuf_len(&descrsb) <= 0) { 701 sbuf_clear(&descrsb); 702 sbuf_cat(&descrsb, 703 head->so_proto->pr_domain->dom_name ?: 704 "unknown"); 705 sbuf_finish(&descrsb); 706 } 707 KASSERT(sbuf_len(&descrsb) > 0, 708 ("%s: sbuf creation failed", __func__)); 709 /* 710 * Preserve the historic listen queue overflow log 711 * message, that starts with "sonewconn:". 
It has 712 * been known to sysadmins for years and also test 713 * sys/kern/sonewconn_overflow checks for it. 714 */ 715 if (head->so_cred == 0) { 716 log(LOG_DEBUG, "sonewconn: pcb %p (%s): " 717 "Listen queue overflow: %i already in " 718 "queue awaiting acceptance (%d " 719 "occurrences)\n", head->so_pcb, 720 sbuf_data(&descrsb), 721 qlen, overcount); 722 } else { 723 log(LOG_DEBUG, "sonewconn: pcb %p (%s): " 724 "Listen queue overflow: " 725 "%i already in queue awaiting acceptance " 726 "(%d occurrences), euid %d, rgid %d, jail %s\n", 727 head->so_pcb, sbuf_data(&descrsb), qlen, 728 overcount, head->so_cred->cr_uid, 729 head->so_cred->cr_rgid, 730 head->so_cred->cr_prison ? 731 head->so_cred->cr_prison->pr_name : 732 "not_jailed"); 733 } 734 sbuf_delete(&descrsb); 735 736 overcount = 0; 737 } 738 739 return (NULL); 740 } 741 SOLISTEN_UNLOCK(head); 742 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 743 __func__, head)); 744 so = soalloc(head->so_vnet); 745 if (so == NULL) { 746 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 747 "limit reached or out of memory\n", 748 __func__, head->so_pcb); 749 return (NULL); 750 } 751 so->so_listen = head; 752 so->so_type = head->so_type; 753 so->so_options = head->so_options & ~SO_ACCEPTCONN; 754 so->so_linger = head->so_linger; 755 so->so_state = head->so_state; 756 so->so_fibnum = head->so_fibnum; 757 so->so_proto = head->so_proto; 758 so->so_cred = crhold(head->so_cred); 759 #ifdef MAC 760 mac_socket_newconn(head, so); 761 #endif 762 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 763 so_rdknl_assert_lock); 764 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 765 so_wrknl_assert_lock); 766 VNET_SO_ASSERT(head); 767 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { 768 sodealloc(so); 769 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 770 __func__, head->so_pcb); 771 return (NULL); 772 } 773 so->so_rcv.sb_lowat = 
head->sol_sbrcv_lowat; 774 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 775 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 776 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 777 so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; 778 so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; 779 if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 780 so->so_snd.sb_mtx = &so->so_snd_mtx; 781 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 782 } 783 784 return (so); 785 } 786 787 /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. */ 788 struct socket * 789 sonewconn(struct socket *head, int connstatus) 790 { 791 struct socket *so; 792 793 if ((so = solisten_clone(head)) == NULL) 794 return (NULL); 795 796 if (so->so_proto->pr_attach(so, 0, NULL) != 0) { 797 sodealloc(so); 798 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", 799 __func__, head->so_pcb); 800 return (NULL); 801 } 802 803 (void)solisten_enqueue(so, connstatus); 804 805 return (so); 806 } 807 808 /* 809 * Enqueue socket cloned by solisten_clone() to the listen queue of the 810 * listener it has been cloned from. 811 * 812 * Return 'true' if socket landed on complete queue, otherwise 'false'. 813 */ 814 bool 815 solisten_enqueue(struct socket *so, int connstatus) 816 { 817 struct socket *head = so->so_listen; 818 819 MPASS(refcount_load(&so->so_count) == 0); 820 refcount_init(&so->so_count, 1); 821 822 SOLISTEN_LOCK(head); 823 if (head->sol_accept_filter != NULL) 824 connstatus = 0; 825 so->so_state |= connstatus; 826 soref(head); /* A socket on (in)complete queue refs head. */ 827 if (connstatus) { 828 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 829 so->so_qstate = SQ_COMP; 830 head->sol_qlen++; 831 solisten_wakeup(head); /* unlocks */ 832 return (true); 833 } else { 834 /* 835 * Keep removing sockets from the head until there's room for 836 * us to insert on the tail. 
In pre-locking revisions, this 837 * was a simple if(), but as we could be racing with other 838 * threads and soabort() requires dropping locks, we must 839 * loop waiting for the condition to be true. 840 */ 841 while (head->sol_incqlen > head->sol_qlimit) { 842 struct socket *sp; 843 844 sp = TAILQ_FIRST(&head->sol_incomp); 845 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 846 head->sol_incqlen--; 847 SOCK_LOCK(sp); 848 sp->so_qstate = SQ_NONE; 849 sp->so_listen = NULL; 850 SOCK_UNLOCK(sp); 851 sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ 852 soabort(sp); 853 SOLISTEN_LOCK(head); 854 } 855 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 856 so->so_qstate = SQ_INCOMP; 857 head->sol_incqlen++; 858 SOLISTEN_UNLOCK(head); 859 return (false); 860 } 861 } 862 863 #if defined(SCTP) || defined(SCTP_SUPPORT) 864 /* 865 * Socket part of sctp_peeloff(). Detach a new socket from an 866 * association. The new socket is returned with a reference. 867 * 868 * XXXGL: reduce copy-paste with solisten_clone(). 
869 */ 870 struct socket * 871 sopeeloff(struct socket *head) 872 { 873 struct socket *so; 874 875 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 876 __func__, __LINE__, head)); 877 so = soalloc(head->so_vnet); 878 if (so == NULL) { 879 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 880 "limit reached or out of memory\n", 881 __func__, head->so_pcb); 882 return (NULL); 883 } 884 so->so_type = head->so_type; 885 so->so_options = head->so_options; 886 so->so_linger = head->so_linger; 887 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 888 so->so_fibnum = head->so_fibnum; 889 so->so_proto = head->so_proto; 890 so->so_cred = crhold(head->so_cred); 891 #ifdef MAC 892 mac_socket_newconn(head, so); 893 #endif 894 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 895 so_rdknl_assert_lock); 896 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 897 so_wrknl_assert_lock); 898 VNET_SO_ASSERT(head); 899 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 900 sodealloc(so); 901 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 902 __func__, head->so_pcb); 903 return (NULL); 904 } 905 if ((*so->so_proto->pr_attach)(so, 0, NULL)) { 906 sodealloc(so); 907 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 908 __func__, head->so_pcb); 909 return (NULL); 910 } 911 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 912 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 913 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 914 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 915 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 916 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 917 918 soref(so); 919 920 return (so); 921 } 922 #endif /* SCTP */ 923 924 int 925 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 926 { 927 int error; 928 929 CURVNET_SET(so->so_vnet); 930 error = so->so_proto->pr_bind(so, nam, td); 931 CURVNET_RESTORE(); 932 return (error); 933 } 
934 935 int 936 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 937 { 938 int error; 939 940 CURVNET_SET(so->so_vnet); 941 error = so->so_proto->pr_bindat(fd, so, nam, td); 942 CURVNET_RESTORE(); 943 return (error); 944 } 945 946 /* 947 * solisten() transitions a socket from a non-listening state to a listening 948 * state, but can also be used to update the listen queue depth on an 949 * existing listen socket. The protocol will call back into the sockets 950 * layer using solisten_proto_check() and solisten_proto() to check and set 951 * socket-layer listen state. Call backs are used so that the protocol can 952 * acquire both protocol and socket layer locks in whatever order is required 953 * by the protocol. 954 * 955 * Protocol implementors are advised to hold the socket lock across the 956 * socket-layer test and set to avoid races at the socket layer. 957 */ 958 int 959 solisten(struct socket *so, int backlog, struct thread *td) 960 { 961 int error; 962 963 CURVNET_SET(so->so_vnet); 964 error = so->so_proto->pr_listen(so, backlog, td); 965 CURVNET_RESTORE(); 966 return (error); 967 } 968 969 /* 970 * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in 971 * order to interlock with socket I/O. 972 */ 973 int 974 solisten_proto_check(struct socket *so) 975 { 976 SOCK_LOCK_ASSERT(so); 977 978 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 979 SS_ISDISCONNECTING)) != 0) 980 return (EINVAL); 981 982 /* 983 * Sleeping is not permitted here, so simply fail if userspace is 984 * attempting to transmit or receive on the socket. This kind of 985 * transient failure is not ideal, but it should occur only if userspace 986 * is misusing the socket interfaces. 
987 */ 988 if (!sx_try_xlock(&so->so_snd_sx)) 989 return (EAGAIN); 990 if (!sx_try_xlock(&so->so_rcv_sx)) { 991 sx_xunlock(&so->so_snd_sx); 992 return (EAGAIN); 993 } 994 mtx_lock(&so->so_snd_mtx); 995 mtx_lock(&so->so_rcv_mtx); 996 997 /* Interlock with soo_aio_queue(). */ 998 if (!SOLISTENING(so) && 999 ((so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || 1000 (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0)) { 1001 solisten_proto_abort(so); 1002 return (EINVAL); 1003 } 1004 return (0); 1005 } 1006 1007 /* 1008 * Undo the setup done by solisten_proto_check(). 1009 */ 1010 void 1011 solisten_proto_abort(struct socket *so) 1012 { 1013 mtx_unlock(&so->so_snd_mtx); 1014 mtx_unlock(&so->so_rcv_mtx); 1015 sx_xunlock(&so->so_snd_sx); 1016 sx_xunlock(&so->so_rcv_sx); 1017 } 1018 1019 void 1020 solisten_proto(struct socket *so, int backlog) 1021 { 1022 int sbrcv_lowat, sbsnd_lowat; 1023 u_int sbrcv_hiwat, sbsnd_hiwat; 1024 short sbrcv_flags, sbsnd_flags; 1025 sbintime_t sbrcv_timeo, sbsnd_timeo; 1026 1027 SOCK_LOCK_ASSERT(so); 1028 KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1029 SS_ISDISCONNECTING)) == 0, 1030 ("%s: bad socket state %p", __func__, so)); 1031 1032 if (SOLISTENING(so)) 1033 goto listening; 1034 1035 /* 1036 * Change this socket to listening state. 
1037 */ 1038 sbrcv_lowat = so->so_rcv.sb_lowat; 1039 sbsnd_lowat = so->so_snd.sb_lowat; 1040 sbrcv_hiwat = so->so_rcv.sb_hiwat; 1041 sbsnd_hiwat = so->so_snd.sb_hiwat; 1042 sbrcv_flags = so->so_rcv.sb_flags; 1043 sbsnd_flags = so->so_snd.sb_flags; 1044 sbrcv_timeo = so->so_rcv.sb_timeo; 1045 sbsnd_timeo = so->so_snd.sb_timeo; 1046 1047 sbdestroy(so, SO_SND); 1048 sbdestroy(so, SO_RCV); 1049 1050 #ifdef INVARIANTS 1051 bzero(&so->so_rcv, 1052 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 1053 #endif 1054 1055 so->sol_sbrcv_lowat = sbrcv_lowat; 1056 so->sol_sbsnd_lowat = sbsnd_lowat; 1057 so->sol_sbrcv_hiwat = sbrcv_hiwat; 1058 so->sol_sbsnd_hiwat = sbsnd_hiwat; 1059 so->sol_sbrcv_flags = sbrcv_flags; 1060 so->sol_sbsnd_flags = sbsnd_flags; 1061 so->sol_sbrcv_timeo = sbrcv_timeo; 1062 so->sol_sbsnd_timeo = sbsnd_timeo; 1063 1064 so->sol_qlen = so->sol_incqlen = 0; 1065 TAILQ_INIT(&so->sol_incomp); 1066 TAILQ_INIT(&so->sol_comp); 1067 1068 so->sol_accept_filter = NULL; 1069 so->sol_accept_filter_arg = NULL; 1070 so->sol_accept_filter_str = NULL; 1071 1072 so->sol_upcall = NULL; 1073 so->sol_upcallarg = NULL; 1074 1075 so->so_options |= SO_ACCEPTCONN; 1076 1077 listening: 1078 if (backlog < 0 || backlog > somaxconn) 1079 backlog = somaxconn; 1080 so->sol_qlimit = backlog; 1081 1082 mtx_unlock(&so->so_snd_mtx); 1083 mtx_unlock(&so->so_rcv_mtx); 1084 sx_xunlock(&so->so_snd_sx); 1085 sx_xunlock(&so->so_rcv_sx); 1086 } 1087 1088 /* 1089 * Wakeup listeners/subsystems once we have a complete connection. 1090 * Enters with lock, returns unlocked. 
1091 */ 1092 void 1093 solisten_wakeup(struct socket *sol) 1094 { 1095 1096 if (sol->sol_upcall != NULL) 1097 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 1098 else { 1099 selwakeuppri(&sol->so_rdsel, PSOCK); 1100 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 1101 } 1102 SOLISTEN_UNLOCK(sol); 1103 wakeup_one(&sol->sol_comp); 1104 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) 1105 pgsigio(&sol->so_sigio, SIGIO, 0); 1106 } 1107 1108 /* 1109 * Return single connection off a listening socket queue. Main consumer of 1110 * the function is kern_accept4(). Some modules, that do their own accept 1111 * management also use the function. The socket reference held by the 1112 * listen queue is handed to the caller. 1113 * 1114 * Listening socket must be locked on entry and is returned unlocked on 1115 * return. 1116 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 1117 */ 1118 int 1119 solisten_dequeue(struct socket *head, struct socket **ret, int flags) 1120 { 1121 struct socket *so; 1122 int error; 1123 1124 SOLISTEN_LOCK_ASSERT(head); 1125 1126 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && 1127 head->so_error == 0) { 1128 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, 1129 "accept", 0); 1130 if (error != 0) { 1131 SOLISTEN_UNLOCK(head); 1132 return (error); 1133 } 1134 } 1135 if (head->so_error) { 1136 error = head->so_error; 1137 head->so_error = 0; 1138 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) 1139 error = EWOULDBLOCK; 1140 else 1141 error = 0; 1142 if (error) { 1143 SOLISTEN_UNLOCK(head); 1144 return (error); 1145 } 1146 so = TAILQ_FIRST(&head->sol_comp); 1147 SOCK_LOCK(so); 1148 KASSERT(so->so_qstate == SQ_COMP, 1149 ("%s: so %p not SQ_COMP", __func__, so)); 1150 head->sol_qlen--; 1151 so->so_qstate = SQ_NONE; 1152 so->so_listen = NULL; 1153 TAILQ_REMOVE(&head->sol_comp, so, so_list); 1154 if (flags & ACCEPT4_INHERIT) 1155 so->so_state |= (head->so_state & 
SS_NBIO); 1156 else 1157 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 1158 SOCK_UNLOCK(so); 1159 sorele_locked(head); 1160 1161 *ret = so; 1162 return (0); 1163 } 1164 1165 /* 1166 * Free socket upon release of the very last reference. 1167 */ 1168 static void 1169 sofree(struct socket *so) 1170 { 1171 struct protosw *pr = so->so_proto; 1172 1173 SOCK_LOCK_ASSERT(so); 1174 KASSERT(refcount_load(&so->so_count) == 0, 1175 ("%s: so %p has references", __func__, so)); 1176 KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, 1177 ("%s: so %p is on listen queue", __func__, so)); 1178 1179 SOCK_UNLOCK(so); 1180 1181 if (so->so_dtor != NULL) 1182 so->so_dtor(so); 1183 1184 VNET_SO_ASSERT(so); 1185 if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) { 1186 MPASS(pr->pr_domain->dom_dispose != NULL); 1187 (*pr->pr_domain->dom_dispose)(so); 1188 } 1189 if (pr->pr_detach != NULL) 1190 pr->pr_detach(so); 1191 1192 /* 1193 * From this point on, we assume that no other references to this 1194 * socket exist anywhere else in the stack. Therefore, no locks need 1195 * to be acquired or held. 1196 */ 1197 if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { 1198 sbdestroy(so, SO_SND); 1199 sbdestroy(so, SO_RCV); 1200 } 1201 seldrain(&so->so_rdsel); 1202 seldrain(&so->so_wrsel); 1203 knlist_destroy(&so->so_rdsel.si_note); 1204 knlist_destroy(&so->so_wrsel.si_note); 1205 sodealloc(so); 1206 } 1207 1208 /* 1209 * Release a reference on a socket while holding the socket lock. 1210 * Unlocks the socket lock before returning. 1211 */ 1212 void 1213 sorele_locked(struct socket *so) 1214 { 1215 SOCK_LOCK_ASSERT(so); 1216 if (refcount_release(&so->so_count)) 1217 sofree(so); 1218 else 1219 SOCK_UNLOCK(so); 1220 } 1221 1222 /* 1223 * Close a socket on last file table reference removal. Initiate disconnect 1224 * if connected. Free socket when disconnect complete. 1225 * 1226 * This function will sorele() the socket. 
Note that soclose() may be called 1227 * prior to the ref count reaching zero. The actual socket structure will 1228 * not be freed until the ref count reaches zero. 1229 */ 1230 int 1231 soclose(struct socket *so) 1232 { 1233 struct accept_queue lqueue; 1234 int error = 0; 1235 bool listening, last __diagused; 1236 1237 CURVNET_SET(so->so_vnet); 1238 funsetown(&so->so_sigio); 1239 if (so->so_state & SS_ISCONNECTED) { 1240 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1241 error = sodisconnect(so); 1242 if (error) { 1243 if (error == ENOTCONN) 1244 error = 0; 1245 goto drop; 1246 } 1247 } 1248 1249 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { 1250 if ((so->so_state & SS_ISDISCONNECTING) && 1251 (so->so_state & SS_NBIO)) 1252 goto drop; 1253 while (so->so_state & SS_ISCONNECTED) { 1254 error = tsleep(&so->so_timeo, 1255 PSOCK | PCATCH, "soclos", 1256 so->so_linger * hz); 1257 if (error) 1258 break; 1259 } 1260 } 1261 } 1262 1263 drop: 1264 if (so->so_proto->pr_close != NULL) 1265 so->so_proto->pr_close(so); 1266 1267 SOCK_LOCK(so); 1268 if ((listening = SOLISTENING(so))) { 1269 struct socket *sp; 1270 1271 TAILQ_INIT(&lqueue); 1272 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 1273 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 1274 1275 so->sol_qlen = so->sol_incqlen = 0; 1276 1277 TAILQ_FOREACH(sp, &lqueue, so_list) { 1278 SOCK_LOCK(sp); 1279 sp->so_qstate = SQ_NONE; 1280 sp->so_listen = NULL; 1281 SOCK_UNLOCK(sp); 1282 last = refcount_release(&so->so_count); 1283 KASSERT(!last, ("%s: released last reference for %p", 1284 __func__, so)); 1285 } 1286 } 1287 sorele_locked(so); 1288 if (listening) { 1289 struct socket *sp, *tsp; 1290 1291 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) 1292 soabort(sp); 1293 } 1294 CURVNET_RESTORE(); 1295 return (error); 1296 } 1297 1298 /* 1299 * soabort() is used to abruptly tear down a connection, such as when a 1300 * resource limit is reached (listen queue depth exceeded), or if a listen 1301 * socket 
is closed while there are sockets waiting to be accepted. 1302 * 1303 * This interface is tricky, because it is called on an unreferenced socket, 1304 * and must be called only by a thread that has actually removed the socket 1305 * from the listen queue it was on. Likely this thread holds the last 1306 * reference on the socket and soabort() will proceed with sofree(). But 1307 * it might be not the last, as the sockets on the listen queues are seen 1308 * from the protocol side. 1309 * 1310 * This interface will call into the protocol code, so must not be called 1311 * with any socket locks held. Protocols do call it while holding their own 1312 * recursible protocol mutexes, but this is something that should be subject 1313 * to review in the future. 1314 * 1315 * Usually socket should have a single reference left, but this is not a 1316 * requirement. In the past, when we have had named references for file 1317 * descriptor and protocol, we asserted that none of them are being held. 1318 */ 1319 void 1320 soabort(struct socket *so) 1321 { 1322 1323 VNET_SO_ASSERT(so); 1324 1325 if (so->so_proto->pr_abort != NULL) 1326 so->so_proto->pr_abort(so); 1327 SOCK_LOCK(so); 1328 sorele_locked(so); 1329 } 1330 1331 int 1332 soaccept(struct socket *so, struct sockaddr **nam) 1333 { 1334 int error; 1335 1336 CURVNET_SET(so->so_vnet); 1337 error = so->so_proto->pr_accept(so, nam); 1338 CURVNET_RESTORE(); 1339 return (error); 1340 } 1341 1342 int 1343 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 1344 { 1345 1346 return (soconnectat(AT_FDCWD, so, nam, td)); 1347 } 1348 1349 int 1350 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 1351 { 1352 int error; 1353 1354 CURVNET_SET(so->so_vnet); 1355 /* 1356 * If protocol is connection-based, can only connect once. 1357 * Otherwise, if connected, try to disconnect first. This allows 1358 * user to disconnect by connecting to, e.g., a null address. 
1359 */ 1360 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 1361 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 1362 (error = sodisconnect(so)))) { 1363 error = EISCONN; 1364 } else { 1365 /* 1366 * Prevent accumulated error from previous connection from 1367 * biting us. 1368 */ 1369 so->so_error = 0; 1370 if (fd == AT_FDCWD) { 1371 error = so->so_proto->pr_connect(so, nam, td); 1372 } else { 1373 error = so->so_proto->pr_connectat(fd, so, nam, td); 1374 } 1375 } 1376 CURVNET_RESTORE(); 1377 1378 return (error); 1379 } 1380 1381 int 1382 soconnect2(struct socket *so1, struct socket *so2) 1383 { 1384 int error; 1385 1386 CURVNET_SET(so1->so_vnet); 1387 error = so1->so_proto->pr_connect2(so1, so2); 1388 CURVNET_RESTORE(); 1389 return (error); 1390 } 1391 1392 int 1393 sodisconnect(struct socket *so) 1394 { 1395 int error; 1396 1397 if ((so->so_state & SS_ISCONNECTED) == 0) 1398 return (ENOTCONN); 1399 if (so->so_state & SS_ISDISCONNECTING) 1400 return (EALREADY); 1401 VNET_SO_ASSERT(so); 1402 error = so->so_proto->pr_disconnect(so); 1403 return (error); 1404 } 1405 1406 int 1407 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 1408 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1409 { 1410 long space; 1411 ssize_t resid; 1412 int clen = 0, error, dontroute; 1413 1414 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 1415 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 1416 ("sosend_dgram: !PR_ATOMIC")); 1417 1418 if (uio != NULL) 1419 resid = uio->uio_resid; 1420 else 1421 resid = top->m_pkthdr.len; 1422 /* 1423 * In theory resid should be unsigned. However, space must be 1424 * signed, as it might be less than 0 if we over-committed, and we 1425 * must use a signed comparison of space and resid. On the other 1426 * hand, a negative resid causes us to loop sending 0-length 1427 * segments to the protocol. 
1428 */ 1429 if (resid < 0) { 1430 error = EINVAL; 1431 goto out; 1432 } 1433 1434 dontroute = 1435 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 1436 if (td != NULL) 1437 td->td_ru.ru_msgsnd++; 1438 if (control != NULL) 1439 clen = control->m_len; 1440 1441 SOCKBUF_LOCK(&so->so_snd); 1442 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1443 SOCKBUF_UNLOCK(&so->so_snd); 1444 error = EPIPE; 1445 goto out; 1446 } 1447 if (so->so_error) { 1448 error = so->so_error; 1449 so->so_error = 0; 1450 SOCKBUF_UNLOCK(&so->so_snd); 1451 goto out; 1452 } 1453 if ((so->so_state & SS_ISCONNECTED) == 0) { 1454 /* 1455 * `sendto' and `sendmsg' is allowed on a connection-based 1456 * socket if it supports implied connect. Return ENOTCONN if 1457 * not connected and no address is supplied. 1458 */ 1459 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1460 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1461 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1462 !(resid == 0 && clen != 0)) { 1463 SOCKBUF_UNLOCK(&so->so_snd); 1464 error = ENOTCONN; 1465 goto out; 1466 } 1467 } else if (addr == NULL) { 1468 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1469 error = ENOTCONN; 1470 else 1471 error = EDESTADDRREQ; 1472 SOCKBUF_UNLOCK(&so->so_snd); 1473 goto out; 1474 } 1475 } 1476 1477 /* 1478 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1479 * problem and need fixing. 1480 */ 1481 space = sbspace(&so->so_snd); 1482 if (flags & MSG_OOB) 1483 space += 1024; 1484 space -= clen; 1485 SOCKBUF_UNLOCK(&so->so_snd); 1486 if (resid > space) { 1487 error = EMSGSIZE; 1488 goto out; 1489 } 1490 if (uio == NULL) { 1491 resid = 0; 1492 if (flags & MSG_EOR) 1493 top->m_flags |= M_EOR; 1494 } else { 1495 /* 1496 * Copy the data from userland into a mbuf chain. 1497 * If no data is to be copied in, a single empty mbuf 1498 * is returned. 1499 */ 1500 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 1501 (M_PKTHDR | ((flags & MSG_EOR) ? 
M_EOR : 0))); 1502 if (top == NULL) { 1503 error = EFAULT; /* only possible error */ 1504 goto out; 1505 } 1506 space -= resid - uio->uio_resid; 1507 resid = uio->uio_resid; 1508 } 1509 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1510 /* 1511 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1512 * than with. 1513 */ 1514 if (dontroute) { 1515 SOCK_LOCK(so); 1516 so->so_options |= SO_DONTROUTE; 1517 SOCK_UNLOCK(so); 1518 } 1519 /* 1520 * XXX all the SBS_CANTSENDMORE checks previously done could be out 1521 * of date. We could have received a reset packet in an interrupt or 1522 * maybe we slept while doing page faults in uiomove() etc. We could 1523 * probably recheck again inside the locking protection here, but 1524 * there are probably other places that this also happens. We must 1525 * rethink this. 1526 */ 1527 VNET_SO_ASSERT(so); 1528 error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : 1529 /* 1530 * If the user set MSG_EOF, the protocol understands this flag and 1531 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 1532 */ 1533 ((flags & MSG_EOF) && 1534 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1535 (resid <= 0)) ? 1536 PRUS_EOF : 1537 /* If there is more to send set PRUS_MORETOCOME */ 1538 (flags & MSG_MORETOCOME) || 1539 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1540 top, addr, control, td); 1541 if (dontroute) { 1542 SOCK_LOCK(so); 1543 so->so_options &= ~SO_DONTROUTE; 1544 SOCK_UNLOCK(so); 1545 } 1546 clen = 0; 1547 control = NULL; 1548 top = NULL; 1549 out: 1550 if (top != NULL) 1551 m_freem(top); 1552 if (control != NULL) 1553 m_freem(control); 1554 return (error); 1555 } 1556 1557 /* 1558 * Send on a socket. If send must go all at once and message is larger than 1559 * send buffering, then hard error. Lock against other senders. If must go 1560 * all at once and not enough room now, then inform user that this would 1561 * block and do nothing. 
Otherwise, if nonblocking, send as much as 1562 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1563 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1564 * in mbuf chain must be small enough to send all at once. 1565 * 1566 * Returns nonzero on error, timeout or signal; callers must check for short 1567 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1568 * on return. 1569 */ 1570 int 1571 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 1572 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1573 { 1574 long space; 1575 ssize_t resid; 1576 int clen = 0, error, dontroute; 1577 int atomic = sosendallatonce(so) || top; 1578 int pr_send_flag; 1579 #ifdef KERN_TLS 1580 struct ktls_session *tls; 1581 int tls_enq_cnt, tls_send_flag; 1582 uint8_t tls_rtype; 1583 1584 tls = NULL; 1585 tls_rtype = TLS_RLTYPE_APP; 1586 #endif 1587 if (uio != NULL) 1588 resid = uio->uio_resid; 1589 else if ((top->m_flags & M_PKTHDR) != 0) 1590 resid = top->m_pkthdr.len; 1591 else 1592 resid = m_length(top, NULL); 1593 /* 1594 * In theory resid should be unsigned. However, space must be 1595 * signed, as it might be less than 0 if we over-committed, and we 1596 * must use a signed comparison of space and resid. On the other 1597 * hand, a negative resid causes us to loop sending 0-length 1598 * segments to the protocol. 1599 * 1600 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1601 * type sockets since that's an error. 
1602 */ 1603 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1604 error = EINVAL; 1605 goto out; 1606 } 1607 1608 dontroute = 1609 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 1610 (so->so_proto->pr_flags & PR_ATOMIC); 1611 if (td != NULL) 1612 td->td_ru.ru_msgsnd++; 1613 if (control != NULL) 1614 clen = control->m_len; 1615 1616 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 1617 if (error) 1618 goto out; 1619 1620 #ifdef KERN_TLS 1621 tls_send_flag = 0; 1622 tls = ktls_hold(so->so_snd.sb_tls_info); 1623 if (tls != NULL) { 1624 if (tls->mode == TCP_TLS_MODE_SW) 1625 tls_send_flag = PRUS_NOTREADY; 1626 1627 if (control != NULL) { 1628 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1629 1630 if (clen >= sizeof(*cm) && 1631 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 1632 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 1633 clen = 0; 1634 m_freem(control); 1635 control = NULL; 1636 atomic = 1; 1637 } 1638 } 1639 1640 if (resid == 0 && !ktls_permit_empty_frames(tls)) { 1641 error = EINVAL; 1642 goto release; 1643 } 1644 } 1645 #endif 1646 1647 restart: 1648 do { 1649 SOCKBUF_LOCK(&so->so_snd); 1650 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1651 SOCKBUF_UNLOCK(&so->so_snd); 1652 error = EPIPE; 1653 goto release; 1654 } 1655 if (so->so_error) { 1656 error = so->so_error; 1657 so->so_error = 0; 1658 SOCKBUF_UNLOCK(&so->so_snd); 1659 goto release; 1660 } 1661 if ((so->so_state & SS_ISCONNECTED) == 0) { 1662 /* 1663 * `sendto' and `sendmsg' is allowed on a connection- 1664 * based socket if it supports implied connect. 1665 * Return ENOTCONN if not connected and no address is 1666 * supplied. 
1667 */ 1668 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1669 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1670 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1671 !(resid == 0 && clen != 0)) { 1672 SOCKBUF_UNLOCK(&so->so_snd); 1673 error = ENOTCONN; 1674 goto release; 1675 } 1676 } else if (addr == NULL) { 1677 SOCKBUF_UNLOCK(&so->so_snd); 1678 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1679 error = ENOTCONN; 1680 else 1681 error = EDESTADDRREQ; 1682 goto release; 1683 } 1684 } 1685 space = sbspace(&so->so_snd); 1686 if (flags & MSG_OOB) 1687 space += 1024; 1688 if ((atomic && resid > so->so_snd.sb_hiwat) || 1689 clen > so->so_snd.sb_hiwat) { 1690 SOCKBUF_UNLOCK(&so->so_snd); 1691 error = EMSGSIZE; 1692 goto release; 1693 } 1694 if (space < resid + clen && 1695 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 1696 if ((so->so_state & SS_NBIO) || 1697 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 1698 SOCKBUF_UNLOCK(&so->so_snd); 1699 error = EWOULDBLOCK; 1700 goto release; 1701 } 1702 error = sbwait(so, SO_SND); 1703 SOCKBUF_UNLOCK(&so->so_snd); 1704 if (error) 1705 goto release; 1706 goto restart; 1707 } 1708 SOCKBUF_UNLOCK(&so->so_snd); 1709 space -= clen; 1710 do { 1711 if (uio == NULL) { 1712 resid = 0; 1713 if (flags & MSG_EOR) 1714 top->m_flags |= M_EOR; 1715 #ifdef KERN_TLS 1716 if (tls != NULL) { 1717 ktls_frame(top, tls, &tls_enq_cnt, 1718 tls_rtype); 1719 tls_rtype = TLS_RLTYPE_APP; 1720 } 1721 #endif 1722 } else { 1723 /* 1724 * Copy the data from userland into a mbuf 1725 * chain. If resid is 0, which can happen 1726 * only if we have control to send, then 1727 * a single empty mbuf is returned. This 1728 * is a workaround to prevent protocol send 1729 * methods to panic. 1730 */ 1731 #ifdef KERN_TLS 1732 if (tls != NULL) { 1733 top = m_uiotombuf(uio, M_WAITOK, space, 1734 tls->params.max_frame_len, 1735 M_EXTPG | 1736 ((flags & MSG_EOR) ? 
M_EOR : 0)); 1737 if (top != NULL) { 1738 ktls_frame(top, tls, 1739 &tls_enq_cnt, tls_rtype); 1740 } 1741 tls_rtype = TLS_RLTYPE_APP; 1742 } else 1743 #endif 1744 top = m_uiotombuf(uio, M_WAITOK, space, 1745 (atomic ? max_hdr : 0), 1746 (atomic ? M_PKTHDR : 0) | 1747 ((flags & MSG_EOR) ? M_EOR : 0)); 1748 if (top == NULL) { 1749 error = EFAULT; /* only possible error */ 1750 goto release; 1751 } 1752 space -= resid - uio->uio_resid; 1753 resid = uio->uio_resid; 1754 } 1755 if (dontroute) { 1756 SOCK_LOCK(so); 1757 so->so_options |= SO_DONTROUTE; 1758 SOCK_UNLOCK(so); 1759 } 1760 /* 1761 * XXX all the SBS_CANTSENDMORE checks previously 1762 * done could be out of date. We could have received 1763 * a reset packet in an interrupt or maybe we slept 1764 * while doing page faults in uiomove() etc. We 1765 * could probably recheck again inside the locking 1766 * protection here, but there are probably other 1767 * places that this also happens. We must rethink 1768 * this. 1769 */ 1770 VNET_SO_ASSERT(so); 1771 1772 pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : 1773 /* 1774 * If the user set MSG_EOF, the protocol understands 1775 * this flag and nothing left to send then use 1776 * PRU_SEND_EOF instead of PRU_SEND. 1777 */ 1778 ((flags & MSG_EOF) && 1779 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1780 (resid <= 0)) ? 1781 PRUS_EOF : 1782 /* If there is more to send set PRUS_MORETOCOME. */ 1783 (flags & MSG_MORETOCOME) || 1784 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0; 1785 1786 #ifdef KERN_TLS 1787 pr_send_flag |= tls_send_flag; 1788 #endif 1789 1790 error = so->so_proto->pr_send(so, pr_send_flag, top, 1791 addr, control, td); 1792 1793 if (dontroute) { 1794 SOCK_LOCK(so); 1795 so->so_options &= ~SO_DONTROUTE; 1796 SOCK_UNLOCK(so); 1797 } 1798 1799 #ifdef KERN_TLS 1800 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 1801 if (error != 0) { 1802 m_freem(top); 1803 top = NULL; 1804 } else { 1805 soref(so); 1806 ktls_enqueue(top, so, tls_enq_cnt); 1807 } 1808 } 1809 #endif 1810 clen = 0; 1811 control = NULL; 1812 top = NULL; 1813 if (error) 1814 goto release; 1815 } while (resid && space > 0); 1816 } while (resid); 1817 1818 release: 1819 SOCK_IO_SEND_UNLOCK(so); 1820 out: 1821 #ifdef KERN_TLS 1822 if (tls != NULL) 1823 ktls_free(tls); 1824 #endif 1825 if (top != NULL) 1826 m_freem(top); 1827 if (control != NULL) 1828 m_freem(control); 1829 return (error); 1830 } 1831 1832 int 1833 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1834 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1835 { 1836 int error; 1837 1838 CURVNET_SET(so->so_vnet); 1839 error = so->so_proto->pr_sosend(so, addr, uio, 1840 top, control, flags, td); 1841 CURVNET_RESTORE(); 1842 return (error); 1843 } 1844 1845 /* 1846 * The part of soreceive() that implements reading non-inline out-of-band 1847 * data from a socket. For more complete comments, see soreceive(), from 1848 * which this code originated. 1849 * 1850 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1851 * unable to return an mbuf chain to the caller. 
1852 */ 1853 static int 1854 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1855 { 1856 struct protosw *pr = so->so_proto; 1857 struct mbuf *m; 1858 int error; 1859 1860 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1861 VNET_SO_ASSERT(so); 1862 1863 m = m_get(M_WAITOK, MT_DATA); 1864 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 1865 if (error) 1866 goto bad; 1867 do { 1868 error = uiomove(mtod(m, void *), 1869 (int) min(uio->uio_resid, m->m_len), uio); 1870 m = m_free(m); 1871 } while (uio->uio_resid && error == 0 && m); 1872 bad: 1873 if (m != NULL) 1874 m_freem(m); 1875 return (error); 1876 } 1877 1878 /* 1879 * Following replacement or removal of the first mbuf on the first mbuf chain 1880 * of a socket buffer, push necessary state changes back into the socket 1881 * buffer so that other consumers see the values consistently. 'nextrecord' 1882 * is the callers locally stored value of the original value of 1883 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1884 * NOTE: 'nextrecord' may be NULL. 1885 */ 1886 static __inline void 1887 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 1888 { 1889 1890 SOCKBUF_LOCK_ASSERT(sb); 1891 /* 1892 * First, update for the new value of nextrecord. If necessary, make 1893 * it the first record. 1894 */ 1895 if (sb->sb_mb != NULL) 1896 sb->sb_mb->m_nextpkt = nextrecord; 1897 else 1898 sb->sb_mb = nextrecord; 1899 1900 /* 1901 * Now update any dependent socket buffer fields to reflect the new 1902 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 1903 * addition of a second clause that takes care of the case where 1904 * sb_mb has been updated, but remains the last record. 1905 */ 1906 if (sb->sb_mb == NULL) { 1907 sb->sb_mbtail = NULL; 1908 sb->sb_lastrecord = NULL; 1909 } else if (sb->sb_mb->m_nextpkt == NULL) 1910 sb->sb_lastrecord = sb->sb_mb; 1911 } 1912 1913 /* 1914 * Implement receive operations on a socket. 
We depend on the way that 1915 * records are added to the sockbuf by sbappend. In particular, each record 1916 * (mbufs linked through m_next) must begin with an address if the protocol 1917 * so specifies, followed by an optional mbuf or mbufs containing ancillary 1918 * data, and then zero or more mbufs of data. In order to allow parallelism 1919 * between network receive and copying to user space, as well as avoid 1920 * sleeping with a mutex held, we release the socket buffer mutex during the 1921 * user space copy. Although the sockbuf is locked, new data may still be 1922 * appended, and thus we must maintain consistency of the sockbuf during that 1923 * time. 1924 * 1925 * The caller may receive the data as a single mbuf chain by supplying an 1926 * mbuf **mp0 for use in returning the chain. The uio is then used only for 1927 * the count in uio_resid. 1928 */ 1929 int 1930 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 1931 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1932 { 1933 struct mbuf *m, **mp; 1934 int flags, error, offset; 1935 ssize_t len; 1936 struct protosw *pr = so->so_proto; 1937 struct mbuf *nextrecord; 1938 int moff, type = 0; 1939 ssize_t orig_resid = uio->uio_resid; 1940 bool report_real_len = false; 1941 1942 mp = mp0; 1943 if (psa != NULL) 1944 *psa = NULL; 1945 if (controlp != NULL) 1946 *controlp = NULL; 1947 if (flagsp != NULL) { 1948 report_real_len = *flagsp & MSG_TRUNC; 1949 *flagsp &= ~MSG_TRUNC; 1950 flags = *flagsp &~ MSG_EOR; 1951 } else 1952 flags = 0; 1953 if (flags & MSG_OOB) 1954 return (soreceive_rcvoob(so, uio, flags)); 1955 if (mp != NULL) 1956 *mp = NULL; 1957 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1958 && uio->uio_resid) { 1959 VNET_SO_ASSERT(so); 1960 pr->pr_rcvd(so, 0); 1961 } 1962 1963 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 1964 if (error) 1965 return (error); 1966 1967 restart: 1968 SOCKBUF_LOCK(&so->so_rcv); 1969 m = so->so_rcv.sb_mb; 
1970 /* 1971 * If we have less data than requested, block awaiting more (subject 1972 * to any timeout) if: 1973 * 1. the current count is less than the low water mark, or 1974 * 2. MSG_DONTWAIT is not set 1975 */ 1976 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1977 sbavail(&so->so_rcv) < uio->uio_resid) && 1978 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 1979 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1980 KASSERT(m != NULL || !sbavail(&so->so_rcv), 1981 ("receive: m == %p sbavail == %u", 1982 m, sbavail(&so->so_rcv))); 1983 if (so->so_error || so->so_rerror) { 1984 if (m != NULL) 1985 goto dontblock; 1986 if (so->so_error) 1987 error = so->so_error; 1988 else 1989 error = so->so_rerror; 1990 if ((flags & MSG_PEEK) == 0) { 1991 if (so->so_error) 1992 so->so_error = 0; 1993 else 1994 so->so_rerror = 0; 1995 } 1996 SOCKBUF_UNLOCK(&so->so_rcv); 1997 goto release; 1998 } 1999 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2000 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2001 if (m != NULL) 2002 goto dontblock; 2003 #ifdef KERN_TLS 2004 else if (so->so_rcv.sb_tlsdcc == 0 && 2005 so->so_rcv.sb_tlscc == 0) { 2006 #else 2007 else { 2008 #endif 2009 SOCKBUF_UNLOCK(&so->so_rcv); 2010 goto release; 2011 } 2012 } 2013 for (; m != NULL; m = m->m_next) 2014 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2015 m = so->so_rcv.sb_mb; 2016 goto dontblock; 2017 } 2018 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2019 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2020 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2021 SOCKBUF_UNLOCK(&so->so_rcv); 2022 error = ENOTCONN; 2023 goto release; 2024 } 2025 if (uio->uio_resid == 0 && !report_real_len) { 2026 SOCKBUF_UNLOCK(&so->so_rcv); 2027 goto release; 2028 } 2029 if ((so->so_state & SS_NBIO) || 2030 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2031 SOCKBUF_UNLOCK(&so->so_rcv); 2032 error = EWOULDBLOCK; 2033 goto release; 2034 } 2035 SBLASTRECORDCHK(&so->so_rcv); 2036 SBLASTMBUFCHK(&so->so_rcv); 2037 error 
= sbwait(so, SO_RCV); 2038 SOCKBUF_UNLOCK(&so->so_rcv); 2039 if (error) 2040 goto release; 2041 goto restart; 2042 } 2043 dontblock: 2044 /* 2045 * From this point onward, we maintain 'nextrecord' as a cache of the 2046 * pointer to the next record in the socket buffer. We must keep the 2047 * various socket buffer pointers and local stack versions of the 2048 * pointers in sync, pushing out modifications before dropping the 2049 * socket buffer mutex, and re-reading them when picking it up. 2050 * 2051 * Otherwise, we will race with the network stack appending new data 2052 * or records onto the socket buffer by using inconsistent/stale 2053 * versions of the field, possibly resulting in socket buffer 2054 * corruption. 2055 * 2056 * By holding the high-level sblock(), we prevent simultaneous 2057 * readers from pulling off the front of the socket buffer. 2058 */ 2059 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2060 if (uio->uio_td) 2061 uio->uio_td->td_ru.ru_msgrcv++; 2062 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2063 SBLASTRECORDCHK(&so->so_rcv); 2064 SBLASTMBUFCHK(&so->so_rcv); 2065 nextrecord = m->m_nextpkt; 2066 if (pr->pr_flags & PR_ADDR) { 2067 KASSERT(m->m_type == MT_SONAME, 2068 ("m->m_type == %d", m->m_type)); 2069 orig_resid = 0; 2070 if (psa != NULL) 2071 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2072 M_NOWAIT); 2073 if (flags & MSG_PEEK) { 2074 m = m->m_next; 2075 } else { 2076 sbfree(&so->so_rcv, m); 2077 so->so_rcv.sb_mb = m_free(m); 2078 m = so->so_rcv.sb_mb; 2079 sockbuf_pushsync(&so->so_rcv, nextrecord); 2080 } 2081 } 2082 2083 /* 2084 * Process one or more MT_CONTROL mbufs present before any data mbufs 2085 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 2086 * just copy the data; if !MSG_PEEK, we call into the protocol to 2087 * perform externalization (or freeing if controlp == NULL). 
2088 */ 2089 if (m != NULL && m->m_type == MT_CONTROL) { 2090 struct mbuf *cm = NULL, *cmn; 2091 struct mbuf **cme = &cm; 2092 #ifdef KERN_TLS 2093 struct cmsghdr *cmsg; 2094 struct tls_get_record tgr; 2095 2096 /* 2097 * For MSG_TLSAPPDATA, check for an alert record. 2098 * If found, return ENXIO without removing 2099 * it from the receive queue. This allows a subsequent 2100 * call without MSG_TLSAPPDATA to receive it. 2101 * Note that, for TLS, there should only be a single 2102 * control mbuf with the TLS_GET_RECORD message in it. 2103 */ 2104 if (flags & MSG_TLSAPPDATA) { 2105 cmsg = mtod(m, struct cmsghdr *); 2106 if (cmsg->cmsg_type == TLS_GET_RECORD && 2107 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2108 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2109 if (__predict_false(tgr.tls_type == 2110 TLS_RLTYPE_ALERT)) { 2111 SOCKBUF_UNLOCK(&so->so_rcv); 2112 error = ENXIO; 2113 goto release; 2114 } 2115 } 2116 } 2117 #endif 2118 2119 do { 2120 if (flags & MSG_PEEK) { 2121 if (controlp != NULL) { 2122 *controlp = m_copym(m, 0, m->m_len, 2123 M_NOWAIT); 2124 controlp = &(*controlp)->m_next; 2125 } 2126 m = m->m_next; 2127 } else { 2128 sbfree(&so->so_rcv, m); 2129 so->so_rcv.sb_mb = m->m_next; 2130 m->m_next = NULL; 2131 *cme = m; 2132 cme = &(*cme)->m_next; 2133 m = so->so_rcv.sb_mb; 2134 } 2135 } while (m != NULL && m->m_type == MT_CONTROL); 2136 if ((flags & MSG_PEEK) == 0) 2137 sockbuf_pushsync(&so->so_rcv, nextrecord); 2138 while (cm != NULL) { 2139 cmn = cm->m_next; 2140 cm->m_next = NULL; 2141 if (pr->pr_domain->dom_externalize != NULL) { 2142 SOCKBUF_UNLOCK(&so->so_rcv); 2143 VNET_SO_ASSERT(so); 2144 error = (*pr->pr_domain->dom_externalize) 2145 (cm, controlp, flags); 2146 SOCKBUF_LOCK(&so->so_rcv); 2147 } else if (controlp != NULL) 2148 *controlp = cm; 2149 else 2150 m_freem(cm); 2151 if (controlp != NULL) { 2152 while (*controlp != NULL) 2153 controlp = &(*controlp)->m_next; 2154 } 2155 cm = cmn; 2156 } 2157 if (m != NULL) 2158 nextrecord = 
so->so_rcv.sb_mb->m_nextpkt; 2159 else 2160 nextrecord = so->so_rcv.sb_mb; 2161 orig_resid = 0; 2162 } 2163 if (m != NULL) { 2164 if ((flags & MSG_PEEK) == 0) { 2165 KASSERT(m->m_nextpkt == nextrecord, 2166 ("soreceive: post-control, nextrecord !sync")); 2167 if (nextrecord == NULL) { 2168 KASSERT(so->so_rcv.sb_mb == m, 2169 ("soreceive: post-control, sb_mb!=m")); 2170 KASSERT(so->so_rcv.sb_lastrecord == m, 2171 ("soreceive: post-control, lastrecord!=m")); 2172 } 2173 } 2174 type = m->m_type; 2175 if (type == MT_OOBDATA) 2176 flags |= MSG_OOB; 2177 } else { 2178 if ((flags & MSG_PEEK) == 0) { 2179 KASSERT(so->so_rcv.sb_mb == nextrecord, 2180 ("soreceive: sb_mb != nextrecord")); 2181 if (so->so_rcv.sb_mb == NULL) { 2182 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2183 ("soreceive: sb_lastercord != NULL")); 2184 } 2185 } 2186 } 2187 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2188 SBLASTRECORDCHK(&so->so_rcv); 2189 SBLASTMBUFCHK(&so->so_rcv); 2190 2191 /* 2192 * Now continue to read any data mbufs off of the head of the socket 2193 * buffer until the read request is satisfied. Note that 'type' is 2194 * used to store the type of any mbuf reads that have happened so far 2195 * such that soreceive() can stop reading if the type changes, which 2196 * causes soreceive() to return only one of regular data and inline 2197 * out-of-band data in a single socket receive operation. 2198 */ 2199 moff = 0; 2200 offset = 0; 2201 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 2202 && error == 0) { 2203 /* 2204 * If the type of mbuf has changed since the last mbuf 2205 * examined ('type'), end the receive operation. 
2206 */ 2207 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2208 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 2209 if (type != m->m_type) 2210 break; 2211 } else if (type == MT_OOBDATA) 2212 break; 2213 else 2214 KASSERT(m->m_type == MT_DATA, 2215 ("m->m_type == %d", m->m_type)); 2216 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 2217 len = uio->uio_resid; 2218 if (so->so_oobmark && len > so->so_oobmark - offset) 2219 len = so->so_oobmark - offset; 2220 if (len > m->m_len - moff) 2221 len = m->m_len - moff; 2222 /* 2223 * If mp is set, just pass back the mbufs. Otherwise copy 2224 * them out via the uio, then free. Sockbuf must be 2225 * consistent here (points to current mbuf, it points to next 2226 * record) when we drop priority; we must note any additions 2227 * to the sockbuf when we block interrupts again. 2228 */ 2229 if (mp == NULL) { 2230 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2231 SBLASTRECORDCHK(&so->so_rcv); 2232 SBLASTMBUFCHK(&so->so_rcv); 2233 SOCKBUF_UNLOCK(&so->so_rcv); 2234 if ((m->m_flags & M_EXTPG) != 0) 2235 error = m_unmapped_uiomove(m, moff, uio, 2236 (int)len); 2237 else 2238 error = uiomove(mtod(m, char *) + moff, 2239 (int)len, uio); 2240 SOCKBUF_LOCK(&so->so_rcv); 2241 if (error) { 2242 /* 2243 * The MT_SONAME mbuf has already been removed 2244 * from the record, so it is necessary to 2245 * remove the data mbufs, if any, to preserve 2246 * the invariant in the case of PR_ADDR that 2247 * requires MT_SONAME mbufs at the head of 2248 * each record. 
2249 */ 2250 if (pr->pr_flags & PR_ATOMIC && 2251 ((flags & MSG_PEEK) == 0)) 2252 (void)sbdroprecord_locked(&so->so_rcv); 2253 SOCKBUF_UNLOCK(&so->so_rcv); 2254 goto release; 2255 } 2256 } else 2257 uio->uio_resid -= len; 2258 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2259 if (len == m->m_len - moff) { 2260 if (m->m_flags & M_EOR) 2261 flags |= MSG_EOR; 2262 if (flags & MSG_PEEK) { 2263 m = m->m_next; 2264 moff = 0; 2265 } else { 2266 nextrecord = m->m_nextpkt; 2267 sbfree(&so->so_rcv, m); 2268 if (mp != NULL) { 2269 m->m_nextpkt = NULL; 2270 *mp = m; 2271 mp = &m->m_next; 2272 so->so_rcv.sb_mb = m = m->m_next; 2273 *mp = NULL; 2274 } else { 2275 so->so_rcv.sb_mb = m_free(m); 2276 m = so->so_rcv.sb_mb; 2277 } 2278 sockbuf_pushsync(&so->so_rcv, nextrecord); 2279 SBLASTRECORDCHK(&so->so_rcv); 2280 SBLASTMBUFCHK(&so->so_rcv); 2281 } 2282 } else { 2283 if (flags & MSG_PEEK) 2284 moff += len; 2285 else { 2286 if (mp != NULL) { 2287 if (flags & MSG_DONTWAIT) { 2288 *mp = m_copym(m, 0, len, 2289 M_NOWAIT); 2290 if (*mp == NULL) { 2291 /* 2292 * m_copym() couldn't 2293 * allocate an mbuf. 2294 * Adjust uio_resid back 2295 * (it was adjusted 2296 * down by len bytes, 2297 * which we didn't end 2298 * up "copying" over). 2299 */ 2300 uio->uio_resid += len; 2301 break; 2302 } 2303 } else { 2304 SOCKBUF_UNLOCK(&so->so_rcv); 2305 *mp = m_copym(m, 0, len, 2306 M_WAITOK); 2307 SOCKBUF_LOCK(&so->so_rcv); 2308 } 2309 } 2310 sbcut_locked(&so->so_rcv, len); 2311 } 2312 } 2313 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2314 if (so->so_oobmark) { 2315 if ((flags & MSG_PEEK) == 0) { 2316 so->so_oobmark -= len; 2317 if (so->so_oobmark == 0) { 2318 so->so_rcv.sb_state |= SBS_RCVATMARK; 2319 break; 2320 } 2321 } else { 2322 offset += len; 2323 if (offset == so->so_oobmark) 2324 break; 2325 } 2326 } 2327 if (flags & MSG_EOR) 2328 break; 2329 /* 2330 * If the MSG_WAITALL flag is set (for non-atomic socket), we 2331 * must not quit until "uio->uio_resid == 0" or an error 2332 * termination. 
If a signal/timeout occurs, return with a 2333 * short count but without error. Keep sockbuf locked 2334 * against other readers. 2335 */ 2336 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 2337 !sosendallatonce(so) && nextrecord == NULL) { 2338 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2339 if (so->so_error || so->so_rerror || 2340 so->so_rcv.sb_state & SBS_CANTRCVMORE) 2341 break; 2342 /* 2343 * Notify the protocol that some data has been 2344 * drained before blocking. 2345 */ 2346 if (pr->pr_flags & PR_WANTRCVD) { 2347 SOCKBUF_UNLOCK(&so->so_rcv); 2348 VNET_SO_ASSERT(so); 2349 pr->pr_rcvd(so, flags); 2350 SOCKBUF_LOCK(&so->so_rcv); 2351 } 2352 SBLASTRECORDCHK(&so->so_rcv); 2353 SBLASTMBUFCHK(&so->so_rcv); 2354 /* 2355 * We could receive some data while was notifying 2356 * the protocol. Skip blocking in this case. 2357 */ 2358 if (so->so_rcv.sb_mb == NULL) { 2359 error = sbwait(so, SO_RCV); 2360 if (error) { 2361 SOCKBUF_UNLOCK(&so->so_rcv); 2362 goto release; 2363 } 2364 } 2365 m = so->so_rcv.sb_mb; 2366 if (m != NULL) 2367 nextrecord = m->m_nextpkt; 2368 } 2369 } 2370 2371 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2372 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 2373 if (report_real_len) 2374 uio->uio_resid -= m_length(m, NULL) - moff; 2375 flags |= MSG_TRUNC; 2376 if ((flags & MSG_PEEK) == 0) 2377 (void) sbdroprecord_locked(&so->so_rcv); 2378 } 2379 if ((flags & MSG_PEEK) == 0) { 2380 if (m == NULL) { 2381 /* 2382 * First part is an inline SB_EMPTY_FIXUP(). Second 2383 * part makes sure sb_lastrecord is up-to-date if 2384 * there is still data in the socket buffer. 
*/
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		/*
		 * If soreceive() is being done from the socket callback,
		 * then don't need to generate ACK to peer to update window,
		 * since ACK will be generated on return to TCP.
		 */
		if (!(flags & MSG_SOCALLBCK) &&
		    (pr->pr_flags & PR_WANTRCVD)) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			VNET_SO_ASSERT(so);
			pr->pr_rcvd(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		SOCKBUF_UNLOCK(&so->so_rcv);
		goto restart;
	}
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 *
 * Only SOCK_STREAM sockets are accepted (EINVAL otherwise).  No address or
 * control data is produced: *psa and *controlp are set to NULL when those
 * pointers are supplied.  MSG_OOB requests are handed to soreceive_rcvoob()
 * and, under KERN_TLS, a receive buffer with sb_tls_info set is handed to
 * soreceive_generic().
 *
 * Data is delivered either by copying into 'uio' (mp0 == NULL) or by
 * returning an mbuf chain through *mp0; in both cases uio->uio_resid is the
 * byte budget and is decremented by the amount delivered.  The flags in
 * *flagsp are read (with MSG_EOR masked off) but are not written back by
 * this function.
 *
 * Returns 0 or an errno value:
 *   EINVAL    not a stream socket, or uio->uio_resid is 0;
 *   ENOTCONN  neither SS_ISCONNECTED nor SS_ISDISCONNECTED is set;
 *   EAGAIN    no data queued and the request is non-blocking;
 *   ENOBUFS   m_copym() failed and nothing was collected in *mp0;
 *   so_error  when it is pending and no data is queued or was delivered;
 *   or the error from SOCK_IO_RECV_LOCK(), sbwait() or m_mbuftouio().
 */
int
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (controlp != NULL)
		*controlp = NULL;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

#ifdef KERN_TLS
	/*
	 * KTLS store TLS records as records with a control message to
	 * describe the framing.
	 *
	 * We check once here before acquiring locks to optimize the
	 * common case.
	 */
	if (sb->sb_tls_info != NULL)
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
#endif

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

#ifdef KERN_TLS
	/*
	 * Re-check now that the locks are held; both are dropped again
	 * before handing the request to soreceive_generic().
	 */
	if (sb->sb_tls_info != NULL) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_RECV_UNLOCK(so);
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
	}
#endif

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Remember the requested size so the so_error check below can tell
	 * whether an earlier pass already delivered something.
	 */
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb) > 0)
			goto deliver;
		/*
		 * Something was already delivered on an earlier pass:
		 * return now and leave so_error untouched.
		 */
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb) > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(so, SO_RCV);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			if (*mp0 == NULL)
				*mp0 = sb->sb_mb;
			else
				m_cat(*mp0, sb->sb_mb);
			for (m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				KASSERT(!(m->m_flags & M_NOTAVAIL),
				    ("%s: m %p not available", __func__, m));
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			/*
			 * 'n' is the last mbuf the loop consumed (the guard
			 * above ensures at least the first one qualifies);
			 * cut the chain there and leave the rest queued.
			 */
			n->m_next = NULL;
			sb->sb_mb = m;
			sb->sb_lastrecord = sb->sb_mb;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= len;
			if (*mp0 != NULL)
				m_cat(*mp0, m);
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			so->so_proto->pr_rcvd(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	/* Common exit: entered with both the sockbuf and I/O locks held. */
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}

/*
 * Optimized version of soreceive() for simple datagram cases from userspace.
 * Unlike in the stream case, we're able to drop a datagram if copyout()
 * fails, and because we handle datagrams atomically, we don't need to use a
 * sleep lock to prevent I/O interlacing.
 *
 * Requests with mp0 != NULL, or with any of MSG_PEEK, MSG_OOB or MSG_TRUNC
 * set, are forwarded to soreceive_generic().  Otherwise one record is
 * unlinked from the receive buffer under the sockbuf lock and then processed
 * with the lock dropped: for PR_ADDR protocols the leading MT_SONAME mbuf is
 * duplicated into *psa (when psa != NULL); leading MT_CONTROL mbufs are
 * externalized, passed back through controlp, or freed; and the data is
 * copied into 'uio'.  Data that does not fit is freed and MSG_TRUNC is
 * reported through *flagsp.
 *
 * Returns 0 or an errno value: a pending so_error (which is cleared),
 * EWOULDBLOCK for a non-blocking request on an empty buffer, or the error
 * from sbwait() or uiomove().  An empty buffer combined with
 * SBS_CANTRCVMORE or with a zero-length request returns 0.
 */
int
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *m2;
	int flags, error;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * For any complicated cases, fall back to the full
	 * soreceive_generic().
	 */
	if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC)))
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));

	/*
	 * Enforce restrictions on use.
	 */
	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
	    ("soreceive_dgram: wantrcvd"));
	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
	    ("soreceive_dgram: SBS_RCVATMARK"));
	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: P_CONNREQUIRED"));

	/*
	 * Loop blocking while waiting for a datagram.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	while ((m = so->so_rcv.sb_mb) == NULL) {
		KASSERT(sbavail(&so->so_rcv) == 0,
		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
		    sbavail(&so->so_rcv)));
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (0);
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (EWOULDBLOCK);
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(so, SO_RCV);
		if (error) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (nextrecord == NULL) {
		KASSERT(so->so_rcv.sb_lastrecord == m,
		    ("soreceive_dgram: lastrecord != m"));
	}

	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
	    ("soreceive_dgram: m_nextpkt != nextrecord"));

	/*
	 * Pull 'm' and its chain off the front of the packet queue.
	 */
	so->so_rcv.sb_mb = NULL;
	sockbuf_pushsync(&so->so_rcv, nextrecord);

	/*
	 * Walk 'm's chain and free that many bytes from the socket buffer.
	 */
	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		sbfree(&so->so_rcv, m2);

	/*
	 * Do a few last checks before we let go of the lock.
	 */
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		/* Drop the address mbuf; continue with what m_free() returns. */
		m = m_free(m);
	}
	if (m == NULL) {
		/* XXXRW: Can this happen? */
		return (0);
	}

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  We call into the
	 * protocol to perform externalization (or freeing if controlp ==
	 * NULL).  In some cases there can be only MT_CONTROL mbufs without
	 * MT_DATA mbufs.
	 */
	if (m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		/* Detach the leading control mbufs onto the 'cm' list. */
		do {
			m2 = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = m2;
		} while (m != NULL && m->m_type == MT_CONTROL);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			/*
			 * NOTE(review): the value stored in 'error' here is
			 * not examined; it is either overwritten by uiomove()
			 * below or the function returns 0.  Confirm that is
			 * intended.
			 */
			if (pr->pr_domain->dom_externalize != NULL) {
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp, flags);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			/* Advance controlp to the tail of what was appended. */
			if (controlp != NULL) {
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
	}
	KASSERT(m == NULL || m->m_type == MT_DATA,
	    ("soreceive_dgram: !data"));
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			/* The record is already off the queue: drop it. */
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			m->m_data += len;
			m->m_len -= len;
		}
	}
	if (m != NULL) {
flags |= MSG_TRUNC; 2825 m_freem(m); 2826 } 2827 if (flagsp != NULL) 2828 *flagsp |= flags; 2829 return (0); 2830 } 2831 2832 int 2833 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2834 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2835 { 2836 int error; 2837 2838 CURVNET_SET(so->so_vnet); 2839 error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); 2840 CURVNET_RESTORE(); 2841 return (error); 2842 } 2843 2844 int 2845 soshutdown(struct socket *so, int how) 2846 { 2847 struct protosw *pr; 2848 int error, soerror_enotconn; 2849 2850 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 2851 return (EINVAL); 2852 2853 soerror_enotconn = 0; 2854 SOCK_LOCK(so); 2855 if ((so->so_state & 2856 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { 2857 /* 2858 * POSIX mandates us to return ENOTCONN when shutdown(2) is 2859 * invoked on a datagram sockets, however historically we would 2860 * actually tear socket down. This is known to be leveraged by 2861 * some applications to unblock process waiting in recvXXX(2) 2862 * by other process that it shares that socket with. Try to meet 2863 * both backward-compatibility and POSIX requirements by forcing 2864 * ENOTCONN but still asking protocol to perform pru_shutdown(). 2865 */ 2866 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) { 2867 SOCK_UNLOCK(so); 2868 return (ENOTCONN); 2869 } 2870 soerror_enotconn = 1; 2871 } 2872 2873 if (SOLISTENING(so)) { 2874 if (how != SHUT_WR) { 2875 so->so_error = ECONNABORTED; 2876 solisten_wakeup(so); /* unlocks so */ 2877 } else { 2878 SOCK_UNLOCK(so); 2879 } 2880 goto done; 2881 } 2882 SOCK_UNLOCK(so); 2883 2884 CURVNET_SET(so->so_vnet); 2885 pr = so->so_proto; 2886 if (pr->pr_flush != NULL) 2887 pr->pr_flush(so, how); 2888 if (how != SHUT_WR) 2889 sorflush(so); 2890 if (how != SHUT_RD) { 2891 error = pr->pr_shutdown(so); 2892 wakeup(&so->so_timeo); 2893 CURVNET_RESTORE(); 2894 return ((error == 0 && soerror_enotconn) ? 
ENOTCONN : error); 2895 } 2896 wakeup(&so->so_timeo); 2897 CURVNET_RESTORE(); 2898 2899 done: 2900 return (soerror_enotconn ? ENOTCONN : 0); 2901 } 2902 2903 void 2904 sorflush(struct socket *so) 2905 { 2906 struct protosw *pr; 2907 int error; 2908 2909 VNET_SO_ASSERT(so); 2910 2911 /* 2912 * Dislodge threads currently blocked in receive and wait to acquire 2913 * a lock against other simultaneous readers before clearing the 2914 * socket buffer. Don't let our acquire be interrupted by a signal 2915 * despite any existing socket disposition on interruptable waiting. 2916 */ 2917 socantrcvmore(so); 2918 2919 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); 2920 if (error != 0) { 2921 KASSERT(SOLISTENING(so), 2922 ("%s: soiolock(%p) failed", __func__, so)); 2923 return; 2924 } 2925 2926 pr = so->so_proto; 2927 if (pr->pr_flags & PR_RIGHTS) { 2928 MPASS(pr->pr_domain->dom_dispose != NULL); 2929 (*pr->pr_domain->dom_dispose)(so); 2930 } else { 2931 sbrelease(so, SO_RCV); 2932 SOCK_IO_RECV_UNLOCK(so); 2933 } 2934 2935 } 2936 2937 /* 2938 * Wrapper for Socket established helper hook. 2939 * Parameters: socket, context of the hook point, hook id. 2940 */ 2941 static int inline 2942 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 2943 { 2944 struct socket_hhook_data hhook_data = { 2945 .so = so, 2946 .hctx = hctx, 2947 .m = NULL, 2948 .status = 0 2949 }; 2950 2951 CURVNET_SET(so->so_vnet); 2952 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 2953 CURVNET_RESTORE(); 2954 2955 /* Ugly but needed, since hhooks return void for now */ 2956 return (hhook_data.status); 2957 } 2958 2959 /* 2960 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 2961 * additional variant to handle the case where the option value needs to be 2962 * some kind of integer, but not a specific size. In addition to their use 2963 * here, these functions are also called by the protocol-level pr_ctloutput() 2964 * routines. 
2965 */ 2966 int 2967 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 2968 { 2969 size_t valsize; 2970 2971 /* 2972 * If the user gives us more than we wanted, we ignore it, but if we 2973 * don't get the minimum length the caller wants, we return EINVAL. 2974 * On success, sopt->sopt_valsize is set to however much we actually 2975 * retrieved. 2976 */ 2977 if ((valsize = sopt->sopt_valsize) < minlen) 2978 return EINVAL; 2979 if (valsize > len) 2980 sopt->sopt_valsize = valsize = len; 2981 2982 if (sopt->sopt_td != NULL) 2983 return (copyin(sopt->sopt_val, buf, valsize)); 2984 2985 bcopy(sopt->sopt_val, buf, valsize); 2986 return (0); 2987 } 2988 2989 /* 2990 * Kernel version of setsockopt(2). 2991 * 2992 * XXX: optlen is size_t, not socklen_t 2993 */ 2994 int 2995 so_setsockopt(struct socket *so, int level, int optname, void *optval, 2996 size_t optlen) 2997 { 2998 struct sockopt sopt; 2999 3000 sopt.sopt_level = level; 3001 sopt.sopt_name = optname; 3002 sopt.sopt_dir = SOPT_SET; 3003 sopt.sopt_val = optval; 3004 sopt.sopt_valsize = optlen; 3005 sopt.sopt_td = NULL; 3006 return (sosetopt(so, &sopt)); 3007 } 3008 3009 int 3010 sosetopt(struct socket *so, struct sockopt *sopt) 3011 { 3012 int error, optval; 3013 struct linger l; 3014 struct timeval tv; 3015 sbintime_t val, *valp; 3016 uint32_t val32; 3017 #ifdef MAC 3018 struct mac extmac; 3019 #endif 3020 3021 CURVNET_SET(so->so_vnet); 3022 error = 0; 3023 if (sopt->sopt_level != SOL_SOCKET) { 3024 if (so->so_proto->pr_ctloutput != NULL) 3025 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3026 else 3027 error = ENOPROTOOPT; 3028 } else { 3029 switch (sopt->sopt_name) { 3030 case SO_ACCEPTFILTER: 3031 error = accept_filt_setopt(so, sopt); 3032 if (error) 3033 goto bad; 3034 break; 3035 3036 case SO_LINGER: 3037 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 3038 if (error) 3039 goto bad; 3040 if (l.l_linger < 0 || 3041 l.l_linger > USHRT_MAX || 3042 l.l_linger > (INT_MAX / hz)) { 3043 
error = EDOM; 3044 goto bad; 3045 } 3046 SOCK_LOCK(so); 3047 so->so_linger = l.l_linger; 3048 if (l.l_onoff) 3049 so->so_options |= SO_LINGER; 3050 else 3051 so->so_options &= ~SO_LINGER; 3052 SOCK_UNLOCK(so); 3053 break; 3054 3055 case SO_DEBUG: 3056 case SO_KEEPALIVE: 3057 case SO_DONTROUTE: 3058 case SO_USELOOPBACK: 3059 case SO_BROADCAST: 3060 case SO_REUSEADDR: 3061 case SO_REUSEPORT: 3062 case SO_REUSEPORT_LB: 3063 case SO_OOBINLINE: 3064 case SO_TIMESTAMP: 3065 case SO_BINTIME: 3066 case SO_NOSIGPIPE: 3067 case SO_NO_DDP: 3068 case SO_NO_OFFLOAD: 3069 case SO_RERROR: 3070 error = sooptcopyin(sopt, &optval, sizeof optval, 3071 sizeof optval); 3072 if (error) 3073 goto bad; 3074 SOCK_LOCK(so); 3075 if (optval) 3076 so->so_options |= sopt->sopt_name; 3077 else 3078 so->so_options &= ~sopt->sopt_name; 3079 SOCK_UNLOCK(so); 3080 break; 3081 3082 case SO_SETFIB: 3083 error = sooptcopyin(sopt, &optval, sizeof optval, 3084 sizeof optval); 3085 if (error) 3086 goto bad; 3087 3088 if (optval < 0 || optval >= rt_numfibs) { 3089 error = EINVAL; 3090 goto bad; 3091 } 3092 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 3093 (so->so_proto->pr_domain->dom_family == PF_INET6) || 3094 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 3095 so->so_fibnum = optval; 3096 else 3097 so->so_fibnum = 0; 3098 break; 3099 3100 case SO_USER_COOKIE: 3101 error = sooptcopyin(sopt, &val32, sizeof val32, 3102 sizeof val32); 3103 if (error) 3104 goto bad; 3105 so->so_user_cookie = val32; 3106 break; 3107 3108 case SO_SNDBUF: 3109 case SO_RCVBUF: 3110 case SO_SNDLOWAT: 3111 case SO_RCVLOWAT: 3112 error = sooptcopyin(sopt, &optval, sizeof optval, 3113 sizeof optval); 3114 if (error) 3115 goto bad; 3116 3117 /* 3118 * Values < 1 make no sense for any of these options, 3119 * so disallow them. 
3120 */ 3121 if (optval < 1) { 3122 error = EINVAL; 3123 goto bad; 3124 } 3125 3126 error = sbsetopt(so, sopt->sopt_name, optval); 3127 break; 3128 3129 case SO_SNDTIMEO: 3130 case SO_RCVTIMEO: 3131 #ifdef COMPAT_FREEBSD32 3132 if (SV_CURPROC_FLAG(SV_ILP32)) { 3133 struct timeval32 tv32; 3134 3135 error = sooptcopyin(sopt, &tv32, sizeof tv32, 3136 sizeof tv32); 3137 CP(tv32, tv, tv_sec); 3138 CP(tv32, tv, tv_usec); 3139 } else 3140 #endif 3141 error = sooptcopyin(sopt, &tv, sizeof tv, 3142 sizeof tv); 3143 if (error) 3144 goto bad; 3145 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 3146 tv.tv_usec >= 1000000) { 3147 error = EDOM; 3148 goto bad; 3149 } 3150 if (tv.tv_sec > INT32_MAX) 3151 val = SBT_MAX; 3152 else 3153 val = tvtosbt(tv); 3154 SOCK_LOCK(so); 3155 valp = sopt->sopt_name == SO_SNDTIMEO ? 3156 (SOLISTENING(so) ? &so->sol_sbsnd_timeo : 3157 &so->so_snd.sb_timeo) : 3158 (SOLISTENING(so) ? &so->sol_sbrcv_timeo : 3159 &so->so_rcv.sb_timeo); 3160 *valp = val; 3161 SOCK_UNLOCK(so); 3162 break; 3163 3164 case SO_LABEL: 3165 #ifdef MAC 3166 error = sooptcopyin(sopt, &extmac, sizeof extmac, 3167 sizeof extmac); 3168 if (error) 3169 goto bad; 3170 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 3171 so, &extmac); 3172 #else 3173 error = EOPNOTSUPP; 3174 #endif 3175 break; 3176 3177 case SO_TS_CLOCK: 3178 error = sooptcopyin(sopt, &optval, sizeof optval, 3179 sizeof optval); 3180 if (error) 3181 goto bad; 3182 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 3183 error = EINVAL; 3184 goto bad; 3185 } 3186 so->so_ts_clock = optval; 3187 break; 3188 3189 case SO_MAX_PACING_RATE: 3190 error = sooptcopyin(sopt, &val32, sizeof(val32), 3191 sizeof(val32)); 3192 if (error) 3193 goto bad; 3194 so->so_max_pacing_rate = val32; 3195 break; 3196 3197 default: 3198 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3199 error = hhook_run_socket(so, sopt, 3200 HHOOK_SOCKET_OPT); 3201 else 3202 error = ENOPROTOOPT; 3203 break; 3204 } 3205 if (error == 0 && so->so_proto->pr_ctloutput 
!= NULL) 3206 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 3207 } 3208 bad: 3209 CURVNET_RESTORE(); 3210 return (error); 3211 } 3212 3213 /* 3214 * Helper routine for getsockopt. 3215 */ 3216 int 3217 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 3218 { 3219 int error; 3220 size_t valsize; 3221 3222 error = 0; 3223 3224 /* 3225 * Documented get behavior is that we always return a value, possibly 3226 * truncated to fit in the user's buffer. Traditional behavior is 3227 * that we always tell the user precisely how much we copied, rather 3228 * than something useful like the total amount we had available for 3229 * her. Note that this interface is not idempotent; the entire 3230 * answer must be generated ahead of time. 3231 */ 3232 valsize = min(len, sopt->sopt_valsize); 3233 sopt->sopt_valsize = valsize; 3234 if (sopt->sopt_val != NULL) { 3235 if (sopt->sopt_td != NULL) 3236 error = copyout(buf, sopt->sopt_val, valsize); 3237 else 3238 bcopy(buf, sopt->sopt_val, valsize); 3239 } 3240 return (error); 3241 } 3242 3243 int 3244 sogetopt(struct socket *so, struct sockopt *sopt) 3245 { 3246 int error, optval; 3247 struct linger l; 3248 struct timeval tv; 3249 #ifdef MAC 3250 struct mac extmac; 3251 #endif 3252 3253 CURVNET_SET(so->so_vnet); 3254 error = 0; 3255 if (sopt->sopt_level != SOL_SOCKET) { 3256 if (so->so_proto->pr_ctloutput != NULL) 3257 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3258 else 3259 error = ENOPROTOOPT; 3260 CURVNET_RESTORE(); 3261 return (error); 3262 } else { 3263 switch (sopt->sopt_name) { 3264 case SO_ACCEPTFILTER: 3265 error = accept_filt_getopt(so, sopt); 3266 break; 3267 3268 case SO_LINGER: 3269 SOCK_LOCK(so); 3270 l.l_onoff = so->so_options & SO_LINGER; 3271 l.l_linger = so->so_linger; 3272 SOCK_UNLOCK(so); 3273 error = sooptcopyout(sopt, &l, sizeof l); 3274 break; 3275 3276 case SO_USELOOPBACK: 3277 case SO_DONTROUTE: 3278 case SO_DEBUG: 3279 case SO_KEEPALIVE: 3280 case SO_REUSEADDR: 3281 case SO_REUSEPORT: 3282 
case SO_REUSEPORT_LB: 3283 case SO_BROADCAST: 3284 case SO_OOBINLINE: 3285 case SO_ACCEPTCONN: 3286 case SO_TIMESTAMP: 3287 case SO_BINTIME: 3288 case SO_NOSIGPIPE: 3289 case SO_NO_DDP: 3290 case SO_NO_OFFLOAD: 3291 case SO_RERROR: 3292 optval = so->so_options & sopt->sopt_name; 3293 integer: 3294 error = sooptcopyout(sopt, &optval, sizeof optval); 3295 break; 3296 3297 case SO_DOMAIN: 3298 optval = so->so_proto->pr_domain->dom_family; 3299 goto integer; 3300 3301 case SO_TYPE: 3302 optval = so->so_type; 3303 goto integer; 3304 3305 case SO_PROTOCOL: 3306 optval = so->so_proto->pr_protocol; 3307 goto integer; 3308 3309 case SO_ERROR: 3310 SOCK_LOCK(so); 3311 if (so->so_error) { 3312 optval = so->so_error; 3313 so->so_error = 0; 3314 } else { 3315 optval = so->so_rerror; 3316 so->so_rerror = 0; 3317 } 3318 SOCK_UNLOCK(so); 3319 goto integer; 3320 3321 case SO_SNDBUF: 3322 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 3323 so->so_snd.sb_hiwat; 3324 goto integer; 3325 3326 case SO_RCVBUF: 3327 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 3328 so->so_rcv.sb_hiwat; 3329 goto integer; 3330 3331 case SO_SNDLOWAT: 3332 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 3333 so->so_snd.sb_lowat; 3334 goto integer; 3335 3336 case SO_RCVLOWAT: 3337 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 3338 so->so_rcv.sb_lowat; 3339 goto integer; 3340 3341 case SO_SNDTIMEO: 3342 case SO_RCVTIMEO: 3343 SOCK_LOCK(so); 3344 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 3345 (SOLISTENING(so) ? so->sol_sbsnd_timeo : 3346 so->so_snd.sb_timeo) : 3347 (SOLISTENING(so) ? 
so->sol_sbrcv_timeo : 3348 so->so_rcv.sb_timeo)); 3349 SOCK_UNLOCK(so); 3350 #ifdef COMPAT_FREEBSD32 3351 if (SV_CURPROC_FLAG(SV_ILP32)) { 3352 struct timeval32 tv32; 3353 3354 CP(tv, tv32, tv_sec); 3355 CP(tv, tv32, tv_usec); 3356 error = sooptcopyout(sopt, &tv32, sizeof tv32); 3357 } else 3358 #endif 3359 error = sooptcopyout(sopt, &tv, sizeof tv); 3360 break; 3361 3362 case SO_LABEL: 3363 #ifdef MAC 3364 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3365 sizeof(extmac)); 3366 if (error) 3367 goto bad; 3368 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 3369 so, &extmac); 3370 if (error) 3371 goto bad; 3372 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3373 #else 3374 error = EOPNOTSUPP; 3375 #endif 3376 break; 3377 3378 case SO_PEERLABEL: 3379 #ifdef MAC 3380 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3381 sizeof(extmac)); 3382 if (error) 3383 goto bad; 3384 error = mac_getsockopt_peerlabel( 3385 sopt->sopt_td->td_ucred, so, &extmac); 3386 if (error) 3387 goto bad; 3388 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3389 #else 3390 error = EOPNOTSUPP; 3391 #endif 3392 break; 3393 3394 case SO_LISTENQLIMIT: 3395 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 3396 goto integer; 3397 3398 case SO_LISTENQLEN: 3399 optval = SOLISTENING(so) ? so->sol_qlen : 0; 3400 goto integer; 3401 3402 case SO_LISTENINCQLEN: 3403 optval = SOLISTENING(so) ? 
so->sol_incqlen : 0; 3404 goto integer; 3405 3406 case SO_TS_CLOCK: 3407 optval = so->so_ts_clock; 3408 goto integer; 3409 3410 case SO_MAX_PACING_RATE: 3411 optval = so->so_max_pacing_rate; 3412 goto integer; 3413 3414 default: 3415 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3416 error = hhook_run_socket(so, sopt, 3417 HHOOK_SOCKET_OPT); 3418 else 3419 error = ENOPROTOOPT; 3420 break; 3421 } 3422 } 3423 #ifdef MAC 3424 bad: 3425 #endif 3426 CURVNET_RESTORE(); 3427 return (error); 3428 } 3429 3430 int 3431 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 3432 { 3433 struct mbuf *m, *m_prev; 3434 int sopt_size = sopt->sopt_valsize; 3435 3436 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3437 if (m == NULL) 3438 return ENOBUFS; 3439 if (sopt_size > MLEN) { 3440 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 3441 if ((m->m_flags & M_EXT) == 0) { 3442 m_free(m); 3443 return ENOBUFS; 3444 } 3445 m->m_len = min(MCLBYTES, sopt_size); 3446 } else { 3447 m->m_len = min(MLEN, sopt_size); 3448 } 3449 sopt_size -= m->m_len; 3450 *mp = m; 3451 m_prev = m; 3452 3453 while (sopt_size) { 3454 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3455 if (m == NULL) { 3456 m_freem(*mp); 3457 return ENOBUFS; 3458 } 3459 if (sopt_size > MLEN) { 3460 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 3461 M_NOWAIT); 3462 if ((m->m_flags & M_EXT) == 0) { 3463 m_freem(m); 3464 m_freem(*mp); 3465 return ENOBUFS; 3466 } 3467 m->m_len = min(MCLBYTES, sopt_size); 3468 } else { 3469 m->m_len = min(MLEN, sopt_size); 3470 } 3471 sopt_size -= m->m_len; 3472 m_prev->m_next = m; 3473 m_prev = m; 3474 } 3475 return (0); 3476 } 3477 3478 int 3479 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 3480 { 3481 struct mbuf *m0 = m; 3482 3483 if (sopt->sopt_val == NULL) 3484 return (0); 3485 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3486 if (sopt->sopt_td != NULL) { 3487 int error; 3488 3489 error = copyin(sopt->sopt_val, mtod(m, char *), 3490 m->m_len); 3491 if (error != 0) { 3492 m_freem(m0); 3493 return(error); 3494 } 3495 } else 3496 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 3497 sopt->sopt_valsize -= m->m_len; 3498 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3499 m = m->m_next; 3500 } 3501 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 3502 panic("ip6_sooptmcopyin"); 3503 return (0); 3504 } 3505 3506 int 3507 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 3508 { 3509 struct mbuf *m0 = m; 3510 size_t valsize = 0; 3511 3512 if (sopt->sopt_val == NULL) 3513 return (0); 3514 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3515 if (sopt->sopt_td != NULL) { 3516 int error; 3517 3518 error = copyout(mtod(m, char *), sopt->sopt_val, 3519 m->m_len); 3520 if (error != 0) { 3521 m_freem(m0); 3522 return(error); 3523 } 3524 } else 3525 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 3526 sopt->sopt_valsize -= m->m_len; 3527 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3528 valsize += m->m_len; 3529 m = m->m_next; 3530 } 3531 if (m != NULL) { 3532 /* enough soopt buffer should be given from user-land */ 3533 m_freem(m0); 3534 return(EINVAL); 3535 } 3536 sopt->sopt_valsize = valsize; 3537 return (0); 3538 } 3539 3540 /* 3541 * sohasoutofband(): protocol notifies socket layer of the arrival of new 3542 * 
out-of-band data, which will then notify socket consumers. 3543 */ 3544 void 3545 sohasoutofband(struct socket *so) 3546 { 3547 3548 if (so->so_sigio != NULL) 3549 pgsigio(&so->so_sigio, SIGURG, 0); 3550 selwakeuppri(&so->so_rdsel, PSOCK); 3551 } 3552 3553 int 3554 sopoll(struct socket *so, int events, struct ucred *active_cred, 3555 struct thread *td) 3556 { 3557 3558 /* 3559 * We do not need to set or assert curvnet as long as everyone uses 3560 * sopoll_generic(). 3561 */ 3562 return (so->so_proto->pr_sopoll(so, events, active_cred, td)); 3563 } 3564 3565 int 3566 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 3567 struct thread *td) 3568 { 3569 int revents; 3570 3571 SOCK_LOCK(so); 3572 if (SOLISTENING(so)) { 3573 if (!(events & (POLLIN | POLLRDNORM))) 3574 revents = 0; 3575 else if (!TAILQ_EMPTY(&so->sol_comp)) 3576 revents = events & (POLLIN | POLLRDNORM); 3577 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 3578 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 3579 else { 3580 selrecord(td, &so->so_rdsel); 3581 revents = 0; 3582 } 3583 } else { 3584 revents = 0; 3585 SOCK_SENDBUF_LOCK(so); 3586 SOCK_RECVBUF_LOCK(so); 3587 if (events & (POLLIN | POLLRDNORM)) 3588 if (soreadabledata(so)) 3589 revents |= events & (POLLIN | POLLRDNORM); 3590 if (events & (POLLOUT | POLLWRNORM)) 3591 if (sowriteable(so)) 3592 revents |= events & (POLLOUT | POLLWRNORM); 3593 if (events & (POLLPRI | POLLRDBAND)) 3594 if (so->so_oobmark || 3595 (so->so_rcv.sb_state & SBS_RCVATMARK)) 3596 revents |= events & (POLLPRI | POLLRDBAND); 3597 if ((events & POLLINIGNEOF) == 0) { 3598 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3599 revents |= events & (POLLIN | POLLRDNORM); 3600 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 3601 revents |= POLLHUP; 3602 } 3603 } 3604 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 3605 revents |= events & POLLRDHUP; 3606 if (revents == 0) { 3607 if (events & 3608 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { 
3609 selrecord(td, &so->so_rdsel); 3610 so->so_rcv.sb_flags |= SB_SEL; 3611 } 3612 if (events & (POLLOUT | POLLWRNORM)) { 3613 selrecord(td, &so->so_wrsel); 3614 so->so_snd.sb_flags |= SB_SEL; 3615 } 3616 } 3617 SOCK_RECVBUF_UNLOCK(so); 3618 SOCK_SENDBUF_UNLOCK(so); 3619 } 3620 SOCK_UNLOCK(so); 3621 return (revents); 3622 } 3623 3624 int 3625 soo_kqfilter(struct file *fp, struct knote *kn) 3626 { 3627 struct socket *so = kn->kn_fp->f_data; 3628 struct sockbuf *sb; 3629 sb_which which; 3630 struct knlist *knl; 3631 3632 switch (kn->kn_filter) { 3633 case EVFILT_READ: 3634 kn->kn_fop = &soread_filtops; 3635 knl = &so->so_rdsel.si_note; 3636 sb = &so->so_rcv; 3637 which = SO_RCV; 3638 break; 3639 case EVFILT_WRITE: 3640 kn->kn_fop = &sowrite_filtops; 3641 knl = &so->so_wrsel.si_note; 3642 sb = &so->so_snd; 3643 which = SO_SND; 3644 break; 3645 case EVFILT_EMPTY: 3646 kn->kn_fop = &soempty_filtops; 3647 knl = &so->so_wrsel.si_note; 3648 sb = &so->so_snd; 3649 which = SO_SND; 3650 break; 3651 default: 3652 return (EINVAL); 3653 } 3654 3655 SOCK_LOCK(so); 3656 if (SOLISTENING(so)) { 3657 knlist_add(knl, kn, 1); 3658 } else { 3659 SOCK_BUF_LOCK(so, which); 3660 knlist_add(knl, kn, 1); 3661 sb->sb_flags |= SB_KNOTE; 3662 SOCK_BUF_UNLOCK(so, which); 3663 } 3664 SOCK_UNLOCK(so); 3665 return (0); 3666 } 3667 3668 static void 3669 filt_sordetach(struct knote *kn) 3670 { 3671 struct socket *so = kn->kn_fp->f_data; 3672 3673 so_rdknl_lock(so); 3674 knlist_remove(&so->so_rdsel.si_note, kn, 1); 3675 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 3676 so->so_rcv.sb_flags &= ~SB_KNOTE; 3677 so_rdknl_unlock(so); 3678 } 3679 3680 /*ARGSUSED*/ 3681 static int 3682 filt_soread(struct knote *kn, long hint) 3683 { 3684 struct socket *so; 3685 3686 so = kn->kn_fp->f_data; 3687 3688 if (SOLISTENING(so)) { 3689 SOCK_LOCK_ASSERT(so); 3690 kn->kn_data = so->sol_qlen; 3691 if (so->so_error) { 3692 kn->kn_flags |= EV_EOF; 3693 kn->kn_fflags = so->so_error; 3694 return (1); 3695 } 
3696 return (!TAILQ_EMPTY(&so->sol_comp)); 3697 } 3698 3699 SOCK_RECVBUF_LOCK_ASSERT(so); 3700 3701 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 3702 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3703 kn->kn_flags |= EV_EOF; 3704 kn->kn_fflags = so->so_error; 3705 return (1); 3706 } else if (so->so_error || so->so_rerror) 3707 return (1); 3708 3709 if (kn->kn_sfflags & NOTE_LOWAT) { 3710 if (kn->kn_data >= kn->kn_sdata) 3711 return (1); 3712 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 3713 return (1); 3714 3715 /* This hook returning non-zero indicates an event, not error */ 3716 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 3717 } 3718 3719 static void 3720 filt_sowdetach(struct knote *kn) 3721 { 3722 struct socket *so = kn->kn_fp->f_data; 3723 3724 so_wrknl_lock(so); 3725 knlist_remove(&so->so_wrsel.si_note, kn, 1); 3726 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 3727 so->so_snd.sb_flags &= ~SB_KNOTE; 3728 so_wrknl_unlock(so); 3729 } 3730 3731 /*ARGSUSED*/ 3732 static int 3733 filt_sowrite(struct knote *kn, long hint) 3734 { 3735 struct socket *so; 3736 3737 so = kn->kn_fp->f_data; 3738 3739 if (SOLISTENING(so)) 3740 return (0); 3741 3742 SOCK_SENDBUF_LOCK_ASSERT(so); 3743 kn->kn_data = sbspace(&so->so_snd); 3744 3745 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 3746 3747 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3748 kn->kn_flags |= EV_EOF; 3749 kn->kn_fflags = so->so_error; 3750 return (1); 3751 } else if (so->so_error) /* temporary udp error */ 3752 return (1); 3753 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3754 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3755 return (0); 3756 else if (kn->kn_sfflags & NOTE_LOWAT) 3757 return (kn->kn_data >= kn->kn_sdata); 3758 else 3759 return (kn->kn_data >= so->so_snd.sb_lowat); 3760 } 3761 3762 static int 3763 filt_soempty(struct knote *kn, long hint) 3764 { 3765 struct socket *so; 3766 3767 so = kn->kn_fp->f_data; 3768 3769 if (SOLISTENING(so)) 3770 return 
(1); 3771 3772 SOCK_SENDBUF_LOCK_ASSERT(so); 3773 kn->kn_data = sbused(&so->so_snd); 3774 3775 if (kn->kn_data == 0) 3776 return (1); 3777 else 3778 return (0); 3779 } 3780 3781 int 3782 socheckuid(struct socket *so, uid_t uid) 3783 { 3784 3785 if (so == NULL) 3786 return (EPERM); 3787 if (so->so_cred->cr_uid != uid) 3788 return (EPERM); 3789 return (0); 3790 } 3791 3792 /* 3793 * These functions are used by protocols to notify the socket layer (and its 3794 * consumers) of state changes in the sockets driven by protocol-side events. 3795 */ 3796 3797 /* 3798 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3799 * 3800 * Normal sequence from the active (originating) side is that 3801 * soisconnecting() is called during processing of connect() call, resulting 3802 * in an eventual call to soisconnected() if/when the connection is 3803 * established. When the connection is torn down soisdisconnecting() is 3804 * called during processing of disconnect() call, and soisdisconnected() is 3805 * called when the connection to the peer is totally severed. The semantics 3806 * of these routines are such that connectionless protocols can call 3807 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3808 * calls when setting up a ``connection'' takes no time. 3809 * 3810 * From the passive side, a socket is created with two queues of sockets: 3811 * so_incomp for connections in progress and so_comp for connections already 3812 * made and awaiting user acceptance. As a protocol is preparing incoming 3813 * connections, it creates a socket structure queued on so_incomp by calling 3814 * sonewconn(). When the connection is established, soisconnected() is 3815 * called, and transfers the socket structure to so_comp, making it available 3816 * to accept(). 3817 * 3818 * If a socket is closed with sockets on either so_incomp or so_comp, these 3819 * sockets are dropped. 
3820 * 3821 * If higher-level protocols are implemented in the kernel, the wakeups done 3822 * here will sometimes cause software-interrupt process scheduling. 3823 */ 3824 void 3825 soisconnecting(struct socket *so) 3826 { 3827 3828 SOCK_LOCK(so); 3829 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3830 so->so_state |= SS_ISCONNECTING; 3831 SOCK_UNLOCK(so); 3832 } 3833 3834 void 3835 soisconnected(struct socket *so) 3836 { 3837 bool last __diagused; 3838 3839 SOCK_LOCK(so); 3840 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3841 so->so_state |= SS_ISCONNECTED; 3842 3843 if (so->so_qstate == SQ_INCOMP) { 3844 struct socket *head = so->so_listen; 3845 int ret; 3846 3847 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 3848 /* 3849 * Promoting a socket from incomplete queue to complete, we 3850 * need to go through reverse order of locking. We first do 3851 * trylock, and if that doesn't succeed, we go the hard way 3852 * leaving a reference and rechecking consistency after proper 3853 * locking. 3854 */ 3855 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 3856 soref(head); 3857 SOCK_UNLOCK(so); 3858 SOLISTEN_LOCK(head); 3859 SOCK_LOCK(so); 3860 if (__predict_false(head != so->so_listen)) { 3861 /* 3862 * The socket went off the listen queue, 3863 * should be lost race to close(2) of sol. 3864 * The socket is about to soabort(). 
3865 */ 3866 SOCK_UNLOCK(so); 3867 sorele_locked(head); 3868 return; 3869 } 3870 last = refcount_release(&head->so_count); 3871 KASSERT(!last, ("%s: released last reference for %p", 3872 __func__, head)); 3873 } 3874 again: 3875 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3876 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 3877 head->sol_incqlen--; 3878 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 3879 head->sol_qlen++; 3880 so->so_qstate = SQ_COMP; 3881 SOCK_UNLOCK(so); 3882 solisten_wakeup(head); /* unlocks */ 3883 } else { 3884 SOCK_RECVBUF_LOCK(so); 3885 soupcall_set(so, SO_RCV, 3886 head->sol_accept_filter->accf_callback, 3887 head->sol_accept_filter_arg); 3888 so->so_options &= ~SO_ACCEPTFILTER; 3889 ret = head->sol_accept_filter->accf_callback(so, 3890 head->sol_accept_filter_arg, M_NOWAIT); 3891 if (ret == SU_ISCONNECTED) { 3892 soupcall_clear(so, SO_RCV); 3893 SOCK_RECVBUF_UNLOCK(so); 3894 goto again; 3895 } 3896 SOCK_RECVBUF_UNLOCK(so); 3897 SOCK_UNLOCK(so); 3898 SOLISTEN_UNLOCK(head); 3899 } 3900 return; 3901 } 3902 SOCK_UNLOCK(so); 3903 wakeup(&so->so_timeo); 3904 sorwakeup(so); 3905 sowwakeup(so); 3906 } 3907 3908 void 3909 soisdisconnecting(struct socket *so) 3910 { 3911 3912 SOCK_LOCK(so); 3913 so->so_state &= ~SS_ISCONNECTING; 3914 so->so_state |= SS_ISDISCONNECTING; 3915 3916 if (!SOLISTENING(so)) { 3917 SOCK_RECVBUF_LOCK(so); 3918 socantrcvmore_locked(so); 3919 SOCK_SENDBUF_LOCK(so); 3920 socantsendmore_locked(so); 3921 } 3922 SOCK_UNLOCK(so); 3923 wakeup(&so->so_timeo); 3924 } 3925 3926 void 3927 soisdisconnected(struct socket *so) 3928 { 3929 3930 SOCK_LOCK(so); 3931 3932 /* 3933 * There is at least one reader of so_state that does not 3934 * acquire socket lock, namely soreceive_generic(). Ensure 3935 * that it never sees all flags that track connection status 3936 * cleared, by ordering the update with a barrier semantic of 3937 * our release thread fence. 
3938 */ 3939 so->so_state |= SS_ISDISCONNECTED; 3940 atomic_thread_fence_rel(); 3941 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3942 3943 if (!SOLISTENING(so)) { 3944 SOCK_UNLOCK(so); 3945 SOCK_RECVBUF_LOCK(so); 3946 socantrcvmore_locked(so); 3947 SOCK_SENDBUF_LOCK(so); 3948 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 3949 socantsendmore_locked(so); 3950 } else 3951 SOCK_UNLOCK(so); 3952 wakeup(&so->so_timeo); 3953 } 3954 3955 int 3956 soiolock(struct socket *so, struct sx *sx, int flags) 3957 { 3958 int error; 3959 3960 KASSERT((flags & SBL_VALID) == flags, 3961 ("soiolock: invalid flags %#x", flags)); 3962 3963 if ((flags & SBL_WAIT) != 0) { 3964 if ((flags & SBL_NOINTR) != 0) { 3965 sx_xlock(sx); 3966 } else { 3967 error = sx_xlock_sig(sx); 3968 if (error != 0) 3969 return (error); 3970 } 3971 } else if (!sx_try_xlock(sx)) { 3972 return (EWOULDBLOCK); 3973 } 3974 3975 if (__predict_false(SOLISTENING(so))) { 3976 sx_xunlock(sx); 3977 return (ENOTCONN); 3978 } 3979 return (0); 3980 } 3981 3982 void 3983 soiounlock(struct sx *sx) 3984 { 3985 sx_xunlock(sx); 3986 } 3987 3988 /* 3989 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3990 */ 3991 struct sockaddr * 3992 sodupsockaddr(const struct sockaddr *sa, int mflags) 3993 { 3994 struct sockaddr *sa2; 3995 3996 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3997 if (sa2) 3998 bcopy(sa, sa2, sa->sa_len); 3999 return sa2; 4000 } 4001 4002 /* 4003 * Register per-socket destructor. 4004 */ 4005 void 4006 sodtor_set(struct socket *so, so_dtor_t *func) 4007 { 4008 4009 SOCK_LOCK_ASSERT(so); 4010 so->so_dtor = func; 4011 } 4012 4013 /* 4014 * Register per-socket buffer upcalls. 
4015 */ 4016 void 4017 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) 4018 { 4019 struct sockbuf *sb; 4020 4021 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4022 4023 switch (which) { 4024 case SO_RCV: 4025 sb = &so->so_rcv; 4026 break; 4027 case SO_SND: 4028 sb = &so->so_snd; 4029 break; 4030 } 4031 SOCK_BUF_LOCK_ASSERT(so, which); 4032 sb->sb_upcall = func; 4033 sb->sb_upcallarg = arg; 4034 sb->sb_flags |= SB_UPCALL; 4035 } 4036 4037 void 4038 soupcall_clear(struct socket *so, sb_which which) 4039 { 4040 struct sockbuf *sb; 4041 4042 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4043 4044 switch (which) { 4045 case SO_RCV: 4046 sb = &so->so_rcv; 4047 break; 4048 case SO_SND: 4049 sb = &so->so_snd; 4050 break; 4051 } 4052 SOCK_BUF_LOCK_ASSERT(so, which); 4053 KASSERT(sb->sb_upcall != NULL, 4054 ("%s: so %p no upcall to clear", __func__, so)); 4055 sb->sb_upcall = NULL; 4056 sb->sb_upcallarg = NULL; 4057 sb->sb_flags &= ~SB_UPCALL; 4058 } 4059 4060 void 4061 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) 4062 { 4063 4064 SOLISTEN_LOCK_ASSERT(so); 4065 so->sol_upcall = func; 4066 so->sol_upcallarg = arg; 4067 } 4068 4069 static void 4070 so_rdknl_lock(void *arg) 4071 { 4072 struct socket *so = arg; 4073 4074 retry: 4075 if (SOLISTENING(so)) { 4076 SOLISTEN_LOCK(so); 4077 } else { 4078 SOCK_RECVBUF_LOCK(so); 4079 if (__predict_false(SOLISTENING(so))) { 4080 SOCK_RECVBUF_UNLOCK(so); 4081 goto retry; 4082 } 4083 } 4084 } 4085 4086 static void 4087 so_rdknl_unlock(void *arg) 4088 { 4089 struct socket *so = arg; 4090 4091 if (SOLISTENING(so)) 4092 SOLISTEN_UNLOCK(so); 4093 else 4094 SOCK_RECVBUF_UNLOCK(so); 4095 } 4096 4097 static void 4098 so_rdknl_assert_lock(void *arg, int what) 4099 { 4100 struct socket *so = arg; 4101 4102 if (what == LA_LOCKED) { 4103 if (SOLISTENING(so)) 4104 SOLISTEN_LOCK_ASSERT(so); 4105 else 4106 SOCK_RECVBUF_LOCK_ASSERT(so); 4107 } else { 4108 if 
(SOLISTENING(so)) 4109 SOLISTEN_UNLOCK_ASSERT(so); 4110 else 4111 SOCK_RECVBUF_UNLOCK_ASSERT(so); 4112 } 4113 } 4114 4115 static void 4116 so_wrknl_lock(void *arg) 4117 { 4118 struct socket *so = arg; 4119 4120 retry: 4121 if (SOLISTENING(so)) { 4122 SOLISTEN_LOCK(so); 4123 } else { 4124 SOCK_SENDBUF_LOCK(so); 4125 if (__predict_false(SOLISTENING(so))) { 4126 SOCK_SENDBUF_UNLOCK(so); 4127 goto retry; 4128 } 4129 } 4130 } 4131 4132 static void 4133 so_wrknl_unlock(void *arg) 4134 { 4135 struct socket *so = arg; 4136 4137 if (SOLISTENING(so)) 4138 SOLISTEN_UNLOCK(so); 4139 else 4140 SOCK_SENDBUF_UNLOCK(so); 4141 } 4142 4143 static void 4144 so_wrknl_assert_lock(void *arg, int what) 4145 { 4146 struct socket *so = arg; 4147 4148 if (what == LA_LOCKED) { 4149 if (SOLISTENING(so)) 4150 SOLISTEN_LOCK_ASSERT(so); 4151 else 4152 SOCK_SENDBUF_LOCK_ASSERT(so); 4153 } else { 4154 if (SOLISTENING(so)) 4155 SOLISTEN_UNLOCK_ASSERT(so); 4156 else 4157 SOCK_SENDBUF_UNLOCK_ASSERT(so); 4158 } 4159 } 4160 4161 /* 4162 * Create an external-format (``xsocket'') structure using the information in 4163 * the kernel-format socket structure pointed to by so. This is done to 4164 * reduce the spew of irrelevant information over this interface, to isolate 4165 * user code from changes in the kernel structure, and potentially to provide 4166 * information-hiding if we decide that some of this information should be 4167 * hidden from users. 
4168 */ 4169 void 4170 sotoxsocket(struct socket *so, struct xsocket *xso) 4171 { 4172 4173 bzero(xso, sizeof(*xso)); 4174 xso->xso_len = sizeof *xso; 4175 xso->xso_so = (uintptr_t)so; 4176 xso->so_type = so->so_type; 4177 xso->so_options = so->so_options; 4178 xso->so_linger = so->so_linger; 4179 xso->so_state = so->so_state; 4180 xso->so_pcb = (uintptr_t)so->so_pcb; 4181 xso->xso_protocol = so->so_proto->pr_protocol; 4182 xso->xso_family = so->so_proto->pr_domain->dom_family; 4183 xso->so_timeo = so->so_timeo; 4184 xso->so_error = so->so_error; 4185 xso->so_uid = so->so_cred->cr_uid; 4186 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 4187 if (SOLISTENING(so)) { 4188 xso->so_qlen = so->sol_qlen; 4189 xso->so_incqlen = so->sol_incqlen; 4190 xso->so_qlimit = so->sol_qlimit; 4191 xso->so_oobmark = 0; 4192 } else { 4193 xso->so_state |= so->so_qstate; 4194 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; 4195 xso->so_oobmark = so->so_oobmark; 4196 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 4197 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 4198 } 4199 } 4200 4201 struct sockbuf * 4202 so_sockbuf_rcv(struct socket *so) 4203 { 4204 4205 return (&so->so_rcv); 4206 } 4207 4208 struct sockbuf * 4209 so_sockbuf_snd(struct socket *so) 4210 { 4211 4212 return (&so->so_snd); 4213 } 4214 4215 int 4216 so_state_get(const struct socket *so) 4217 { 4218 4219 return (so->so_state); 4220 } 4221 4222 void 4223 so_state_set(struct socket *so, int val) 4224 { 4225 4226 so->so_state = val; 4227 } 4228 4229 int 4230 so_options_get(const struct socket *so) 4231 { 4232 4233 return (so->so_options); 4234 } 4235 4236 void 4237 so_options_set(struct socket *so, int val) 4238 { 4239 4240 so->so_options = val; 4241 } 4242 4243 int 4244 so_error_get(const struct socket *so) 4245 { 4246 4247 return (so->so_error); 4248 } 4249 4250 void 4251 so_error_set(struct socket *so, int val) 4252 { 4253 4254 so->so_error = val; 4255 } 4256 4257 int 4258 so_linger_get(const struct socket *so) 4259 { 
4260 4261 return (so->so_linger); 4262 } 4263 4264 void 4265 so_linger_set(struct socket *so, int val) 4266 { 4267 4268 KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), 4269 ("%s: val %d out of range", __func__, val)); 4270 4271 so->so_linger = val; 4272 } 4273 4274 struct protosw * 4275 so_protosw_get(const struct socket *so) 4276 { 4277 4278 return (so->so_proto); 4279 } 4280 4281 void 4282 so_protosw_set(struct socket *so, struct protosw *val) 4283 { 4284 4285 so->so_proto = val; 4286 } 4287 4288 void 4289 so_sorwakeup(struct socket *so) 4290 { 4291 4292 sorwakeup(so); 4293 } 4294 4295 void 4296 so_sowwakeup(struct socket *so) 4297 { 4298 4299 sowwakeup(so); 4300 } 4301 4302 void 4303 so_sorwakeup_locked(struct socket *so) 4304 { 4305 4306 sorwakeup_locked(so); 4307 } 4308 4309 void 4310 so_sowwakeup_locked(struct socket *so) 4311 { 4312 4313 sowwakeup_locked(so); 4314 } 4315 4316 void 4317 so_lock(struct socket *so) 4318 { 4319 4320 SOCK_LOCK(so); 4321 } 4322 4323 void 4324 so_unlock(struct socket *so) 4325 { 4326 4327 SOCK_UNLOCK(so); 4328 } 4329