/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used to attempt to
 * free a socket when a reference is removed.  This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided; a short sketch
 * of their use appears at the end of this comment.
 *
 * NOTE: With regard to VNETs, the general rule is that callers do not set
 * curvnet.  Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
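 *
 * Expanding on the soref()/sorele() note above, the following is a minimal
 * sketch (illustrative only, not code taken from this file) of how a
 * consumer might hold an extra reference across a window where its other
 * references could be dropped:
 *
 *	soref(so);			(take an additional reference)
 *	... operate on the socket ...
 *	SOCK_LOCK(so);
 *	sorele_locked(so);		(drop it; unlocks, may free the socket)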
103 */ 104 105 #include <sys/cdefs.h> 106 __FBSDID("$FreeBSD$"); 107 108 #include "opt_inet.h" 109 #include "opt_inet6.h" 110 #include "opt_kern_tls.h" 111 #include "opt_sctp.h" 112 113 #include <sys/param.h> 114 #include <sys/systm.h> 115 #include <sys/capsicum.h> 116 #include <sys/fcntl.h> 117 #include <sys/limits.h> 118 #include <sys/lock.h> 119 #include <sys/mac.h> 120 #include <sys/malloc.h> 121 #include <sys/mbuf.h> 122 #include <sys/mutex.h> 123 #include <sys/domain.h> 124 #include <sys/file.h> /* for struct knote */ 125 #include <sys/hhook.h> 126 #include <sys/kernel.h> 127 #include <sys/khelp.h> 128 #include <sys/ktls.h> 129 #include <sys/event.h> 130 #include <sys/eventhandler.h> 131 #include <sys/poll.h> 132 #include <sys/proc.h> 133 #include <sys/protosw.h> 134 #include <sys/sbuf.h> 135 #include <sys/socket.h> 136 #include <sys/socketvar.h> 137 #include <sys/resourcevar.h> 138 #include <net/route.h> 139 #include <sys/signalvar.h> 140 #include <sys/stat.h> 141 #include <sys/sx.h> 142 #include <sys/sysctl.h> 143 #include <sys/taskqueue.h> 144 #include <sys/uio.h> 145 #include <sys/un.h> 146 #include <sys/unpcb.h> 147 #include <sys/jail.h> 148 #include <sys/syslog.h> 149 #include <netinet/in.h> 150 #include <netinet/in_pcb.h> 151 #include <netinet/tcp.h> 152 153 #include <net/vnet.h> 154 155 #include <security/mac/mac_framework.h> 156 157 #include <vm/uma.h> 158 159 #ifdef COMPAT_FREEBSD32 160 #include <sys/mount.h> 161 #include <sys/sysent.h> 162 #include <compat/freebsd32/freebsd32.h> 163 #endif 164 165 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 166 int flags); 167 static void so_rdknl_lock(void *); 168 static void so_rdknl_unlock(void *); 169 static void so_rdknl_assert_lock(void *, int); 170 static void so_wrknl_lock(void *); 171 static void so_wrknl_unlock(void *); 172 static void so_wrknl_assert_lock(void *, int); 173 174 static void filt_sordetach(struct knote *kn); 175 static int filt_soread(struct knote *kn, long hint); 176 static void filt_sowdetach(struct knote *kn); 177 static int filt_sowrite(struct knote *kn, long hint); 178 static int filt_soempty(struct knote *kn, long hint); 179 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); 180 fo_kqfilter_t soo_kqfilter; 181 182 static struct filterops soread_filtops = { 183 .f_isfd = 1, 184 .f_detach = filt_sordetach, 185 .f_event = filt_soread, 186 }; 187 static struct filterops sowrite_filtops = { 188 .f_isfd = 1, 189 .f_detach = filt_sowdetach, 190 .f_event = filt_sowrite, 191 }; 192 static struct filterops soempty_filtops = { 193 .f_isfd = 1, 194 .f_detach = filt_sowdetach, 195 .f_event = filt_soempty, 196 }; 197 198 so_gen_t so_gencnt; /* generation count for sockets */ 199 200 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 201 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 202 203 #define VNET_SO_ASSERT(so) \ 204 VNET_ASSERT(curvnet != NULL, \ 205 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 206 207 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 208 #define V_socket_hhh VNET(socket_hhh) 209 210 /* 211 * Limit on the number of connections in the listen queue waiting 212 * for accept(2). 213 * NB: The original sysctl somaxconn is still available but hidden 214 * to prevent confusion about the actual purpose of this number. 
215 */ 216 static u_int somaxconn = SOMAXCONN; 217 218 static int 219 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 220 { 221 int error; 222 int val; 223 224 val = somaxconn; 225 error = sysctl_handle_int(oidp, &val, 0, req); 226 if (error || !req->newptr ) 227 return (error); 228 229 /* 230 * The purpose of the UINT_MAX / 3 limit, is so that the formula 231 * 3 * so_qlimit / 2 232 * below, will not overflow. 233 */ 234 235 if (val < 1 || val > UINT_MAX / 3) 236 return (EINVAL); 237 238 somaxconn = val; 239 return (0); 240 } 241 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, 242 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), 243 sysctl_somaxconn, "I", 244 "Maximum listen socket pending connection accept queue size"); 245 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 246 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, 247 sizeof(int), sysctl_somaxconn, "I", 248 "Maximum listen socket pending connection accept queue size (compat)"); 249 250 static int numopensockets; 251 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 252 &numopensockets, 0, "Number of open sockets"); 253 254 /* 255 * accept_mtx locks down per-socket fields relating to accept queues. See 256 * socketvar.h for an annotation of the protected fields of struct socket. 257 */ 258 struct mtx accept_mtx; 259 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 260 261 /* 262 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 263 * so_gencnt field. 264 */ 265 static struct mtx so_global_mtx; 266 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 267 268 /* 269 * General IPC sysctl name space, used by sockets and a variety of other IPC 270 * types. 271 */ 272 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 273 "IPC"); 274 275 /* 276 * Initialize the socket subsystem and set up the socket 277 * memory allocator. 
278 */ 279 static uma_zone_t socket_zone; 280 int maxsockets; 281 282 static void 283 socket_zone_change(void *tag) 284 { 285 286 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 287 } 288 289 static void 290 socket_hhook_register(int subtype) 291 { 292 293 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 294 &V_socket_hhh[subtype], 295 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 296 printf("%s: WARNING: unable to register hook\n", __func__); 297 } 298 299 static void 300 socket_hhook_deregister(int subtype) 301 { 302 303 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 304 printf("%s: WARNING: unable to deregister hook\n", __func__); 305 } 306 307 static void 308 socket_init(void *tag) 309 { 310 311 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 312 NULL, NULL, UMA_ALIGN_PTR, 0); 313 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 314 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 315 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 316 EVENTHANDLER_PRI_FIRST); 317 } 318 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 319 320 static void 321 socket_vnet_init(const void *unused __unused) 322 { 323 int i; 324 325 /* We expect a contiguous range */ 326 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 327 socket_hhook_register(i); 328 } 329 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 330 socket_vnet_init, NULL); 331 332 static void 333 socket_vnet_uninit(const void *unused __unused) 334 { 335 int i; 336 337 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 338 socket_hhook_deregister(i); 339 } 340 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 341 socket_vnet_uninit, NULL); 342 343 /* 344 * Initialise maxsockets. This SYSINIT must be run after 345 * tunable_mbinit(). 346 */ 347 static void 348 init_maxsockets(void *ignored) 349 { 350 351 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 352 maxsockets = imax(maxsockets, maxfiles); 353 } 354 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 355 356 /* 357 * Sysctl to get and set the maximum global sockets limit. Notify protocols 358 * of the change so that they can update their dependent limits as required. 359 */ 360 static int 361 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 362 { 363 int error, newmaxsockets; 364 365 newmaxsockets = maxsockets; 366 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 367 if (error == 0 && req->newptr && newmaxsockets != maxsockets) { 368 if (newmaxsockets > maxsockets && 369 newmaxsockets <= maxfiles) { 370 maxsockets = newmaxsockets; 371 EVENTHANDLER_INVOKE(maxsockets_change); 372 } else 373 error = EINVAL; 374 } 375 return (error); 376 } 377 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, 378 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0, 379 sysctl_maxsockets, "IU", 380 "Maximum number of sockets available"); 381 382 /* 383 * Socket operation routines. These routines are called by the routines in 384 * sys_socket.c or from a system process, and implement the semantics of 385 * socket operations by switching out to the protocol specific routines. 386 */ 387 388 /* 389 * Get a socket structure from our zone, and initialize it. Note that it 390 * would probably be better to allocate socket and PCB at the same time, but 391 * I'm not convinced that all the protocols can be easily modified to do 392 * this. 393 * 394 * soalloc() returns a socket with a ref count of 0. 
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}

	/*
	 * The socket locking protocol allows to lock 2 sockets at a time,
	 * however, the first one must be a listening socket.  WITNESS lacks
	 * a feature to change class of an existing lock, so we use DUPOK.
	 */
	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
	mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
	mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
	so->so_rcv.sb_sel = &so->so_rdsel;
	so->so_snd.sb_sel = &so->so_wrsel;
	sx_init(&so->so_snd_sx, "so_snd_sx");
	sx_init(&so->so_rcv_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_snd.sb_aiojobq);
	TAILQ_INIT(&so->so_rcv.sb_aiojobq);
	TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
	TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
#ifdef VIMAGE
	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet = vnet;
#endif
	/* We shouldn't need the so_global_mtx */
	if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
		/* Do we need more comprehensive error returns? */
		uma_zfree(socket_zone, so);
		return (NULL);
	}
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	vnet->vnet_sockcnt++;
#endif
	mtx_unlock(&so_global_mtx);

	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
#ifdef MAC
	mac_socket_destroy(so);
#endif
	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);

	khelp_destroy_osd(&so->osd);
	if (SOLISTENING(so)) {
		if (so->sol_accept_filter != NULL)
			accept_filt_setopt(so, NULL);
	} else {
		if (so->so_rcv.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
		if (so->so_snd.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
		sx_destroy(&so->so_snd_sx);
		sx_destroy(&so->so_rcv_sx);
		mtx_destroy(&so->so_snd_mtx);
		mtx_destroy(&so->so_rcv_mtx);
	}
	crfree(so->so_cred);
	mtx_destroy(&so->so_lock);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1 and a file descriptor
 * reference.  The socket should be closed with soclose().
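 *
 * A minimal usage sketch (illustrative only, not code from this file;
 * 'td' is assumed to be the calling thread):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... use the socket ...
 *	error = soclose(so);	(releases the reference taken by socreate())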
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL) {
		/* No support for domain. */
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		/* No support for socket type. */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}

	MPASS(prp->pr_attach);

	if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0)
		return (ECAPMODE);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	if ((prp->pr_flags & PR_SOCKBUF) == 0) {
		so->so_snd.sb_mtx = &so->so_snd_mtx;
		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
	}
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = prp->pr_attach(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		sodealloc(so);
		return (error);
	}
	soref(so);
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");

/*
 * When an attempt at a new connection is noted on a socket which supports
 * accept(2), the protocol has two options:
 * 1) Call legacy sonewconn() function, which would call protocol attach
 *    method, same as used for socket(2).
 * 2) Call solisten_clone(), do attach that is specific to a cloned
 *    connection, and then call solisten_enqueue().
 *
 * Note: the ref count on the socket is 0 on return.
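 *
 * A rough sketch of option 2 from the protocol's side (my_proto_attach() is
 * a hypothetical, protocol-specific attach step, not a function in this
 * file):
 *
 *	so = solisten_clone(head);
 *	if (so == NULL)
 *		return;
 *	if (my_proto_attach(so) != 0) {
 *		sodealloc(so);
 *		return;
 *	}
 *	solisten_enqueue(so, SS_ISCONNECTED);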
597 */ 598 struct socket * 599 solisten_clone(struct socket *head) 600 { 601 struct sbuf descrsb; 602 struct socket *so; 603 int len, overcount; 604 u_int qlen; 605 const char localprefix[] = "local:"; 606 char descrbuf[SUNPATHLEN + sizeof(localprefix)]; 607 #if defined(INET6) 608 char addrbuf[INET6_ADDRSTRLEN]; 609 #elif defined(INET) 610 char addrbuf[INET_ADDRSTRLEN]; 611 #endif 612 bool dolog, over; 613 614 SOLISTEN_LOCK(head); 615 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 616 #ifdef REGRESSION 617 if (regression_sonewconn_earlytest && over) { 618 #else 619 if (over) { 620 #endif 621 head->sol_overcount++; 622 dolog = !!ratecheck(&head->sol_lastover, &overinterval); 623 624 /* 625 * If we're going to log, copy the overflow count and queue 626 * length from the listen socket before dropping the lock. 627 * Also, reset the overflow count. 628 */ 629 if (dolog) { 630 overcount = head->sol_overcount; 631 head->sol_overcount = 0; 632 qlen = head->sol_qlen; 633 } 634 SOLISTEN_UNLOCK(head); 635 636 if (dolog) { 637 /* 638 * Try to print something descriptive about the 639 * socket for the error message. 640 */ 641 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), 642 SBUF_FIXEDLEN); 643 switch (head->so_proto->pr_domain->dom_family) { 644 #if defined(INET) || defined(INET6) 645 #ifdef INET 646 case AF_INET: 647 #endif 648 #ifdef INET6 649 case AF_INET6: 650 if (head->so_proto->pr_domain->dom_family == 651 AF_INET6 || 652 (sotoinpcb(head)->inp_inc.inc_flags & 653 INC_ISIPV6)) { 654 ip6_sprintf(addrbuf, 655 &sotoinpcb(head)->inp_inc.inc6_laddr); 656 sbuf_printf(&descrsb, "[%s]", addrbuf); 657 } else 658 #endif 659 { 660 #ifdef INET 661 inet_ntoa_r( 662 sotoinpcb(head)->inp_inc.inc_laddr, 663 addrbuf); 664 sbuf_cat(&descrsb, addrbuf); 665 #endif 666 } 667 sbuf_printf(&descrsb, ":%hu (proto %u)", 668 ntohs(sotoinpcb(head)->inp_inc.inc_lport), 669 head->so_proto->pr_protocol); 670 break; 671 #endif /* INET || INET6 */ 672 case AF_UNIX: 673 sbuf_cat(&descrsb, localprefix); 674 if (sotounpcb(head)->unp_addr != NULL) 675 len = 676 sotounpcb(head)->unp_addr->sun_len - 677 offsetof(struct sockaddr_un, 678 sun_path); 679 else 680 len = 0; 681 if (len > 0) 682 sbuf_bcat(&descrsb, 683 sotounpcb(head)->unp_addr->sun_path, 684 len); 685 else 686 sbuf_cat(&descrsb, "(unknown)"); 687 break; 688 } 689 690 /* 691 * If we can't print something more specific, at least 692 * print the domain name. 693 */ 694 if (sbuf_finish(&descrsb) != 0 || 695 sbuf_len(&descrsb) <= 0) { 696 sbuf_clear(&descrsb); 697 sbuf_cat(&descrsb, 698 head->so_proto->pr_domain->dom_name ?: 699 "unknown"); 700 sbuf_finish(&descrsb); 701 } 702 KASSERT(sbuf_len(&descrsb) > 0, 703 ("%s: sbuf creation failed", __func__)); 704 /* 705 * Preserve the historic listen queue overflow log 706 * message, that starts with "sonewconn:". It has 707 * been known to sysadmins for years and also test 708 * sys/kern/sonewconn_overflow checks for it. 709 */ 710 if (head->so_cred == 0) { 711 log(LOG_DEBUG, "sonewconn: pcb %p (%s): " 712 "Listen queue overflow: %i already in " 713 "queue awaiting acceptance (%d " 714 "occurrences)\n", head->so_pcb, 715 sbuf_data(&descrsb), 716 qlen, overcount); 717 } else { 718 log(LOG_DEBUG, "sonewconn: pcb %p (%s): " 719 "Listen queue overflow: " 720 "%i already in queue awaiting acceptance " 721 "(%d occurrences), euid %d, rgid %d, jail %s\n", 722 head->so_pcb, sbuf_data(&descrsb), qlen, 723 overcount, head->so_cred->cr_uid, 724 head->so_cred->cr_rgid, 725 head->so_cred->cr_prison ? 
726 head->so_cred->cr_prison->pr_name : 727 "not_jailed"); 728 } 729 sbuf_delete(&descrsb); 730 731 overcount = 0; 732 } 733 734 return (NULL); 735 } 736 SOLISTEN_UNLOCK(head); 737 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 738 __func__, head)); 739 so = soalloc(head->so_vnet); 740 if (so == NULL) { 741 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 742 "limit reached or out of memory\n", 743 __func__, head->so_pcb); 744 return (NULL); 745 } 746 so->so_listen = head; 747 so->so_type = head->so_type; 748 so->so_options = head->so_options & ~SO_ACCEPTCONN; 749 so->so_linger = head->so_linger; 750 so->so_state = head->so_state; 751 so->so_fibnum = head->so_fibnum; 752 so->so_proto = head->so_proto; 753 so->so_cred = crhold(head->so_cred); 754 #ifdef MAC 755 mac_socket_newconn(head, so); 756 #endif 757 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 758 so_rdknl_assert_lock); 759 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 760 so_wrknl_assert_lock); 761 VNET_SO_ASSERT(head); 762 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { 763 sodealloc(so); 764 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 765 __func__, head->so_pcb); 766 return (NULL); 767 } 768 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; 769 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 770 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 771 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 772 so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; 773 so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; 774 if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { 775 so->so_snd.sb_mtx = &so->so_snd_mtx; 776 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 777 } 778 779 return (so); 780 } 781 782 /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. */ 783 struct socket * 784 sonewconn(struct socket *head, int connstatus) 785 { 786 struct socket *so; 787 788 if ((so = solisten_clone(head)) == NULL) 789 return (NULL); 790 791 if (so->so_proto->pr_attach(so, 0, NULL) != 0) { 792 sodealloc(so); 793 log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", 794 __func__, head->so_pcb); 795 return (NULL); 796 } 797 798 solisten_enqueue(so, connstatus); 799 800 return (so); 801 } 802 803 /* 804 * Enqueue socket cloned by solisten_clone() to the listen queue of the 805 * listener it has been cloned from. 806 */ 807 void 808 solisten_enqueue(struct socket *so, int connstatus) 809 { 810 struct socket *head = so->so_listen; 811 812 MPASS(refcount_load(&so->so_count) == 0); 813 refcount_init(&so->so_count, 1); 814 815 SOLISTEN_LOCK(head); 816 if (head->sol_accept_filter != NULL) 817 connstatus = 0; 818 so->so_state |= connstatus; 819 soref(head); /* A socket on (in)complete queue refs head. */ 820 if (connstatus) { 821 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 822 so->so_qstate = SQ_COMP; 823 head->sol_qlen++; 824 solisten_wakeup(head); /* unlocks */ 825 } else { 826 /* 827 * Keep removing sockets from the head until there's room for 828 * us to insert on the tail. In pre-locking revisions, this 829 * was a simple if(), but as we could be racing with other 830 * threads and soabort() requires dropping locks, we must 831 * loop waiting for the condition to be true. 
832 */ 833 while (head->sol_incqlen > head->sol_qlimit) { 834 struct socket *sp; 835 836 sp = TAILQ_FIRST(&head->sol_incomp); 837 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 838 head->sol_incqlen--; 839 SOCK_LOCK(sp); 840 sp->so_qstate = SQ_NONE; 841 sp->so_listen = NULL; 842 SOCK_UNLOCK(sp); 843 sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ 844 soabort(sp); 845 SOLISTEN_LOCK(head); 846 } 847 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 848 so->so_qstate = SQ_INCOMP; 849 head->sol_incqlen++; 850 SOLISTEN_UNLOCK(head); 851 } 852 } 853 854 #if defined(SCTP) || defined(SCTP_SUPPORT) 855 /* 856 * Socket part of sctp_peeloff(). Detach a new socket from an 857 * association. The new socket is returned with a reference. 858 * 859 * XXXGL: reduce copy-paste with solisten_clone(). 860 */ 861 struct socket * 862 sopeeloff(struct socket *head) 863 { 864 struct socket *so; 865 866 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 867 __func__, __LINE__, head)); 868 so = soalloc(head->so_vnet); 869 if (so == NULL) { 870 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 871 "limit reached or out of memory\n", 872 __func__, head->so_pcb); 873 return (NULL); 874 } 875 so->so_type = head->so_type; 876 so->so_options = head->so_options; 877 so->so_linger = head->so_linger; 878 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 879 so->so_fibnum = head->so_fibnum; 880 so->so_proto = head->so_proto; 881 so->so_cred = crhold(head->so_cred); 882 #ifdef MAC 883 mac_socket_newconn(head, so); 884 #endif 885 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 886 so_rdknl_assert_lock); 887 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 888 so_wrknl_assert_lock); 889 VNET_SO_ASSERT(head); 890 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 891 sodealloc(so); 892 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 893 __func__, head->so_pcb); 894 return (NULL); 895 } 896 if ((*so->so_proto->pr_attach)(so, 0, NULL)) { 897 sodealloc(so); 898 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 899 __func__, head->so_pcb); 900 return (NULL); 901 } 902 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 903 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 904 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 905 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 906 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 907 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 908 909 soref(so); 910 911 return (so); 912 } 913 #endif /* SCTP */ 914 915 int 916 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 917 { 918 int error; 919 920 CURVNET_SET(so->so_vnet); 921 error = so->so_proto->pr_bind(so, nam, td); 922 CURVNET_RESTORE(); 923 return (error); 924 } 925 926 int 927 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 928 { 929 int error; 930 931 CURVNET_SET(so->so_vnet); 932 error = so->so_proto->pr_bindat(fd, so, nam, td); 933 CURVNET_RESTORE(); 934 return (error); 935 } 936 937 /* 938 * solisten() transitions a socket from a non-listening state to a listening 939 * state, but can also be used to update the listen queue depth on an 940 * existing listen socket. The protocol will call back into the sockets 941 * layer using solisten_proto_check() and solisten_proto() to check and set 942 * socket-layer listen state. Call backs are used so that the protocol can 943 * acquire both protocol and socket layer locks in whatever order is required 944 * by the protocol. 
945 * 946 * Protocol implementors are advised to hold the socket lock across the 947 * socket-layer test and set to avoid races at the socket layer. 948 */ 949 int 950 solisten(struct socket *so, int backlog, struct thread *td) 951 { 952 int error; 953 954 CURVNET_SET(so->so_vnet); 955 error = so->so_proto->pr_listen(so, backlog, td); 956 CURVNET_RESTORE(); 957 return (error); 958 } 959 960 /* 961 * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in 962 * order to interlock with socket I/O. 963 */ 964 int 965 solisten_proto_check(struct socket *so) 966 { 967 SOCK_LOCK_ASSERT(so); 968 969 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 970 SS_ISDISCONNECTING)) != 0) 971 return (EINVAL); 972 973 /* 974 * Sleeping is not permitted here, so simply fail if userspace is 975 * attempting to transmit or receive on the socket. This kind of 976 * transient failure is not ideal, but it should occur only if userspace 977 * is misusing the socket interfaces. 978 */ 979 if (!sx_try_xlock(&so->so_snd_sx)) 980 return (EAGAIN); 981 if (!sx_try_xlock(&so->so_rcv_sx)) { 982 sx_xunlock(&so->so_snd_sx); 983 return (EAGAIN); 984 } 985 mtx_lock(&so->so_snd_mtx); 986 mtx_lock(&so->so_rcv_mtx); 987 988 /* Interlock with soo_aio_queue(). */ 989 if (!SOLISTENING(so) && 990 ((so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || 991 (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0)) { 992 solisten_proto_abort(so); 993 return (EINVAL); 994 } 995 return (0); 996 } 997 998 /* 999 * Undo the setup done by solisten_proto_check(). 1000 */ 1001 void 1002 solisten_proto_abort(struct socket *so) 1003 { 1004 mtx_unlock(&so->so_snd_mtx); 1005 mtx_unlock(&so->so_rcv_mtx); 1006 sx_xunlock(&so->so_snd_sx); 1007 sx_xunlock(&so->so_rcv_sx); 1008 } 1009 1010 void 1011 solisten_proto(struct socket *so, int backlog) 1012 { 1013 int sbrcv_lowat, sbsnd_lowat; 1014 u_int sbrcv_hiwat, sbsnd_hiwat; 1015 short sbrcv_flags, sbsnd_flags; 1016 sbintime_t sbrcv_timeo, sbsnd_timeo; 1017 1018 SOCK_LOCK_ASSERT(so); 1019 KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 1020 SS_ISDISCONNECTING)) == 0, 1021 ("%s: bad socket state %p", __func__, so)); 1022 1023 if (SOLISTENING(so)) 1024 goto listening; 1025 1026 /* 1027 * Change this socket to listening state. 
1028 */ 1029 sbrcv_lowat = so->so_rcv.sb_lowat; 1030 sbsnd_lowat = so->so_snd.sb_lowat; 1031 sbrcv_hiwat = so->so_rcv.sb_hiwat; 1032 sbsnd_hiwat = so->so_snd.sb_hiwat; 1033 sbrcv_flags = so->so_rcv.sb_flags; 1034 sbsnd_flags = so->so_snd.sb_flags; 1035 sbrcv_timeo = so->so_rcv.sb_timeo; 1036 sbsnd_timeo = so->so_snd.sb_timeo; 1037 1038 sbdestroy(so, SO_SND); 1039 sbdestroy(so, SO_RCV); 1040 1041 #ifdef INVARIANTS 1042 bzero(&so->so_rcv, 1043 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 1044 #endif 1045 1046 so->sol_sbrcv_lowat = sbrcv_lowat; 1047 so->sol_sbsnd_lowat = sbsnd_lowat; 1048 so->sol_sbrcv_hiwat = sbrcv_hiwat; 1049 so->sol_sbsnd_hiwat = sbsnd_hiwat; 1050 so->sol_sbrcv_flags = sbrcv_flags; 1051 so->sol_sbsnd_flags = sbsnd_flags; 1052 so->sol_sbrcv_timeo = sbrcv_timeo; 1053 so->sol_sbsnd_timeo = sbsnd_timeo; 1054 1055 so->sol_qlen = so->sol_incqlen = 0; 1056 TAILQ_INIT(&so->sol_incomp); 1057 TAILQ_INIT(&so->sol_comp); 1058 1059 so->sol_accept_filter = NULL; 1060 so->sol_accept_filter_arg = NULL; 1061 so->sol_accept_filter_str = NULL; 1062 1063 so->sol_upcall = NULL; 1064 so->sol_upcallarg = NULL; 1065 1066 so->so_options |= SO_ACCEPTCONN; 1067 1068 listening: 1069 if (backlog < 0 || backlog > somaxconn) 1070 backlog = somaxconn; 1071 so->sol_qlimit = backlog; 1072 1073 mtx_unlock(&so->so_snd_mtx); 1074 mtx_unlock(&so->so_rcv_mtx); 1075 sx_xunlock(&so->so_snd_sx); 1076 sx_xunlock(&so->so_rcv_sx); 1077 } 1078 1079 /* 1080 * Wakeup listeners/subsystems once we have a complete connection. 1081 * Enters with lock, returns unlocked. 1082 */ 1083 void 1084 solisten_wakeup(struct socket *sol) 1085 { 1086 1087 if (sol->sol_upcall != NULL) 1088 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 1089 else { 1090 selwakeuppri(&sol->so_rdsel, PSOCK); 1091 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 1092 } 1093 SOLISTEN_UNLOCK(sol); 1094 wakeup_one(&sol->sol_comp); 1095 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) 1096 pgsigio(&sol->so_sigio, SIGIO, 0); 1097 } 1098 1099 /* 1100 * Return single connection off a listening socket queue. Main consumer of 1101 * the function is kern_accept4(). Some modules, that do their own accept 1102 * management also use the function. The socket reference held by the 1103 * listen queue is handed to the caller. 1104 * 1105 * Listening socket must be locked on entry and is returned unlocked on 1106 * return. 1107 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 
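 *
 * A minimal caller sketch (illustrative only): the listening socket is
 * locked before the call, and on success the dequeued socket carries the
 * reference previously held by the queue:
 *
 *	SOLISTEN_LOCK(head);
 *	error = solisten_dequeue(head, &so, 0);
 *	if (error != 0)
 *		return (error);
 *	('head' is unlocked here; release 'so' with soclose() when done)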
1108 */ 1109 int 1110 solisten_dequeue(struct socket *head, struct socket **ret, int flags) 1111 { 1112 struct socket *so; 1113 int error; 1114 1115 SOLISTEN_LOCK_ASSERT(head); 1116 1117 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && 1118 head->so_error == 0) { 1119 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, 1120 "accept", 0); 1121 if (error != 0) { 1122 SOLISTEN_UNLOCK(head); 1123 return (error); 1124 } 1125 } 1126 if (head->so_error) { 1127 error = head->so_error; 1128 head->so_error = 0; 1129 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) 1130 error = EWOULDBLOCK; 1131 else 1132 error = 0; 1133 if (error) { 1134 SOLISTEN_UNLOCK(head); 1135 return (error); 1136 } 1137 so = TAILQ_FIRST(&head->sol_comp); 1138 SOCK_LOCK(so); 1139 KASSERT(so->so_qstate == SQ_COMP, 1140 ("%s: so %p not SQ_COMP", __func__, so)); 1141 head->sol_qlen--; 1142 so->so_qstate = SQ_NONE; 1143 so->so_listen = NULL; 1144 TAILQ_REMOVE(&head->sol_comp, so, so_list); 1145 if (flags & ACCEPT4_INHERIT) 1146 so->so_state |= (head->so_state & SS_NBIO); 1147 else 1148 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 1149 SOCK_UNLOCK(so); 1150 sorele_locked(head); 1151 1152 *ret = so; 1153 return (0); 1154 } 1155 1156 /* 1157 * Free socket upon release of the very last reference. 1158 */ 1159 static void 1160 sofree(struct socket *so) 1161 { 1162 struct protosw *pr = so->so_proto; 1163 1164 SOCK_LOCK_ASSERT(so); 1165 KASSERT(refcount_load(&so->so_count) == 0, 1166 ("%s: so %p has references", __func__, so)); 1167 KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, 1168 ("%s: so %p is on listen queue", __func__, so)); 1169 1170 SOCK_UNLOCK(so); 1171 1172 if (so->so_dtor != NULL) 1173 so->so_dtor(so); 1174 1175 VNET_SO_ASSERT(so); 1176 if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) { 1177 MPASS(pr->pr_domain->dom_dispose != NULL); 1178 (*pr->pr_domain->dom_dispose)(so); 1179 } 1180 if (pr->pr_detach != NULL) 1181 pr->pr_detach(so); 1182 1183 /* 1184 * From this point on, we assume that no other references to this 1185 * socket exist anywhere else in the stack. Therefore, no locks need 1186 * to be acquired or held. 1187 */ 1188 if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { 1189 sbdestroy(so, SO_SND); 1190 sbdestroy(so, SO_RCV); 1191 } 1192 seldrain(&so->so_rdsel); 1193 seldrain(&so->so_wrsel); 1194 knlist_destroy(&so->so_rdsel.si_note); 1195 knlist_destroy(&so->so_wrsel.si_note); 1196 sodealloc(so); 1197 } 1198 1199 /* 1200 * Release a reference on a socket while holding the socket lock. 1201 * Unlocks the socket lock before returning. 1202 */ 1203 void 1204 sorele_locked(struct socket *so) 1205 { 1206 SOCK_LOCK_ASSERT(so); 1207 if (refcount_release(&so->so_count)) 1208 sofree(so); 1209 else 1210 SOCK_UNLOCK(so); 1211 } 1212 1213 /* 1214 * Close a socket on last file table reference removal. Initiate disconnect 1215 * if connected. Free socket when disconnect complete. 1216 * 1217 * This function will sorele() the socket. Note that soclose() may be called 1218 * prior to the ref count reaching zero. The actual socket structure will 1219 * not be freed until the ref count reaches zero. 
1220 */ 1221 int 1222 soclose(struct socket *so) 1223 { 1224 struct accept_queue lqueue; 1225 int error = 0; 1226 bool listening, last __diagused; 1227 1228 CURVNET_SET(so->so_vnet); 1229 funsetown(&so->so_sigio); 1230 if (so->so_state & SS_ISCONNECTED) { 1231 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1232 error = sodisconnect(so); 1233 if (error) { 1234 if (error == ENOTCONN) 1235 error = 0; 1236 goto drop; 1237 } 1238 } 1239 1240 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { 1241 if ((so->so_state & SS_ISDISCONNECTING) && 1242 (so->so_state & SS_NBIO)) 1243 goto drop; 1244 while (so->so_state & SS_ISCONNECTED) { 1245 error = tsleep(&so->so_timeo, 1246 PSOCK | PCATCH, "soclos", 1247 so->so_linger * hz); 1248 if (error) 1249 break; 1250 } 1251 } 1252 } 1253 1254 drop: 1255 if (so->so_proto->pr_close != NULL) 1256 so->so_proto->pr_close(so); 1257 1258 SOCK_LOCK(so); 1259 if ((listening = SOLISTENING(so))) { 1260 struct socket *sp; 1261 1262 TAILQ_INIT(&lqueue); 1263 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 1264 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 1265 1266 so->sol_qlen = so->sol_incqlen = 0; 1267 1268 TAILQ_FOREACH(sp, &lqueue, so_list) { 1269 SOCK_LOCK(sp); 1270 sp->so_qstate = SQ_NONE; 1271 sp->so_listen = NULL; 1272 SOCK_UNLOCK(sp); 1273 last = refcount_release(&so->so_count); 1274 KASSERT(!last, ("%s: released last reference for %p", 1275 __func__, so)); 1276 } 1277 } 1278 sorele_locked(so); 1279 if (listening) { 1280 struct socket *sp, *tsp; 1281 1282 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) 1283 soabort(sp); 1284 } 1285 CURVNET_RESTORE(); 1286 return (error); 1287 } 1288 1289 /* 1290 * soabort() is used to abruptly tear down a connection, such as when a 1291 * resource limit is reached (listen queue depth exceeded), or if a listen 1292 * socket is closed while there are sockets waiting to be accepted. 1293 * 1294 * This interface is tricky, because it is called on an unreferenced socket, 1295 * and must be called only by a thread that has actually removed the socket 1296 * from the listen queue it was on. Likely this thread holds the last 1297 * reference on the socket and soabort() will proceed with sofree(). But 1298 * it might be not the last, as the sockets on the listen queues are seen 1299 * from the protocol side. 1300 * 1301 * This interface will call into the protocol code, so must not be called 1302 * with any socket locks held. Protocols do call it while holding their own 1303 * recursible protocol mutexes, but this is something that should be subject 1304 * to review in the future. 1305 * 1306 * Usually socket should have a single reference left, but this is not a 1307 * requirement. In the past, when we have had named references for file 1308 * descriptor and protocol, we asserted that none of them are being held. 
1309 */ 1310 void 1311 soabort(struct socket *so) 1312 { 1313 1314 VNET_SO_ASSERT(so); 1315 1316 if (so->so_proto->pr_abort != NULL) 1317 so->so_proto->pr_abort(so); 1318 SOCK_LOCK(so); 1319 sorele_locked(so); 1320 } 1321 1322 int 1323 soaccept(struct socket *so, struct sockaddr **nam) 1324 { 1325 int error; 1326 1327 CURVNET_SET(so->so_vnet); 1328 error = so->so_proto->pr_accept(so, nam); 1329 CURVNET_RESTORE(); 1330 return (error); 1331 } 1332 1333 int 1334 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 1335 { 1336 1337 return (soconnectat(AT_FDCWD, so, nam, td)); 1338 } 1339 1340 int 1341 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 1342 { 1343 int error; 1344 1345 CURVNET_SET(so->so_vnet); 1346 /* 1347 * If protocol is connection-based, can only connect once. 1348 * Otherwise, if connected, try to disconnect first. This allows 1349 * user to disconnect by connecting to, e.g., a null address. 1350 */ 1351 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 1352 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 1353 (error = sodisconnect(so)))) { 1354 error = EISCONN; 1355 } else { 1356 /* 1357 * Prevent accumulated error from previous connection from 1358 * biting us. 1359 */ 1360 so->so_error = 0; 1361 if (fd == AT_FDCWD) { 1362 error = so->so_proto->pr_connect(so, nam, td); 1363 } else { 1364 error = so->so_proto->pr_connectat(fd, so, nam, td); 1365 } 1366 } 1367 CURVNET_RESTORE(); 1368 1369 return (error); 1370 } 1371 1372 int 1373 soconnect2(struct socket *so1, struct socket *so2) 1374 { 1375 int error; 1376 1377 CURVNET_SET(so1->so_vnet); 1378 error = so1->so_proto->pr_connect2(so1, so2); 1379 CURVNET_RESTORE(); 1380 return (error); 1381 } 1382 1383 int 1384 sodisconnect(struct socket *so) 1385 { 1386 int error; 1387 1388 if ((so->so_state & SS_ISCONNECTED) == 0) 1389 return (ENOTCONN); 1390 if (so->so_state & SS_ISDISCONNECTING) 1391 return (EALREADY); 1392 VNET_SO_ASSERT(so); 1393 error = so->so_proto->pr_disconnect(so); 1394 return (error); 1395 } 1396 1397 int 1398 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 1399 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1400 { 1401 long space; 1402 ssize_t resid; 1403 int clen = 0, error, dontroute; 1404 1405 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 1406 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 1407 ("sosend_dgram: !PR_ATOMIC")); 1408 1409 if (uio != NULL) 1410 resid = uio->uio_resid; 1411 else 1412 resid = top->m_pkthdr.len; 1413 /* 1414 * In theory resid should be unsigned. However, space must be 1415 * signed, as it might be less than 0 if we over-committed, and we 1416 * must use a signed comparison of space and resid. On the other 1417 * hand, a negative resid causes us to loop sending 0-length 1418 * segments to the protocol. 
1419 */ 1420 if (resid < 0) { 1421 error = EINVAL; 1422 goto out; 1423 } 1424 1425 dontroute = 1426 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 1427 if (td != NULL) 1428 td->td_ru.ru_msgsnd++; 1429 if (control != NULL) 1430 clen = control->m_len; 1431 1432 SOCKBUF_LOCK(&so->so_snd); 1433 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1434 SOCKBUF_UNLOCK(&so->so_snd); 1435 error = EPIPE; 1436 goto out; 1437 } 1438 if (so->so_error) { 1439 error = so->so_error; 1440 so->so_error = 0; 1441 SOCKBUF_UNLOCK(&so->so_snd); 1442 goto out; 1443 } 1444 if ((so->so_state & SS_ISCONNECTED) == 0) { 1445 /* 1446 * `sendto' and `sendmsg' is allowed on a connection-based 1447 * socket if it supports implied connect. Return ENOTCONN if 1448 * not connected and no address is supplied. 1449 */ 1450 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1451 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1452 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1453 !(resid == 0 && clen != 0)) { 1454 SOCKBUF_UNLOCK(&so->so_snd); 1455 error = ENOTCONN; 1456 goto out; 1457 } 1458 } else if (addr == NULL) { 1459 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1460 error = ENOTCONN; 1461 else 1462 error = EDESTADDRREQ; 1463 SOCKBUF_UNLOCK(&so->so_snd); 1464 goto out; 1465 } 1466 } 1467 1468 /* 1469 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1470 * problem and need fixing. 1471 */ 1472 space = sbspace(&so->so_snd); 1473 if (flags & MSG_OOB) 1474 space += 1024; 1475 space -= clen; 1476 SOCKBUF_UNLOCK(&so->so_snd); 1477 if (resid > space) { 1478 error = EMSGSIZE; 1479 goto out; 1480 } 1481 if (uio == NULL) { 1482 resid = 0; 1483 if (flags & MSG_EOR) 1484 top->m_flags |= M_EOR; 1485 } else { 1486 /* 1487 * Copy the data from userland into a mbuf chain. 1488 * If no data is to be copied in, a single empty mbuf 1489 * is returned. 1490 */ 1491 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 1492 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); 1493 if (top == NULL) { 1494 error = EFAULT; /* only possible error */ 1495 goto out; 1496 } 1497 space -= resid - uio->uio_resid; 1498 resid = uio->uio_resid; 1499 } 1500 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1501 /* 1502 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1503 * than with. 1504 */ 1505 if (dontroute) { 1506 SOCK_LOCK(so); 1507 so->so_options |= SO_DONTROUTE; 1508 SOCK_UNLOCK(so); 1509 } 1510 /* 1511 * XXX all the SBS_CANTSENDMORE checks previously done could be out 1512 * of date. We could have received a reset packet in an interrupt or 1513 * maybe we slept while doing page faults in uiomove() etc. We could 1514 * probably recheck again inside the locking protection here, but 1515 * there are probably other places that this also happens. We must 1516 * rethink this. 1517 */ 1518 VNET_SO_ASSERT(so); 1519 error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : 1520 /* 1521 * If the user set MSG_EOF, the protocol understands this flag and 1522 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 1523 */ 1524 ((flags & MSG_EOF) && 1525 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1526 (resid <= 0)) ? 1527 PRUS_EOF : 1528 /* If there is more to send set PRUS_MORETOCOME */ 1529 (flags & MSG_MORETOCOME) || 1530 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 1531 top, addr, control, td); 1532 if (dontroute) { 1533 SOCK_LOCK(so); 1534 so->so_options &= ~SO_DONTROUTE; 1535 SOCK_UNLOCK(so); 1536 } 1537 clen = 0; 1538 control = NULL; 1539 top = NULL; 1540 out: 1541 if (top != NULL) 1542 m_freem(top); 1543 if (control != NULL) 1544 m_freem(control); 1545 return (error); 1546 } 1547 1548 /* 1549 * Send on a socket. If send must go all at once and message is larger than 1550 * send buffering, then hard error. Lock against other senders. If must go 1551 * all at once and not enough room now, then inform user that this would 1552 * block and do nothing. Otherwise, if nonblocking, send as much as 1553 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1554 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1555 * in mbuf chain must be small enough to send all at once. 1556 * 1557 * Returns nonzero on error, timeout or signal; callers must check for short 1558 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1559 * on return. 1560 */ 1561 int 1562 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 1563 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1564 { 1565 long space; 1566 ssize_t resid; 1567 int clen = 0, error, dontroute; 1568 int atomic = sosendallatonce(so) || top; 1569 int pr_send_flag; 1570 #ifdef KERN_TLS 1571 struct ktls_session *tls; 1572 int tls_enq_cnt, tls_send_flag; 1573 uint8_t tls_rtype; 1574 1575 tls = NULL; 1576 tls_rtype = TLS_RLTYPE_APP; 1577 #endif 1578 if (uio != NULL) 1579 resid = uio->uio_resid; 1580 else if ((top->m_flags & M_PKTHDR) != 0) 1581 resid = top->m_pkthdr.len; 1582 else 1583 resid = m_length(top, NULL); 1584 /* 1585 * In theory resid should be unsigned. However, space must be 1586 * signed, as it might be less than 0 if we over-committed, and we 1587 * must use a signed comparison of space and resid. On the other 1588 * hand, a negative resid causes us to loop sending 0-length 1589 * segments to the protocol. 1590 * 1591 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1592 * type sockets since that's an error. 
1593 */ 1594 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1595 error = EINVAL; 1596 goto out; 1597 } 1598 1599 dontroute = 1600 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 1601 (so->so_proto->pr_flags & PR_ATOMIC); 1602 if (td != NULL) 1603 td->td_ru.ru_msgsnd++; 1604 if (control != NULL) 1605 clen = control->m_len; 1606 1607 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 1608 if (error) 1609 goto out; 1610 1611 #ifdef KERN_TLS 1612 tls_send_flag = 0; 1613 tls = ktls_hold(so->so_snd.sb_tls_info); 1614 if (tls != NULL) { 1615 if (tls->mode == TCP_TLS_MODE_SW) 1616 tls_send_flag = PRUS_NOTREADY; 1617 1618 if (control != NULL) { 1619 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1620 1621 if (clen >= sizeof(*cm) && 1622 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 1623 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 1624 clen = 0; 1625 m_freem(control); 1626 control = NULL; 1627 atomic = 1; 1628 } 1629 } 1630 1631 if (resid == 0 && !ktls_permit_empty_frames(tls)) { 1632 error = EINVAL; 1633 goto release; 1634 } 1635 } 1636 #endif 1637 1638 restart: 1639 do { 1640 SOCKBUF_LOCK(&so->so_snd); 1641 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1642 SOCKBUF_UNLOCK(&so->so_snd); 1643 error = EPIPE; 1644 goto release; 1645 } 1646 if (so->so_error) { 1647 error = so->so_error; 1648 so->so_error = 0; 1649 SOCKBUF_UNLOCK(&so->so_snd); 1650 goto release; 1651 } 1652 if ((so->so_state & SS_ISCONNECTED) == 0) { 1653 /* 1654 * `sendto' and `sendmsg' is allowed on a connection- 1655 * based socket if it supports implied connect. 1656 * Return ENOTCONN if not connected and no address is 1657 * supplied. 1658 */ 1659 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1660 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1661 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1662 !(resid == 0 && clen != 0)) { 1663 SOCKBUF_UNLOCK(&so->so_snd); 1664 error = ENOTCONN; 1665 goto release; 1666 } 1667 } else if (addr == NULL) { 1668 SOCKBUF_UNLOCK(&so->so_snd); 1669 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1670 error = ENOTCONN; 1671 else 1672 error = EDESTADDRREQ; 1673 goto release; 1674 } 1675 } 1676 space = sbspace(&so->so_snd); 1677 if (flags & MSG_OOB) 1678 space += 1024; 1679 if ((atomic && resid > so->so_snd.sb_hiwat) || 1680 clen > so->so_snd.sb_hiwat) { 1681 SOCKBUF_UNLOCK(&so->so_snd); 1682 error = EMSGSIZE; 1683 goto release; 1684 } 1685 if (space < resid + clen && 1686 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 1687 if ((so->so_state & SS_NBIO) || 1688 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 1689 SOCKBUF_UNLOCK(&so->so_snd); 1690 error = EWOULDBLOCK; 1691 goto release; 1692 } 1693 error = sbwait(so, SO_SND); 1694 SOCKBUF_UNLOCK(&so->so_snd); 1695 if (error) 1696 goto release; 1697 goto restart; 1698 } 1699 SOCKBUF_UNLOCK(&so->so_snd); 1700 space -= clen; 1701 do { 1702 if (uio == NULL) { 1703 resid = 0; 1704 if (flags & MSG_EOR) 1705 top->m_flags |= M_EOR; 1706 #ifdef KERN_TLS 1707 if (tls != NULL) { 1708 ktls_frame(top, tls, &tls_enq_cnt, 1709 tls_rtype); 1710 tls_rtype = TLS_RLTYPE_APP; 1711 } 1712 #endif 1713 } else { 1714 /* 1715 * Copy the data from userland into a mbuf 1716 * chain. If resid is 0, which can happen 1717 * only if we have control to send, then 1718 * a single empty mbuf is returned. This 1719 * is a workaround to prevent protocol send 1720 * methods to panic. 
1721 */ 1722 #ifdef KERN_TLS 1723 if (tls != NULL) { 1724 top = m_uiotombuf(uio, M_WAITOK, space, 1725 tls->params.max_frame_len, 1726 M_EXTPG | 1727 ((flags & MSG_EOR) ? M_EOR : 0)); 1728 if (top != NULL) { 1729 ktls_frame(top, tls, 1730 &tls_enq_cnt, tls_rtype); 1731 } 1732 tls_rtype = TLS_RLTYPE_APP; 1733 } else 1734 #endif 1735 top = m_uiotombuf(uio, M_WAITOK, space, 1736 (atomic ? max_hdr : 0), 1737 (atomic ? M_PKTHDR : 0) | 1738 ((flags & MSG_EOR) ? M_EOR : 0)); 1739 if (top == NULL) { 1740 error = EFAULT; /* only possible error */ 1741 goto release; 1742 } 1743 space -= resid - uio->uio_resid; 1744 resid = uio->uio_resid; 1745 } 1746 if (dontroute) { 1747 SOCK_LOCK(so); 1748 so->so_options |= SO_DONTROUTE; 1749 SOCK_UNLOCK(so); 1750 } 1751 /* 1752 * XXX all the SBS_CANTSENDMORE checks previously 1753 * done could be out of date. We could have received 1754 * a reset packet in an interrupt or maybe we slept 1755 * while doing page faults in uiomove() etc. We 1756 * could probably recheck again inside the locking 1757 * protection here, but there are probably other 1758 * places that this also happens. We must rethink 1759 * this. 1760 */ 1761 VNET_SO_ASSERT(so); 1762 1763 pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : 1764 /* 1765 * If the user set MSG_EOF, the protocol understands 1766 * this flag and nothing left to send then use 1767 * PRU_SEND_EOF instead of PRU_SEND. 1768 */ 1769 ((flags & MSG_EOF) && 1770 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1771 (resid <= 0)) ? 1772 PRUS_EOF : 1773 /* If there is more to send set PRUS_MORETOCOME. */ 1774 (flags & MSG_MORETOCOME) || 1775 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; 1776 1777 #ifdef KERN_TLS 1778 pr_send_flag |= tls_send_flag; 1779 #endif 1780 1781 error = so->so_proto->pr_send(so, pr_send_flag, top, 1782 addr, control, td); 1783 1784 if (dontroute) { 1785 SOCK_LOCK(so); 1786 so->so_options &= ~SO_DONTROUTE; 1787 SOCK_UNLOCK(so); 1788 } 1789 1790 #ifdef KERN_TLS 1791 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 1792 if (error != 0) { 1793 m_freem(top); 1794 top = NULL; 1795 } else { 1796 soref(so); 1797 ktls_enqueue(top, so, tls_enq_cnt); 1798 } 1799 } 1800 #endif 1801 clen = 0; 1802 control = NULL; 1803 top = NULL; 1804 if (error) 1805 goto release; 1806 } while (resid && space > 0); 1807 } while (resid); 1808 1809 release: 1810 SOCK_IO_SEND_UNLOCK(so); 1811 out: 1812 #ifdef KERN_TLS 1813 if (tls != NULL) 1814 ktls_free(tls); 1815 #endif 1816 if (top != NULL) 1817 m_freem(top); 1818 if (control != NULL) 1819 m_freem(control); 1820 return (error); 1821 } 1822 1823 int 1824 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1825 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1826 { 1827 int error; 1828 1829 CURVNET_SET(so->so_vnet); 1830 error = so->so_proto->pr_sosend(so, addr, uio, 1831 top, control, flags, td); 1832 CURVNET_RESTORE(); 1833 return (error); 1834 } 1835 1836 /* 1837 * The part of soreceive() that implements reading non-inline out-of-band 1838 * data from a socket. For more complete comments, see soreceive(), from 1839 * which this code originated. 1840 * 1841 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1842 * unable to return an mbuf chain to the caller. 
1843 */ 1844 static int 1845 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1846 { 1847 struct protosw *pr = so->so_proto; 1848 struct mbuf *m; 1849 int error; 1850 1851 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1852 VNET_SO_ASSERT(so); 1853 1854 m = m_get(M_WAITOK, MT_DATA); 1855 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 1856 if (error) 1857 goto bad; 1858 do { 1859 error = uiomove(mtod(m, void *), 1860 (int) min(uio->uio_resid, m->m_len), uio); 1861 m = m_free(m); 1862 } while (uio->uio_resid && error == 0 && m); 1863 bad: 1864 if (m != NULL) 1865 m_freem(m); 1866 return (error); 1867 } 1868 1869 /* 1870 * Following replacement or removal of the first mbuf on the first mbuf chain 1871 * of a socket buffer, push necessary state changes back into the socket 1872 * buffer so that other consumers see the values consistently. 'nextrecord' 1873 * is the callers locally stored value of the original value of 1874 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1875 * NOTE: 'nextrecord' may be NULL. 1876 */ 1877 static __inline void 1878 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 1879 { 1880 1881 SOCKBUF_LOCK_ASSERT(sb); 1882 /* 1883 * First, update for the new value of nextrecord. If necessary, make 1884 * it the first record. 1885 */ 1886 if (sb->sb_mb != NULL) 1887 sb->sb_mb->m_nextpkt = nextrecord; 1888 else 1889 sb->sb_mb = nextrecord; 1890 1891 /* 1892 * Now update any dependent socket buffer fields to reflect the new 1893 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 1894 * addition of a second clause that takes care of the case where 1895 * sb_mb has been updated, but remains the last record. 1896 */ 1897 if (sb->sb_mb == NULL) { 1898 sb->sb_mbtail = NULL; 1899 sb->sb_lastrecord = NULL; 1900 } else if (sb->sb_mb->m_nextpkt == NULL) 1901 sb->sb_lastrecord = sb->sb_mb; 1902 } 1903 1904 /* 1905 * Implement receive operations on a socket. We depend on the way that 1906 * records are added to the sockbuf by sbappend. In particular, each record 1907 * (mbufs linked through m_next) must begin with an address if the protocol 1908 * so specifies, followed by an optional mbuf or mbufs containing ancillary 1909 * data, and then zero or more mbufs of data. In order to allow parallelism 1910 * between network receive and copying to user space, as well as avoid 1911 * sleeping with a mutex held, we release the socket buffer mutex during the 1912 * user space copy. Although the sockbuf is locked, new data may still be 1913 * appended, and thus we must maintain consistency of the sockbuf during that 1914 * time. 1915 * 1916 * The caller may receive the data as a single mbuf chain by supplying an 1917 * mbuf **mp0 for use in returning the chain. The uio is then used only for 1918 * the count in uio_resid. 
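 *
 * A hedged sketch of the mbuf-chain (mp0) mode (illustrative only; 'nbytes'
 * is a hypothetical local, and in-kernel callers normally go through
 * soreceive(), which dispatches to this routine for most protocols):
 *
 *	struct uio auio;
 *	struct mbuf *m = NULL;
 *	int flags = 0, error;
 *
 *	bzero(&auio, sizeof(auio));
 *	auio.uio_resid = nbytes;	(only the residual count is consulted)
 *	error = soreceive(so, NULL, &auio, &m, NULL, &flags);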
1919 */ 1920 int 1921 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 1922 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1923 { 1924 struct mbuf *m, **mp; 1925 int flags, error, offset; 1926 ssize_t len; 1927 struct protosw *pr = so->so_proto; 1928 struct mbuf *nextrecord; 1929 int moff, type = 0; 1930 ssize_t orig_resid = uio->uio_resid; 1931 bool report_real_len = false; 1932 1933 mp = mp0; 1934 if (psa != NULL) 1935 *psa = NULL; 1936 if (controlp != NULL) 1937 *controlp = NULL; 1938 if (flagsp != NULL) { 1939 report_real_len = *flagsp & MSG_TRUNC; 1940 *flagsp &= ~MSG_TRUNC; 1941 flags = *flagsp &~ MSG_EOR; 1942 } else 1943 flags = 0; 1944 if (flags & MSG_OOB) 1945 return (soreceive_rcvoob(so, uio, flags)); 1946 if (mp != NULL) 1947 *mp = NULL; 1948 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1949 && uio->uio_resid) { 1950 VNET_SO_ASSERT(so); 1951 pr->pr_rcvd(so, 0); 1952 } 1953 1954 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 1955 if (error) 1956 return (error); 1957 1958 restart: 1959 SOCKBUF_LOCK(&so->so_rcv); 1960 m = so->so_rcv.sb_mb; 1961 /* 1962 * If we have less data than requested, block awaiting more (subject 1963 * to any timeout) if: 1964 * 1. the current count is less than the low water mark, or 1965 * 2. MSG_DONTWAIT is not set 1966 */ 1967 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1968 sbavail(&so->so_rcv) < uio->uio_resid) && 1969 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 1970 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1971 KASSERT(m != NULL || !sbavail(&so->so_rcv), 1972 ("receive: m == %p sbavail == %u", 1973 m, sbavail(&so->so_rcv))); 1974 if (so->so_error || so->so_rerror) { 1975 if (m != NULL) 1976 goto dontblock; 1977 if (so->so_error) 1978 error = so->so_error; 1979 else 1980 error = so->so_rerror; 1981 if ((flags & MSG_PEEK) == 0) { 1982 if (so->so_error) 1983 so->so_error = 0; 1984 else 1985 so->so_rerror = 0; 1986 } 1987 SOCKBUF_UNLOCK(&so->so_rcv); 1988 goto release; 1989 } 1990 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1991 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1992 if (m != NULL) 1993 goto dontblock; 1994 #ifdef KERN_TLS 1995 else if (so->so_rcv.sb_tlsdcc == 0 && 1996 so->so_rcv.sb_tlscc == 0) { 1997 #else 1998 else { 1999 #endif 2000 SOCKBUF_UNLOCK(&so->so_rcv); 2001 goto release; 2002 } 2003 } 2004 for (; m != NULL; m = m->m_next) 2005 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2006 m = so->so_rcv.sb_mb; 2007 goto dontblock; 2008 } 2009 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2010 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2011 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2012 SOCKBUF_UNLOCK(&so->so_rcv); 2013 error = ENOTCONN; 2014 goto release; 2015 } 2016 if (uio->uio_resid == 0 && !report_real_len) { 2017 SOCKBUF_UNLOCK(&so->so_rcv); 2018 goto release; 2019 } 2020 if ((so->so_state & SS_NBIO) || 2021 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2022 SOCKBUF_UNLOCK(&so->so_rcv); 2023 error = EWOULDBLOCK; 2024 goto release; 2025 } 2026 SBLASTRECORDCHK(&so->so_rcv); 2027 SBLASTMBUFCHK(&so->so_rcv); 2028 error = sbwait(so, SO_RCV); 2029 SOCKBUF_UNLOCK(&so->so_rcv); 2030 if (error) 2031 goto release; 2032 goto restart; 2033 } 2034 dontblock: 2035 /* 2036 * From this point onward, we maintain 'nextrecord' as a cache of the 2037 * pointer to the next record in the socket buffer. 
We must keep the 2038 * various socket buffer pointers and local stack versions of the 2039 * pointers in sync, pushing out modifications before dropping the 2040 * socket buffer mutex, and re-reading them when picking it up. 2041 * 2042 * Otherwise, we will race with the network stack appending new data 2043 * or records onto the socket buffer by using inconsistent/stale 2044 * versions of the field, possibly resulting in socket buffer 2045 * corruption. 2046 * 2047 * By holding the high-level sblock(), we prevent simultaneous 2048 * readers from pulling off the front of the socket buffer. 2049 */ 2050 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2051 if (uio->uio_td) 2052 uio->uio_td->td_ru.ru_msgrcv++; 2053 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2054 SBLASTRECORDCHK(&so->so_rcv); 2055 SBLASTMBUFCHK(&so->so_rcv); 2056 nextrecord = m->m_nextpkt; 2057 if (pr->pr_flags & PR_ADDR) { 2058 KASSERT(m->m_type == MT_SONAME, 2059 ("m->m_type == %d", m->m_type)); 2060 orig_resid = 0; 2061 if (psa != NULL) 2062 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2063 M_NOWAIT); 2064 if (flags & MSG_PEEK) { 2065 m = m->m_next; 2066 } else { 2067 sbfree(&so->so_rcv, m); 2068 so->so_rcv.sb_mb = m_free(m); 2069 m = so->so_rcv.sb_mb; 2070 sockbuf_pushsync(&so->so_rcv, nextrecord); 2071 } 2072 } 2073 2074 /* 2075 * Process one or more MT_CONTROL mbufs present before any data mbufs 2076 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 2077 * just copy the data; if !MSG_PEEK, we call into the protocol to 2078 * perform externalization (or freeing if controlp == NULL). 2079 */ 2080 if (m != NULL && m->m_type == MT_CONTROL) { 2081 struct mbuf *cm = NULL, *cmn; 2082 struct mbuf **cme = &cm; 2083 #ifdef KERN_TLS 2084 struct cmsghdr *cmsg; 2085 struct tls_get_record tgr; 2086 2087 /* 2088 * For MSG_TLSAPPDATA, check for an alert record. 2089 * If found, return ENXIO without removing 2090 * it from the receive queue. This allows a subsequent 2091 * call without MSG_TLSAPPDATA to receive it. 2092 * Note that, for TLS, there should only be a single 2093 * control mbuf with the TLS_GET_RECORD message in it. 
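 *
 * Sketched, that control mbuf holds a cmsghdr whose payload is a
 * struct tls_get_record, so the record type is recovered much as the
 * check below does:
 *
 *	cmsg = mtod(m, struct cmsghdr *);
 *	memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
 *	... examine tgr.tls_type ...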
2094 */ 2095 if (flags & MSG_TLSAPPDATA) { 2096 cmsg = mtod(m, struct cmsghdr *); 2097 if (cmsg->cmsg_type == TLS_GET_RECORD && 2098 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2099 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2100 if (__predict_false(tgr.tls_type == 2101 TLS_RLTYPE_ALERT)) { 2102 SOCKBUF_UNLOCK(&so->so_rcv); 2103 error = ENXIO; 2104 goto release; 2105 } 2106 } 2107 } 2108 #endif 2109 2110 do { 2111 if (flags & MSG_PEEK) { 2112 if (controlp != NULL) { 2113 *controlp = m_copym(m, 0, m->m_len, 2114 M_NOWAIT); 2115 controlp = &(*controlp)->m_next; 2116 } 2117 m = m->m_next; 2118 } else { 2119 sbfree(&so->so_rcv, m); 2120 so->so_rcv.sb_mb = m->m_next; 2121 m->m_next = NULL; 2122 *cme = m; 2123 cme = &(*cme)->m_next; 2124 m = so->so_rcv.sb_mb; 2125 } 2126 } while (m != NULL && m->m_type == MT_CONTROL); 2127 if ((flags & MSG_PEEK) == 0) 2128 sockbuf_pushsync(&so->so_rcv, nextrecord); 2129 while (cm != NULL) { 2130 cmn = cm->m_next; 2131 cm->m_next = NULL; 2132 if (pr->pr_domain->dom_externalize != NULL) { 2133 SOCKBUF_UNLOCK(&so->so_rcv); 2134 VNET_SO_ASSERT(so); 2135 error = (*pr->pr_domain->dom_externalize) 2136 (cm, controlp, flags); 2137 SOCKBUF_LOCK(&so->so_rcv); 2138 } else if (controlp != NULL) 2139 *controlp = cm; 2140 else 2141 m_freem(cm); 2142 if (controlp != NULL) { 2143 while (*controlp != NULL) 2144 controlp = &(*controlp)->m_next; 2145 } 2146 cm = cmn; 2147 } 2148 if (m != NULL) 2149 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 2150 else 2151 nextrecord = so->so_rcv.sb_mb; 2152 orig_resid = 0; 2153 } 2154 if (m != NULL) { 2155 if ((flags & MSG_PEEK) == 0) { 2156 KASSERT(m->m_nextpkt == nextrecord, 2157 ("soreceive: post-control, nextrecord !sync")); 2158 if (nextrecord == NULL) { 2159 KASSERT(so->so_rcv.sb_mb == m, 2160 ("soreceive: post-control, sb_mb!=m")); 2161 KASSERT(so->so_rcv.sb_lastrecord == m, 2162 ("soreceive: post-control, lastrecord!=m")); 2163 } 2164 } 2165 type = m->m_type; 2166 if (type == MT_OOBDATA) 2167 flags |= MSG_OOB; 2168 } else { 2169 if ((flags & MSG_PEEK) == 0) { 2170 KASSERT(so->so_rcv.sb_mb == nextrecord, 2171 ("soreceive: sb_mb != nextrecord")); 2172 if (so->so_rcv.sb_mb == NULL) { 2173 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2174 ("soreceive: sb_lastercord != NULL")); 2175 } 2176 } 2177 } 2178 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2179 SBLASTRECORDCHK(&so->so_rcv); 2180 SBLASTMBUFCHK(&so->so_rcv); 2181 2182 /* 2183 * Now continue to read any data mbufs off of the head of the socket 2184 * buffer until the read request is satisfied. Note that 'type' is 2185 * used to store the type of any mbuf reads that have happened so far 2186 * such that soreceive() can stop reading if the type changes, which 2187 * causes soreceive() to return only one of regular data and inline 2188 * out-of-band data in a single socket receive operation. 2189 */ 2190 moff = 0; 2191 offset = 0; 2192 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 2193 && error == 0) { 2194 /* 2195 * If the type of mbuf has changed since the last mbuf 2196 * examined ('type'), end the receive operation. 
2197 */ 2198 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2199 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 2200 if (type != m->m_type) 2201 break; 2202 } else if (type == MT_OOBDATA) 2203 break; 2204 else 2205 KASSERT(m->m_type == MT_DATA, 2206 ("m->m_type == %d", m->m_type)); 2207 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 2208 len = uio->uio_resid; 2209 if (so->so_oobmark && len > so->so_oobmark - offset) 2210 len = so->so_oobmark - offset; 2211 if (len > m->m_len - moff) 2212 len = m->m_len - moff; 2213 /* 2214 * If mp is set, just pass back the mbufs. Otherwise copy 2215 * them out via the uio, then free. Sockbuf must be 2216 * consistent here (points to current mbuf, it points to next 2217 * record) when we drop priority; we must note any additions 2218 * to the sockbuf when we block interrupts again. 2219 */ 2220 if (mp == NULL) { 2221 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2222 SBLASTRECORDCHK(&so->so_rcv); 2223 SBLASTMBUFCHK(&so->so_rcv); 2224 SOCKBUF_UNLOCK(&so->so_rcv); 2225 if ((m->m_flags & M_EXTPG) != 0) 2226 error = m_unmapped_uiomove(m, moff, uio, 2227 (int)len); 2228 else 2229 error = uiomove(mtod(m, char *) + moff, 2230 (int)len, uio); 2231 SOCKBUF_LOCK(&so->so_rcv); 2232 if (error) { 2233 /* 2234 * The MT_SONAME mbuf has already been removed 2235 * from the record, so it is necessary to 2236 * remove the data mbufs, if any, to preserve 2237 * the invariant in the case of PR_ADDR that 2238 * requires MT_SONAME mbufs at the head of 2239 * each record. 2240 */ 2241 if (pr->pr_flags & PR_ATOMIC && 2242 ((flags & MSG_PEEK) == 0)) 2243 (void)sbdroprecord_locked(&so->so_rcv); 2244 SOCKBUF_UNLOCK(&so->so_rcv); 2245 goto release; 2246 } 2247 } else 2248 uio->uio_resid -= len; 2249 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2250 if (len == m->m_len - moff) { 2251 if (m->m_flags & M_EOR) 2252 flags |= MSG_EOR; 2253 if (flags & MSG_PEEK) { 2254 m = m->m_next; 2255 moff = 0; 2256 } else { 2257 nextrecord = m->m_nextpkt; 2258 sbfree(&so->so_rcv, m); 2259 if (mp != NULL) { 2260 m->m_nextpkt = NULL; 2261 *mp = m; 2262 mp = &m->m_next; 2263 so->so_rcv.sb_mb = m = m->m_next; 2264 *mp = NULL; 2265 } else { 2266 so->so_rcv.sb_mb = m_free(m); 2267 m = so->so_rcv.sb_mb; 2268 } 2269 sockbuf_pushsync(&so->so_rcv, nextrecord); 2270 SBLASTRECORDCHK(&so->so_rcv); 2271 SBLASTMBUFCHK(&so->so_rcv); 2272 } 2273 } else { 2274 if (flags & MSG_PEEK) 2275 moff += len; 2276 else { 2277 if (mp != NULL) { 2278 if (flags & MSG_DONTWAIT) { 2279 *mp = m_copym(m, 0, len, 2280 M_NOWAIT); 2281 if (*mp == NULL) { 2282 /* 2283 * m_copym() couldn't 2284 * allocate an mbuf. 2285 * Adjust uio_resid back 2286 * (it was adjusted 2287 * down by len bytes, 2288 * which we didn't end 2289 * up "copying" over). 2290 */ 2291 uio->uio_resid += len; 2292 break; 2293 } 2294 } else { 2295 SOCKBUF_UNLOCK(&so->so_rcv); 2296 *mp = m_copym(m, 0, len, 2297 M_WAITOK); 2298 SOCKBUF_LOCK(&so->so_rcv); 2299 } 2300 } 2301 sbcut_locked(&so->so_rcv, len); 2302 } 2303 } 2304 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2305 if (so->so_oobmark) { 2306 if ((flags & MSG_PEEK) == 0) { 2307 so->so_oobmark -= len; 2308 if (so->so_oobmark == 0) { 2309 so->so_rcv.sb_state |= SBS_RCVATMARK; 2310 break; 2311 } 2312 } else { 2313 offset += len; 2314 if (offset == so->so_oobmark) 2315 break; 2316 } 2317 } 2318 if (flags & MSG_EOR) 2319 break; 2320 /* 2321 * If the MSG_WAITALL flag is set (for non-atomic socket), we 2322 * must not quit until "uio->uio_resid == 0" or an error 2323 * termination. If a signal/timeout occurs, return with a 2324 * short count but without error. 
Keep sockbuf locked 2325 * against other readers. 2326 */ 2327 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 2328 !sosendallatonce(so) && nextrecord == NULL) { 2329 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2330 if (so->so_error || so->so_rerror || 2331 so->so_rcv.sb_state & SBS_CANTRCVMORE) 2332 break; 2333 /* 2334 * Notify the protocol that some data has been 2335 * drained before blocking. 2336 */ 2337 if (pr->pr_flags & PR_WANTRCVD) { 2338 SOCKBUF_UNLOCK(&so->so_rcv); 2339 VNET_SO_ASSERT(so); 2340 pr->pr_rcvd(so, flags); 2341 SOCKBUF_LOCK(&so->so_rcv); 2342 } 2343 SBLASTRECORDCHK(&so->so_rcv); 2344 SBLASTMBUFCHK(&so->so_rcv); 2345 /* 2346 * We could receive some data while was notifying 2347 * the protocol. Skip blocking in this case. 2348 */ 2349 if (so->so_rcv.sb_mb == NULL) { 2350 error = sbwait(so, SO_RCV); 2351 if (error) { 2352 SOCKBUF_UNLOCK(&so->so_rcv); 2353 goto release; 2354 } 2355 } 2356 m = so->so_rcv.sb_mb; 2357 if (m != NULL) 2358 nextrecord = m->m_nextpkt; 2359 } 2360 } 2361 2362 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2363 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 2364 if (report_real_len) 2365 uio->uio_resid -= m_length(m, NULL) - moff; 2366 flags |= MSG_TRUNC; 2367 if ((flags & MSG_PEEK) == 0) 2368 (void) sbdroprecord_locked(&so->so_rcv); 2369 } 2370 if ((flags & MSG_PEEK) == 0) { 2371 if (m == NULL) { 2372 /* 2373 * First part is an inline SB_EMPTY_FIXUP(). Second 2374 * part makes sure sb_lastrecord is up-to-date if 2375 * there is still data in the socket buffer. 2376 */ 2377 so->so_rcv.sb_mb = nextrecord; 2378 if (so->so_rcv.sb_mb == NULL) { 2379 so->so_rcv.sb_mbtail = NULL; 2380 so->so_rcv.sb_lastrecord = NULL; 2381 } else if (nextrecord->m_nextpkt == NULL) 2382 so->so_rcv.sb_lastrecord = nextrecord; 2383 } 2384 SBLASTRECORDCHK(&so->so_rcv); 2385 SBLASTMBUFCHK(&so->so_rcv); 2386 /* 2387 * If soreceive() is being done from the socket callback, 2388 * then don't need to generate ACK to peer to update window, 2389 * since ACK will be generated on return to TCP. 2390 */ 2391 if (!(flags & MSG_SOCALLBCK) && 2392 (pr->pr_flags & PR_WANTRCVD)) { 2393 SOCKBUF_UNLOCK(&so->so_rcv); 2394 VNET_SO_ASSERT(so); 2395 pr->pr_rcvd(so, flags); 2396 SOCKBUF_LOCK(&so->so_rcv); 2397 } 2398 } 2399 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2400 if (orig_resid == uio->uio_resid && orig_resid && 2401 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 2402 SOCKBUF_UNLOCK(&so->so_rcv); 2403 goto restart; 2404 } 2405 SOCKBUF_UNLOCK(&so->so_rcv); 2406 2407 if (flagsp != NULL) 2408 *flagsp |= flags; 2409 release: 2410 SOCK_IO_RECV_UNLOCK(so); 2411 return (error); 2412 } 2413 2414 /* 2415 * Optimized version of soreceive() for stream (TCP) sockets. 2416 */ 2417 int 2418 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 2419 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2420 { 2421 int len = 0, error = 0, flags, oresid; 2422 struct sockbuf *sb; 2423 struct mbuf *m, *n = NULL; 2424 2425 /* We only do stream sockets. */ 2426 if (so->so_type != SOCK_STREAM) 2427 return (EINVAL); 2428 if (psa != NULL) 2429 *psa = NULL; 2430 if (flagsp != NULL) 2431 flags = *flagsp &~ MSG_EOR; 2432 else 2433 flags = 0; 2434 if (controlp != NULL) 2435 *controlp = NULL; 2436 if (flags & MSG_OOB) 2437 return (soreceive_rcvoob(so, uio, flags)); 2438 if (mp0 != NULL) 2439 *mp0 = NULL; 2440 2441 sb = &so->so_rcv; 2442 2443 #ifdef KERN_TLS 2444 /* 2445 * KTLS store TLS records as records with a control message to 2446 * describe the framing. 
2447 * 2448 * We check once here before acquiring locks to optimize the 2449 * common case. 2450 */ 2451 if (sb->sb_tls_info != NULL) 2452 return (soreceive_generic(so, psa, uio, mp0, controlp, 2453 flagsp)); 2454 #endif 2455 2456 /* Prevent other readers from entering the socket. */ 2457 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 2458 if (error) 2459 return (error); 2460 SOCKBUF_LOCK(sb); 2461 2462 #ifdef KERN_TLS 2463 if (sb->sb_tls_info != NULL) { 2464 SOCKBUF_UNLOCK(sb); 2465 SOCK_IO_RECV_UNLOCK(so); 2466 return (soreceive_generic(so, psa, uio, mp0, controlp, 2467 flagsp)); 2468 } 2469 #endif 2470 2471 /* Easy one, no space to copyout anything. */ 2472 if (uio->uio_resid == 0) { 2473 error = EINVAL; 2474 goto out; 2475 } 2476 oresid = uio->uio_resid; 2477 2478 /* We will never ever get anything unless we are or were connected. */ 2479 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 2480 error = ENOTCONN; 2481 goto out; 2482 } 2483 2484 restart: 2485 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2486 2487 /* Abort if socket has reported problems. */ 2488 if (so->so_error) { 2489 if (sbavail(sb) > 0) 2490 goto deliver; 2491 if (oresid > uio->uio_resid) 2492 goto out; 2493 error = so->so_error; 2494 if (!(flags & MSG_PEEK)) 2495 so->so_error = 0; 2496 goto out; 2497 } 2498 2499 /* Door is closed. Deliver what is left, if any. */ 2500 if (sb->sb_state & SBS_CANTRCVMORE) { 2501 if (sbavail(sb) > 0) 2502 goto deliver; 2503 else 2504 goto out; 2505 } 2506 2507 /* Socket buffer is empty and we shall not block. */ 2508 if (sbavail(sb) == 0 && 2509 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 2510 error = EAGAIN; 2511 goto out; 2512 } 2513 2514 /* Socket buffer got some data that we shall deliver now. */ 2515 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 2516 ((so->so_state & SS_NBIO) || 2517 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 2518 sbavail(sb) >= sb->sb_lowat || 2519 sbavail(sb) >= uio->uio_resid || 2520 sbavail(sb) >= sb->sb_hiwat) ) { 2521 goto deliver; 2522 } 2523 2524 /* On MSG_WAITALL we must wait until all data or error arrives. */ 2525 if ((flags & MSG_WAITALL) && 2526 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 2527 goto deliver; 2528 2529 /* 2530 * Wait and block until (more) data comes in. 2531 * NB: Drops the sockbuf lock during wait. 2532 */ 2533 error = sbwait(so, SO_RCV); 2534 if (error) 2535 goto out; 2536 goto restart; 2537 2538 deliver: 2539 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2540 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 2541 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 2542 2543 /* Statistics. */ 2544 if (uio->uio_td) 2545 uio->uio_td->td_ru.ru_msgrcv++; 2546 2547 /* Fill uio until full or current end of socket buffer is reached. */ 2548 len = min(uio->uio_resid, sbavail(sb)); 2549 if (mp0 != NULL) { 2550 /* Dequeue as many mbufs as possible. */ 2551 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 2552 if (*mp0 == NULL) 2553 *mp0 = sb->sb_mb; 2554 else 2555 m_cat(*mp0, sb->sb_mb); 2556 for (m = sb->sb_mb; 2557 m != NULL && m->m_len <= len; 2558 m = m->m_next) { 2559 KASSERT(!(m->m_flags & M_NOTAVAIL), 2560 ("%s: m %p not available", __func__, m)); 2561 len -= m->m_len; 2562 uio->uio_resid -= m->m_len; 2563 sbfree(sb, m); 2564 n = m; 2565 } 2566 n->m_next = NULL; 2567 sb->sb_mb = m; 2568 sb->sb_lastrecord = sb->sb_mb; 2569 if (sb->sb_mb == NULL) 2570 SB_EMPTY_FIXUP(sb); 2571 } 2572 /* Copy the remainder. 
*/ 2573 if (len > 0) { 2574 KASSERT(sb->sb_mb != NULL, 2575 ("%s: len > 0 && sb->sb_mb empty", __func__)); 2576 2577 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 2578 if (m == NULL) 2579 len = 0; /* Don't flush data from sockbuf. */ 2580 else 2581 uio->uio_resid -= len; 2582 if (*mp0 != NULL) 2583 m_cat(*mp0, m); 2584 else 2585 *mp0 = m; 2586 if (*mp0 == NULL) { 2587 error = ENOBUFS; 2588 goto out; 2589 } 2590 } 2591 } else { 2592 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2593 SOCKBUF_UNLOCK(sb); 2594 error = m_mbuftouio(uio, sb->sb_mb, len); 2595 SOCKBUF_LOCK(sb); 2596 if (error) 2597 goto out; 2598 } 2599 SBLASTRECORDCHK(sb); 2600 SBLASTMBUFCHK(sb); 2601 2602 /* 2603 * Remove the delivered data from the socket buffer unless we 2604 * were only peeking. 2605 */ 2606 if (!(flags & MSG_PEEK)) { 2607 if (len > 0) 2608 sbdrop_locked(sb, len); 2609 2610 /* Notify protocol that we drained some data. */ 2611 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2612 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2613 !(flags & MSG_SOCALLBCK))) { 2614 SOCKBUF_UNLOCK(sb); 2615 VNET_SO_ASSERT(so); 2616 so->so_proto->pr_rcvd(so, flags); 2617 SOCKBUF_LOCK(sb); 2618 } 2619 } 2620 2621 /* 2622 * For MSG_WAITALL we may have to loop again and wait for 2623 * more data to come in. 2624 */ 2625 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2626 goto restart; 2627 out: 2628 SBLASTRECORDCHK(sb); 2629 SBLASTMBUFCHK(sb); 2630 SOCKBUF_UNLOCK(sb); 2631 SOCK_IO_RECV_UNLOCK(so); 2632 return (error); 2633 } 2634 2635 /* 2636 * Optimized version of soreceive() for simple datagram cases from userspace. 2637 * Unlike in the stream case, we're able to drop a datagram if copyout() 2638 * fails, and because we handle datagrams atomically, we don't need to use a 2639 * sleep lock to prevent I/O interlacing. 2640 */ 2641 int 2642 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 2643 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2644 { 2645 struct mbuf *m, *m2; 2646 int flags, error; 2647 ssize_t len; 2648 struct protosw *pr = so->so_proto; 2649 struct mbuf *nextrecord; 2650 2651 if (psa != NULL) 2652 *psa = NULL; 2653 if (controlp != NULL) 2654 *controlp = NULL; 2655 if (flagsp != NULL) 2656 flags = *flagsp &~ MSG_EOR; 2657 else 2658 flags = 0; 2659 2660 /* 2661 * For any complicated cases, fall back to the full 2662 * soreceive_generic(). 2663 */ 2664 if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) 2665 return (soreceive_generic(so, psa, uio, mp0, controlp, 2666 flagsp)); 2667 2668 /* 2669 * Enforce restrictions on use. 2670 */ 2671 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 2672 ("soreceive_dgram: wantrcvd")); 2673 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 2674 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 2675 ("soreceive_dgram: SBS_RCVATMARK")); 2676 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 2677 ("soreceive_dgram: P_CONNREQUIRED")); 2678 2679 /* 2680 * Loop blocking while waiting for a datagram. 
2681 */ 2682 SOCKBUF_LOCK(&so->so_rcv); 2683 while ((m = so->so_rcv.sb_mb) == NULL) { 2684 KASSERT(sbavail(&so->so_rcv) == 0, 2685 ("soreceive_dgram: sb_mb NULL but sbavail %u", 2686 sbavail(&so->so_rcv))); 2687 if (so->so_error) { 2688 error = so->so_error; 2689 so->so_error = 0; 2690 SOCKBUF_UNLOCK(&so->so_rcv); 2691 return (error); 2692 } 2693 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 2694 uio->uio_resid == 0) { 2695 SOCKBUF_UNLOCK(&so->so_rcv); 2696 return (0); 2697 } 2698 if ((so->so_state & SS_NBIO) || 2699 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2700 SOCKBUF_UNLOCK(&so->so_rcv); 2701 return (EWOULDBLOCK); 2702 } 2703 SBLASTRECORDCHK(&so->so_rcv); 2704 SBLASTMBUFCHK(&so->so_rcv); 2705 error = sbwait(so, SO_RCV); 2706 if (error) { 2707 SOCKBUF_UNLOCK(&so->so_rcv); 2708 return (error); 2709 } 2710 } 2711 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2712 2713 if (uio->uio_td) 2714 uio->uio_td->td_ru.ru_msgrcv++; 2715 SBLASTRECORDCHK(&so->so_rcv); 2716 SBLASTMBUFCHK(&so->so_rcv); 2717 nextrecord = m->m_nextpkt; 2718 if (nextrecord == NULL) { 2719 KASSERT(so->so_rcv.sb_lastrecord == m, 2720 ("soreceive_dgram: lastrecord != m")); 2721 } 2722 2723 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 2724 ("soreceive_dgram: m_nextpkt != nextrecord")); 2725 2726 /* 2727 * Pull 'm' and its chain off the front of the packet queue. 2728 */ 2729 so->so_rcv.sb_mb = NULL; 2730 sockbuf_pushsync(&so->so_rcv, nextrecord); 2731 2732 /* 2733 * Walk 'm's chain and free that many bytes from the socket buffer. 2734 */ 2735 for (m2 = m; m2 != NULL; m2 = m2->m_next) 2736 sbfree(&so->so_rcv, m2); 2737 2738 /* 2739 * Do a few last checks before we let go of the lock. 2740 */ 2741 SBLASTRECORDCHK(&so->so_rcv); 2742 SBLASTMBUFCHK(&so->so_rcv); 2743 SOCKBUF_UNLOCK(&so->so_rcv); 2744 2745 if (pr->pr_flags & PR_ADDR) { 2746 KASSERT(m->m_type == MT_SONAME, 2747 ("m->m_type == %d", m->m_type)); 2748 if (psa != NULL) 2749 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2750 M_NOWAIT); 2751 m = m_free(m); 2752 } 2753 if (m == NULL) { 2754 /* XXXRW: Can this happen? */ 2755 return (0); 2756 } 2757 2758 /* 2759 * Packet to copyout() is now in 'm' and it is disconnected from the 2760 * queue. 2761 * 2762 * Process one or more MT_CONTROL mbufs present before any data mbufs 2763 * in the first mbuf chain on the socket buffer. We call into the 2764 * protocol to perform externalization (or freeing if controlp == 2765 * NULL). In some cases there can be only MT_CONTROL mbufs without 2766 * MT_DATA mbufs. 
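 * For instance, a zero-length UDP datagram received on a socket that has
 * requested ancillary data (e.g. via SO_TIMESTAMP) can queue a record
 * whose only contents, once the address mbuf is consumed, are MT_CONTROL
 * mbufs.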
2767 */ 2768 if (m->m_type == MT_CONTROL) { 2769 struct mbuf *cm = NULL, *cmn; 2770 struct mbuf **cme = &cm; 2771 2772 do { 2773 m2 = m->m_next; 2774 m->m_next = NULL; 2775 *cme = m; 2776 cme = &(*cme)->m_next; 2777 m = m2; 2778 } while (m != NULL && m->m_type == MT_CONTROL); 2779 while (cm != NULL) { 2780 cmn = cm->m_next; 2781 cm->m_next = NULL; 2782 if (pr->pr_domain->dom_externalize != NULL) { 2783 error = (*pr->pr_domain->dom_externalize) 2784 (cm, controlp, flags); 2785 } else if (controlp != NULL) 2786 *controlp = cm; 2787 else 2788 m_freem(cm); 2789 if (controlp != NULL) { 2790 while (*controlp != NULL) 2791 controlp = &(*controlp)->m_next; 2792 } 2793 cm = cmn; 2794 } 2795 } 2796 KASSERT(m == NULL || m->m_type == MT_DATA, 2797 ("soreceive_dgram: !data")); 2798 while (m != NULL && uio->uio_resid > 0) { 2799 len = uio->uio_resid; 2800 if (len > m->m_len) 2801 len = m->m_len; 2802 error = uiomove(mtod(m, char *), (int)len, uio); 2803 if (error) { 2804 m_freem(m); 2805 return (error); 2806 } 2807 if (len == m->m_len) 2808 m = m_free(m); 2809 else { 2810 m->m_data += len; 2811 m->m_len -= len; 2812 } 2813 } 2814 if (m != NULL) { 2815 flags |= MSG_TRUNC; 2816 m_freem(m); 2817 } 2818 if (flagsp != NULL) 2819 *flagsp |= flags; 2820 return (0); 2821 } 2822 2823 int 2824 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2825 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2826 { 2827 int error; 2828 2829 CURVNET_SET(so->so_vnet); 2830 error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); 2831 CURVNET_RESTORE(); 2832 return (error); 2833 } 2834 2835 int 2836 soshutdown(struct socket *so, int how) 2837 { 2838 struct protosw *pr; 2839 int error, soerror_enotconn; 2840 2841 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 2842 return (EINVAL); 2843 2844 soerror_enotconn = 0; 2845 SOCK_LOCK(so); 2846 if ((so->so_state & 2847 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { 2848 /* 2849 * POSIX mandates us to return ENOTCONN when shutdown(2) is 2850 * invoked on a datagram sockets, however historically we would 2851 * actually tear socket down. This is known to be leveraged by 2852 * some applications to unblock process waiting in recvXXX(2) 2853 * by other process that it shares that socket with. Try to meet 2854 * both backward-compatibility and POSIX requirements by forcing 2855 * ENOTCONN but still asking protocol to perform pru_shutdown(). 2856 */ 2857 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) { 2858 SOCK_UNLOCK(so); 2859 return (ENOTCONN); 2860 } 2861 soerror_enotconn = 1; 2862 } 2863 2864 if (SOLISTENING(so)) { 2865 if (how != SHUT_WR) { 2866 so->so_error = ECONNABORTED; 2867 solisten_wakeup(so); /* unlocks so */ 2868 } else { 2869 SOCK_UNLOCK(so); 2870 } 2871 goto done; 2872 } 2873 SOCK_UNLOCK(so); 2874 2875 CURVNET_SET(so->so_vnet); 2876 pr = so->so_proto; 2877 if (pr->pr_flush != NULL) 2878 pr->pr_flush(so, how); 2879 if (how != SHUT_WR) 2880 sorflush(so); 2881 if (how != SHUT_RD) { 2882 error = pr->pr_shutdown(so); 2883 wakeup(&so->so_timeo); 2884 CURVNET_RESTORE(); 2885 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); 2886 } 2887 wakeup(&so->so_timeo); 2888 CURVNET_RESTORE(); 2889 2890 done: 2891 return (soerror_enotconn ? 
ENOTCONN : 0); 2892 } 2893 2894 void 2895 sorflush(struct socket *so) 2896 { 2897 struct protosw *pr; 2898 int error; 2899 2900 VNET_SO_ASSERT(so); 2901 2902 /* 2903 * Dislodge threads currently blocked in receive and wait to acquire 2904 * a lock against other simultaneous readers before clearing the 2905 * socket buffer. Don't let our acquire be interrupted by a signal 2906 * despite any existing socket disposition on interruptable waiting. 2907 */ 2908 socantrcvmore(so); 2909 2910 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); 2911 if (error != 0) { 2912 KASSERT(SOLISTENING(so), 2913 ("%s: soiolock(%p) failed", __func__, so)); 2914 return; 2915 } 2916 2917 pr = so->so_proto; 2918 if (pr->pr_flags & PR_RIGHTS) { 2919 MPASS(pr->pr_domain->dom_dispose != NULL); 2920 (*pr->pr_domain->dom_dispose)(so); 2921 } else { 2922 sbrelease(so, SO_RCV); 2923 SOCK_IO_RECV_UNLOCK(so); 2924 } 2925 2926 } 2927 2928 /* 2929 * Wrapper for Socket established helper hook. 2930 * Parameters: socket, context of the hook point, hook id. 2931 */ 2932 static int inline 2933 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 2934 { 2935 struct socket_hhook_data hhook_data = { 2936 .so = so, 2937 .hctx = hctx, 2938 .m = NULL, 2939 .status = 0 2940 }; 2941 2942 CURVNET_SET(so->so_vnet); 2943 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 2944 CURVNET_RESTORE(); 2945 2946 /* Ugly but needed, since hhooks return void for now */ 2947 return (hhook_data.status); 2948 } 2949 2950 /* 2951 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 2952 * additional variant to handle the case where the option value needs to be 2953 * some kind of integer, but not a specific size. In addition to their use 2954 * here, these functions are also called by the protocol-level pr_ctloutput() 2955 * routines. 2956 */ 2957 int 2958 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 2959 { 2960 size_t valsize; 2961 2962 /* 2963 * If the user gives us more than we wanted, we ignore it, but if we 2964 * don't get the minimum length the caller wants, we return EINVAL. 2965 * On success, sopt->sopt_valsize is set to however much we actually 2966 * retrieved. 2967 */ 2968 if ((valsize = sopt->sopt_valsize) < minlen) 2969 return EINVAL; 2970 if (valsize > len) 2971 sopt->sopt_valsize = valsize = len; 2972 2973 if (sopt->sopt_td != NULL) 2974 return (copyin(sopt->sopt_val, buf, valsize)); 2975 2976 bcopy(sopt->sopt_val, buf, valsize); 2977 return (0); 2978 } 2979 2980 /* 2981 * Kernel version of setsockopt(2). 
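 * It builds a SOPT_SET sockopt with sopt_td == NULL, so the option value
 * is taken from kernel memory rather than copied in from userland.  An
 * in-kernel caller might use it as in this sketch:
 *
 *	int one = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));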
2982 * 2983 * XXX: optlen is size_t, not socklen_t 2984 */ 2985 int 2986 so_setsockopt(struct socket *so, int level, int optname, void *optval, 2987 size_t optlen) 2988 { 2989 struct sockopt sopt; 2990 2991 sopt.sopt_level = level; 2992 sopt.sopt_name = optname; 2993 sopt.sopt_dir = SOPT_SET; 2994 sopt.sopt_val = optval; 2995 sopt.sopt_valsize = optlen; 2996 sopt.sopt_td = NULL; 2997 return (sosetopt(so, &sopt)); 2998 } 2999 3000 int 3001 sosetopt(struct socket *so, struct sockopt *sopt) 3002 { 3003 int error, optval; 3004 struct linger l; 3005 struct timeval tv; 3006 sbintime_t val, *valp; 3007 uint32_t val32; 3008 #ifdef MAC 3009 struct mac extmac; 3010 #endif 3011 3012 CURVNET_SET(so->so_vnet); 3013 error = 0; 3014 if (sopt->sopt_level != SOL_SOCKET) { 3015 if (so->so_proto->pr_ctloutput != NULL) 3016 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3017 else 3018 error = ENOPROTOOPT; 3019 } else { 3020 switch (sopt->sopt_name) { 3021 case SO_ACCEPTFILTER: 3022 error = accept_filt_setopt(so, sopt); 3023 if (error) 3024 goto bad; 3025 break; 3026 3027 case SO_LINGER: 3028 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 3029 if (error) 3030 goto bad; 3031 if (l.l_linger < 0 || 3032 l.l_linger > USHRT_MAX || 3033 l.l_linger > (INT_MAX / hz)) { 3034 error = EDOM; 3035 goto bad; 3036 } 3037 SOCK_LOCK(so); 3038 so->so_linger = l.l_linger; 3039 if (l.l_onoff) 3040 so->so_options |= SO_LINGER; 3041 else 3042 so->so_options &= ~SO_LINGER; 3043 SOCK_UNLOCK(so); 3044 break; 3045 3046 case SO_DEBUG: 3047 case SO_KEEPALIVE: 3048 case SO_DONTROUTE: 3049 case SO_USELOOPBACK: 3050 case SO_BROADCAST: 3051 case SO_REUSEADDR: 3052 case SO_REUSEPORT: 3053 case SO_REUSEPORT_LB: 3054 case SO_OOBINLINE: 3055 case SO_TIMESTAMP: 3056 case SO_BINTIME: 3057 case SO_NOSIGPIPE: 3058 case SO_NO_DDP: 3059 case SO_NO_OFFLOAD: 3060 case SO_RERROR: 3061 error = sooptcopyin(sopt, &optval, sizeof optval, 3062 sizeof optval); 3063 if (error) 3064 goto bad; 3065 SOCK_LOCK(so); 3066 if (optval) 3067 so->so_options |= sopt->sopt_name; 3068 else 3069 so->so_options &= ~sopt->sopt_name; 3070 SOCK_UNLOCK(so); 3071 break; 3072 3073 case SO_SETFIB: 3074 error = sooptcopyin(sopt, &optval, sizeof optval, 3075 sizeof optval); 3076 if (error) 3077 goto bad; 3078 3079 if (optval < 0 || optval >= rt_numfibs) { 3080 error = EINVAL; 3081 goto bad; 3082 } 3083 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 3084 (so->so_proto->pr_domain->dom_family == PF_INET6) || 3085 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 3086 so->so_fibnum = optval; 3087 else 3088 so->so_fibnum = 0; 3089 break; 3090 3091 case SO_USER_COOKIE: 3092 error = sooptcopyin(sopt, &val32, sizeof val32, 3093 sizeof val32); 3094 if (error) 3095 goto bad; 3096 so->so_user_cookie = val32; 3097 break; 3098 3099 case SO_SNDBUF: 3100 case SO_RCVBUF: 3101 case SO_SNDLOWAT: 3102 case SO_RCVLOWAT: 3103 error = sooptcopyin(sopt, &optval, sizeof optval, 3104 sizeof optval); 3105 if (error) 3106 goto bad; 3107 3108 /* 3109 * Values < 1 make no sense for any of these options, 3110 * so disallow them. 
3111 */ 3112 if (optval < 1) { 3113 error = EINVAL; 3114 goto bad; 3115 } 3116 3117 error = sbsetopt(so, sopt->sopt_name, optval); 3118 break; 3119 3120 case SO_SNDTIMEO: 3121 case SO_RCVTIMEO: 3122 #ifdef COMPAT_FREEBSD32 3123 if (SV_CURPROC_FLAG(SV_ILP32)) { 3124 struct timeval32 tv32; 3125 3126 error = sooptcopyin(sopt, &tv32, sizeof tv32, 3127 sizeof tv32); 3128 CP(tv32, tv, tv_sec); 3129 CP(tv32, tv, tv_usec); 3130 } else 3131 #endif 3132 error = sooptcopyin(sopt, &tv, sizeof tv, 3133 sizeof tv); 3134 if (error) 3135 goto bad; 3136 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 3137 tv.tv_usec >= 1000000) { 3138 error = EDOM; 3139 goto bad; 3140 } 3141 if (tv.tv_sec > INT32_MAX) 3142 val = SBT_MAX; 3143 else 3144 val = tvtosbt(tv); 3145 SOCK_LOCK(so); 3146 valp = sopt->sopt_name == SO_SNDTIMEO ? 3147 (SOLISTENING(so) ? &so->sol_sbsnd_timeo : 3148 &so->so_snd.sb_timeo) : 3149 (SOLISTENING(so) ? &so->sol_sbrcv_timeo : 3150 &so->so_rcv.sb_timeo); 3151 *valp = val; 3152 SOCK_UNLOCK(so); 3153 break; 3154 3155 case SO_LABEL: 3156 #ifdef MAC 3157 error = sooptcopyin(sopt, &extmac, sizeof extmac, 3158 sizeof extmac); 3159 if (error) 3160 goto bad; 3161 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 3162 so, &extmac); 3163 #else 3164 error = EOPNOTSUPP; 3165 #endif 3166 break; 3167 3168 case SO_TS_CLOCK: 3169 error = sooptcopyin(sopt, &optval, sizeof optval, 3170 sizeof optval); 3171 if (error) 3172 goto bad; 3173 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 3174 error = EINVAL; 3175 goto bad; 3176 } 3177 so->so_ts_clock = optval; 3178 break; 3179 3180 case SO_MAX_PACING_RATE: 3181 error = sooptcopyin(sopt, &val32, sizeof(val32), 3182 sizeof(val32)); 3183 if (error) 3184 goto bad; 3185 so->so_max_pacing_rate = val32; 3186 break; 3187 3188 default: 3189 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3190 error = hhook_run_socket(so, sopt, 3191 HHOOK_SOCKET_OPT); 3192 else 3193 error = ENOPROTOOPT; 3194 break; 3195 } 3196 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 3197 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 3198 } 3199 bad: 3200 CURVNET_RESTORE(); 3201 return (error); 3202 } 3203 3204 /* 3205 * Helper routine for getsockopt. 3206 */ 3207 int 3208 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 3209 { 3210 int error; 3211 size_t valsize; 3212 3213 error = 0; 3214 3215 /* 3216 * Documented get behavior is that we always return a value, possibly 3217 * truncated to fit in the user's buffer. Traditional behavior is 3218 * that we always tell the user precisely how much we copied, rather 3219 * than something useful like the total amount we had available for 3220 * her. Note that this interface is not idempotent; the entire 3221 * answer must be generated ahead of time. 
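 *
 * A protocol's pr_ctloutput() handler typically returns a fixed-size
 * option value through this routine, as in the following sketch (where
 * 'optval' is whatever integer the handler computed):
 *
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));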
3222 */ 3223 valsize = min(len, sopt->sopt_valsize); 3224 sopt->sopt_valsize = valsize; 3225 if (sopt->sopt_val != NULL) { 3226 if (sopt->sopt_td != NULL) 3227 error = copyout(buf, sopt->sopt_val, valsize); 3228 else 3229 bcopy(buf, sopt->sopt_val, valsize); 3230 } 3231 return (error); 3232 } 3233 3234 int 3235 sogetopt(struct socket *so, struct sockopt *sopt) 3236 { 3237 int error, optval; 3238 struct linger l; 3239 struct timeval tv; 3240 #ifdef MAC 3241 struct mac extmac; 3242 #endif 3243 3244 CURVNET_SET(so->so_vnet); 3245 error = 0; 3246 if (sopt->sopt_level != SOL_SOCKET) { 3247 if (so->so_proto->pr_ctloutput != NULL) 3248 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3249 else 3250 error = ENOPROTOOPT; 3251 CURVNET_RESTORE(); 3252 return (error); 3253 } else { 3254 switch (sopt->sopt_name) { 3255 case SO_ACCEPTFILTER: 3256 error = accept_filt_getopt(so, sopt); 3257 break; 3258 3259 case SO_LINGER: 3260 SOCK_LOCK(so); 3261 l.l_onoff = so->so_options & SO_LINGER; 3262 l.l_linger = so->so_linger; 3263 SOCK_UNLOCK(so); 3264 error = sooptcopyout(sopt, &l, sizeof l); 3265 break; 3266 3267 case SO_USELOOPBACK: 3268 case SO_DONTROUTE: 3269 case SO_DEBUG: 3270 case SO_KEEPALIVE: 3271 case SO_REUSEADDR: 3272 case SO_REUSEPORT: 3273 case SO_REUSEPORT_LB: 3274 case SO_BROADCAST: 3275 case SO_OOBINLINE: 3276 case SO_ACCEPTCONN: 3277 case SO_TIMESTAMP: 3278 case SO_BINTIME: 3279 case SO_NOSIGPIPE: 3280 case SO_NO_DDP: 3281 case SO_NO_OFFLOAD: 3282 case SO_RERROR: 3283 optval = so->so_options & sopt->sopt_name; 3284 integer: 3285 error = sooptcopyout(sopt, &optval, sizeof optval); 3286 break; 3287 3288 case SO_DOMAIN: 3289 optval = so->so_proto->pr_domain->dom_family; 3290 goto integer; 3291 3292 case SO_TYPE: 3293 optval = so->so_type; 3294 goto integer; 3295 3296 case SO_PROTOCOL: 3297 optval = so->so_proto->pr_protocol; 3298 goto integer; 3299 3300 case SO_ERROR: 3301 SOCK_LOCK(so); 3302 if (so->so_error) { 3303 optval = so->so_error; 3304 so->so_error = 0; 3305 } else { 3306 optval = so->so_rerror; 3307 so->so_rerror = 0; 3308 } 3309 SOCK_UNLOCK(so); 3310 goto integer; 3311 3312 case SO_SNDBUF: 3313 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 3314 so->so_snd.sb_hiwat; 3315 goto integer; 3316 3317 case SO_RCVBUF: 3318 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 3319 so->so_rcv.sb_hiwat; 3320 goto integer; 3321 3322 case SO_SNDLOWAT: 3323 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 3324 so->so_snd.sb_lowat; 3325 goto integer; 3326 3327 case SO_RCVLOWAT: 3328 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 3329 so->so_rcv.sb_lowat; 3330 goto integer; 3331 3332 case SO_SNDTIMEO: 3333 case SO_RCVTIMEO: 3334 SOCK_LOCK(so); 3335 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 3336 (SOLISTENING(so) ? so->sol_sbsnd_timeo : 3337 so->so_snd.sb_timeo) : 3338 (SOLISTENING(so) ? 
so->sol_sbrcv_timeo : 3339 so->so_rcv.sb_timeo)); 3340 SOCK_UNLOCK(so); 3341 #ifdef COMPAT_FREEBSD32 3342 if (SV_CURPROC_FLAG(SV_ILP32)) { 3343 struct timeval32 tv32; 3344 3345 CP(tv, tv32, tv_sec); 3346 CP(tv, tv32, tv_usec); 3347 error = sooptcopyout(sopt, &tv32, sizeof tv32); 3348 } else 3349 #endif 3350 error = sooptcopyout(sopt, &tv, sizeof tv); 3351 break; 3352 3353 case SO_LABEL: 3354 #ifdef MAC 3355 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3356 sizeof(extmac)); 3357 if (error) 3358 goto bad; 3359 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 3360 so, &extmac); 3361 if (error) 3362 goto bad; 3363 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3364 #else 3365 error = EOPNOTSUPP; 3366 #endif 3367 break; 3368 3369 case SO_PEERLABEL: 3370 #ifdef MAC 3371 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3372 sizeof(extmac)); 3373 if (error) 3374 goto bad; 3375 error = mac_getsockopt_peerlabel( 3376 sopt->sopt_td->td_ucred, so, &extmac); 3377 if (error) 3378 goto bad; 3379 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3380 #else 3381 error = EOPNOTSUPP; 3382 #endif 3383 break; 3384 3385 case SO_LISTENQLIMIT: 3386 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 3387 goto integer; 3388 3389 case SO_LISTENQLEN: 3390 optval = SOLISTENING(so) ? so->sol_qlen : 0; 3391 goto integer; 3392 3393 case SO_LISTENINCQLEN: 3394 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 3395 goto integer; 3396 3397 case SO_TS_CLOCK: 3398 optval = so->so_ts_clock; 3399 goto integer; 3400 3401 case SO_MAX_PACING_RATE: 3402 optval = so->so_max_pacing_rate; 3403 goto integer; 3404 3405 default: 3406 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3407 error = hhook_run_socket(so, sopt, 3408 HHOOK_SOCKET_OPT); 3409 else 3410 error = ENOPROTOOPT; 3411 break; 3412 } 3413 } 3414 #ifdef MAC 3415 bad: 3416 #endif 3417 CURVNET_RESTORE(); 3418 return (error); 3419 } 3420 3421 int 3422 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 3423 { 3424 struct mbuf *m, *m_prev; 3425 int sopt_size = sopt->sopt_valsize; 3426 3427 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3428 if (m == NULL) 3429 return ENOBUFS; 3430 if (sopt_size > MLEN) { 3431 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 3432 if ((m->m_flags & M_EXT) == 0) { 3433 m_free(m); 3434 return ENOBUFS; 3435 } 3436 m->m_len = min(MCLBYTES, sopt_size); 3437 } else { 3438 m->m_len = min(MLEN, sopt_size); 3439 } 3440 sopt_size -= m->m_len; 3441 *mp = m; 3442 m_prev = m; 3443 3444 while (sopt_size) { 3445 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3446 if (m == NULL) { 3447 m_freem(*mp); 3448 return ENOBUFS; 3449 } 3450 if (sopt_size > MLEN) { 3451 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 3452 M_NOWAIT); 3453 if ((m->m_flags & M_EXT) == 0) { 3454 m_freem(m); 3455 m_freem(*mp); 3456 return ENOBUFS; 3457 } 3458 m->m_len = min(MCLBYTES, sopt_size); 3459 } else { 3460 m->m_len = min(MLEN, sopt_size); 3461 } 3462 sopt_size -= m->m_len; 3463 m_prev->m_next = m; 3464 m_prev = m; 3465 } 3466 return (0); 3467 } 3468 3469 int 3470 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 3471 { 3472 struct mbuf *m0 = m; 3473 3474 if (sopt->sopt_val == NULL) 3475 return (0); 3476 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3477 if (sopt->sopt_td != NULL) { 3478 int error; 3479 3480 error = copyin(sopt->sopt_val, mtod(m, char *), 3481 m->m_len); 3482 if (error != 0) { 3483 m_freem(m0); 3484 return(error); 3485 } 3486 } else 3487 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 3488 sopt->sopt_valsize -= m->m_len; 3489 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3490 m = m->m_next; 3491 } 3492 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 3493 panic("ip6_sooptmcopyin"); 3494 return (0); 3495 } 3496 3497 int 3498 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 3499 { 3500 struct mbuf *m0 = m; 3501 size_t valsize = 0; 3502 3503 if (sopt->sopt_val == NULL) 3504 return (0); 3505 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3506 if (sopt->sopt_td != NULL) { 3507 int error; 3508 3509 error = copyout(mtod(m, char *), sopt->sopt_val, 3510 m->m_len); 3511 if (error != 0) { 3512 m_freem(m0); 3513 return(error); 3514 } 3515 } else 3516 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 3517 sopt->sopt_valsize -= m->m_len; 3518 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3519 valsize += m->m_len; 3520 m = m->m_next; 3521 } 3522 if (m != NULL) { 3523 /* enough soopt buffer should be given from user-land */ 3524 m_freem(m0); 3525 return(EINVAL); 3526 } 3527 sopt->sopt_valsize = valsize; 3528 return (0); 3529 } 3530 3531 /* 3532 * sohasoutofband(): protocol notifies socket layer of the arrival of new 3533 * out-of-band data, which will then notify socket consumers. 3534 */ 3535 void 3536 sohasoutofband(struct socket *so) 3537 { 3538 3539 if (so->so_sigio != NULL) 3540 pgsigio(&so->so_sigio, SIGURG, 0); 3541 selwakeuppri(&so->so_rdsel, PSOCK); 3542 } 3543 3544 int 3545 sopoll(struct socket *so, int events, struct ucred *active_cred, 3546 struct thread *td) 3547 { 3548 3549 /* 3550 * We do not need to set or assert curvnet as long as everyone uses 3551 * sopoll_generic(). 
3552 */ 3553 return (so->so_proto->pr_sopoll(so, events, active_cred, td)); 3554 } 3555 3556 int 3557 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 3558 struct thread *td) 3559 { 3560 int revents; 3561 3562 SOCK_LOCK(so); 3563 if (SOLISTENING(so)) { 3564 if (!(events & (POLLIN | POLLRDNORM))) 3565 revents = 0; 3566 else if (!TAILQ_EMPTY(&so->sol_comp)) 3567 revents = events & (POLLIN | POLLRDNORM); 3568 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 3569 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 3570 else { 3571 selrecord(td, &so->so_rdsel); 3572 revents = 0; 3573 } 3574 } else { 3575 revents = 0; 3576 SOCK_SENDBUF_LOCK(so); 3577 SOCK_RECVBUF_LOCK(so); 3578 if (events & (POLLIN | POLLRDNORM)) 3579 if (soreadabledata(so)) 3580 revents |= events & (POLLIN | POLLRDNORM); 3581 if (events & (POLLOUT | POLLWRNORM)) 3582 if (sowriteable(so)) 3583 revents |= events & (POLLOUT | POLLWRNORM); 3584 if (events & (POLLPRI | POLLRDBAND)) 3585 if (so->so_oobmark || 3586 (so->so_rcv.sb_state & SBS_RCVATMARK)) 3587 revents |= events & (POLLPRI | POLLRDBAND); 3588 if ((events & POLLINIGNEOF) == 0) { 3589 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3590 revents |= events & (POLLIN | POLLRDNORM); 3591 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 3592 revents |= POLLHUP; 3593 } 3594 } 3595 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 3596 revents |= events & POLLRDHUP; 3597 if (revents == 0) { 3598 if (events & 3599 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { 3600 selrecord(td, &so->so_rdsel); 3601 so->so_rcv.sb_flags |= SB_SEL; 3602 } 3603 if (events & (POLLOUT | POLLWRNORM)) { 3604 selrecord(td, &so->so_wrsel); 3605 so->so_snd.sb_flags |= SB_SEL; 3606 } 3607 } 3608 SOCK_RECVBUF_UNLOCK(so); 3609 SOCK_SENDBUF_UNLOCK(so); 3610 } 3611 SOCK_UNLOCK(so); 3612 return (revents); 3613 } 3614 3615 int 3616 soo_kqfilter(struct file *fp, struct knote *kn) 3617 { 3618 struct socket *so = kn->kn_fp->f_data; 3619 struct sockbuf *sb; 3620 sb_which which; 3621 struct knlist *knl; 3622 3623 switch (kn->kn_filter) { 3624 case EVFILT_READ: 3625 kn->kn_fop = &soread_filtops; 3626 knl = &so->so_rdsel.si_note; 3627 sb = &so->so_rcv; 3628 which = SO_RCV; 3629 break; 3630 case EVFILT_WRITE: 3631 kn->kn_fop = &sowrite_filtops; 3632 knl = &so->so_wrsel.si_note; 3633 sb = &so->so_snd; 3634 which = SO_SND; 3635 break; 3636 case EVFILT_EMPTY: 3637 kn->kn_fop = &soempty_filtops; 3638 knl = &so->so_wrsel.si_note; 3639 sb = &so->so_snd; 3640 which = SO_SND; 3641 break; 3642 default: 3643 return (EINVAL); 3644 } 3645 3646 SOCK_LOCK(so); 3647 if (SOLISTENING(so)) { 3648 knlist_add(knl, kn, 1); 3649 } else { 3650 SOCK_BUF_LOCK(so, which); 3651 knlist_add(knl, kn, 1); 3652 sb->sb_flags |= SB_KNOTE; 3653 SOCK_BUF_UNLOCK(so, which); 3654 } 3655 SOCK_UNLOCK(so); 3656 return (0); 3657 } 3658 3659 static void 3660 filt_sordetach(struct knote *kn) 3661 { 3662 struct socket *so = kn->kn_fp->f_data; 3663 3664 so_rdknl_lock(so); 3665 knlist_remove(&so->so_rdsel.si_note, kn, 1); 3666 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 3667 so->so_rcv.sb_flags &= ~SB_KNOTE; 3668 so_rdknl_unlock(so); 3669 } 3670 3671 /*ARGSUSED*/ 3672 static int 3673 filt_soread(struct knote *kn, long hint) 3674 { 3675 struct socket *so; 3676 3677 so = kn->kn_fp->f_data; 3678 3679 if (SOLISTENING(so)) { 3680 SOCK_LOCK_ASSERT(so); 3681 kn->kn_data = so->sol_qlen; 3682 if (so->so_error) { 3683 kn->kn_flags |= EV_EOF; 3684 kn->kn_fflags = so->so_error; 3685 return (1); 3686 } 3687 return 
(!TAILQ_EMPTY(&so->sol_comp)); 3688 } 3689 3690 SOCK_RECVBUF_LOCK_ASSERT(so); 3691 3692 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 3693 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3694 kn->kn_flags |= EV_EOF; 3695 kn->kn_fflags = so->so_error; 3696 return (1); 3697 } else if (so->so_error || so->so_rerror) 3698 return (1); 3699 3700 if (kn->kn_sfflags & NOTE_LOWAT) { 3701 if (kn->kn_data >= kn->kn_sdata) 3702 return (1); 3703 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 3704 return (1); 3705 3706 /* This hook returning non-zero indicates an event, not error */ 3707 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 3708 } 3709 3710 static void 3711 filt_sowdetach(struct knote *kn) 3712 { 3713 struct socket *so = kn->kn_fp->f_data; 3714 3715 so_wrknl_lock(so); 3716 knlist_remove(&so->so_wrsel.si_note, kn, 1); 3717 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 3718 so->so_snd.sb_flags &= ~SB_KNOTE; 3719 so_wrknl_unlock(so); 3720 } 3721 3722 /*ARGSUSED*/ 3723 static int 3724 filt_sowrite(struct knote *kn, long hint) 3725 { 3726 struct socket *so; 3727 3728 so = kn->kn_fp->f_data; 3729 3730 if (SOLISTENING(so)) 3731 return (0); 3732 3733 SOCK_SENDBUF_LOCK_ASSERT(so); 3734 kn->kn_data = sbspace(&so->so_snd); 3735 3736 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 3737 3738 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3739 kn->kn_flags |= EV_EOF; 3740 kn->kn_fflags = so->so_error; 3741 return (1); 3742 } else if (so->so_error) /* temporary udp error */ 3743 return (1); 3744 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3745 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3746 return (0); 3747 else if (kn->kn_sfflags & NOTE_LOWAT) 3748 return (kn->kn_data >= kn->kn_sdata); 3749 else 3750 return (kn->kn_data >= so->so_snd.sb_lowat); 3751 } 3752 3753 static int 3754 filt_soempty(struct knote *kn, long hint) 3755 { 3756 struct socket *so; 3757 3758 so = kn->kn_fp->f_data; 3759 3760 if (SOLISTENING(so)) 3761 return (1); 3762 3763 SOCK_SENDBUF_LOCK_ASSERT(so); 3764 kn->kn_data = sbused(&so->so_snd); 3765 3766 if (kn->kn_data == 0) 3767 return (1); 3768 else 3769 return (0); 3770 } 3771 3772 int 3773 socheckuid(struct socket *so, uid_t uid) 3774 { 3775 3776 if (so == NULL) 3777 return (EPERM); 3778 if (so->so_cred->cr_uid != uid) 3779 return (EPERM); 3780 return (0); 3781 } 3782 3783 /* 3784 * These functions are used by protocols to notify the socket layer (and its 3785 * consumers) of state changes in the sockets driven by protocol-side events. 3786 */ 3787 3788 /* 3789 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3790 * 3791 * Normal sequence from the active (originating) side is that 3792 * soisconnecting() is called during processing of connect() call, resulting 3793 * in an eventual call to soisconnected() if/when the connection is 3794 * established. When the connection is torn down soisdisconnecting() is 3795 * called during processing of disconnect() call, and soisdisconnected() is 3796 * called when the connection to the peer is totally severed. The semantics 3797 * of these routines are such that connectionless protocols can call 3798 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3799 * calls when setting up a ``connection'' takes no time. 3800 * 3801 * From the passive side, a socket is created with two queues of sockets: 3802 * so_incomp for connections in progress and so_comp for connections already 3803 * made and awaiting user acceptance. 
As a protocol is preparing incoming 3804 * connections, it creates a socket structure queued on so_incomp by calling 3805 * sonewconn(). When the connection is established, soisconnected() is 3806 * called, and transfers the socket structure to so_comp, making it available 3807 * to accept(). 3808 * 3809 * If a socket is closed with sockets on either so_incomp or so_comp, these 3810 * sockets are dropped. 3811 * 3812 * If higher-level protocols are implemented in the kernel, the wakeups done 3813 * here will sometimes cause software-interrupt process scheduling. 3814 */ 3815 void 3816 soisconnecting(struct socket *so) 3817 { 3818 3819 SOCK_LOCK(so); 3820 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3821 so->so_state |= SS_ISCONNECTING; 3822 SOCK_UNLOCK(so); 3823 } 3824 3825 void 3826 soisconnected(struct socket *so) 3827 { 3828 bool last __diagused; 3829 3830 SOCK_LOCK(so); 3831 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3832 so->so_state |= SS_ISCONNECTED; 3833 3834 if (so->so_qstate == SQ_INCOMP) { 3835 struct socket *head = so->so_listen; 3836 int ret; 3837 3838 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 3839 /* 3840 * Promoting a socket from incomplete queue to complete, we 3841 * need to go through reverse order of locking. We first do 3842 * trylock, and if that doesn't succeed, we go the hard way 3843 * leaving a reference and rechecking consistency after proper 3844 * locking. 3845 */ 3846 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 3847 soref(head); 3848 SOCK_UNLOCK(so); 3849 SOLISTEN_LOCK(head); 3850 SOCK_LOCK(so); 3851 if (__predict_false(head != so->so_listen)) { 3852 /* 3853 * The socket went off the listen queue, 3854 * should be lost race to close(2) of sol. 3855 * The socket is about to soabort(). 3856 */ 3857 SOCK_UNLOCK(so); 3858 sorele_locked(head); 3859 return; 3860 } 3861 last = refcount_release(&head->so_count); 3862 KASSERT(!last, ("%s: released last reference for %p", 3863 __func__, head)); 3864 } 3865 again: 3866 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3867 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 3868 head->sol_incqlen--; 3869 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 3870 head->sol_qlen++; 3871 so->so_qstate = SQ_COMP; 3872 SOCK_UNLOCK(so); 3873 solisten_wakeup(head); /* unlocks */ 3874 } else { 3875 SOCK_RECVBUF_LOCK(so); 3876 soupcall_set(so, SO_RCV, 3877 head->sol_accept_filter->accf_callback, 3878 head->sol_accept_filter_arg); 3879 so->so_options &= ~SO_ACCEPTFILTER; 3880 ret = head->sol_accept_filter->accf_callback(so, 3881 head->sol_accept_filter_arg, M_NOWAIT); 3882 if (ret == SU_ISCONNECTED) { 3883 soupcall_clear(so, SO_RCV); 3884 SOCK_RECVBUF_UNLOCK(so); 3885 goto again; 3886 } 3887 SOCK_RECVBUF_UNLOCK(so); 3888 SOCK_UNLOCK(so); 3889 SOLISTEN_UNLOCK(head); 3890 } 3891 return; 3892 } 3893 SOCK_UNLOCK(so); 3894 wakeup(&so->so_timeo); 3895 sorwakeup(so); 3896 sowwakeup(so); 3897 } 3898 3899 void 3900 soisdisconnecting(struct socket *so) 3901 { 3902 3903 SOCK_LOCK(so); 3904 so->so_state &= ~SS_ISCONNECTING; 3905 so->so_state |= SS_ISDISCONNECTING; 3906 3907 if (!SOLISTENING(so)) { 3908 SOCK_RECVBUF_LOCK(so); 3909 socantrcvmore_locked(so); 3910 SOCK_SENDBUF_LOCK(so); 3911 socantsendmore_locked(so); 3912 } 3913 SOCK_UNLOCK(so); 3914 wakeup(&so->so_timeo); 3915 } 3916 3917 void 3918 soisdisconnected(struct socket *so) 3919 { 3920 3921 SOCK_LOCK(so); 3922 3923 /* 3924 * There is at least one reader of so_state that does not 3925 * acquire socket lock, namely soreceive_generic(). 
Ensure 3926 * that it never sees all flags that track connection status 3927 * cleared, by ordering the update with a barrier semantic of 3928 * our release thread fence. 3929 */ 3930 so->so_state |= SS_ISDISCONNECTED; 3931 atomic_thread_fence_rel(); 3932 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3933 3934 if (!SOLISTENING(so)) { 3935 SOCK_UNLOCK(so); 3936 SOCK_RECVBUF_LOCK(so); 3937 socantrcvmore_locked(so); 3938 SOCK_SENDBUF_LOCK(so); 3939 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 3940 socantsendmore_locked(so); 3941 } else 3942 SOCK_UNLOCK(so); 3943 wakeup(&so->so_timeo); 3944 } 3945 3946 int 3947 soiolock(struct socket *so, struct sx *sx, int flags) 3948 { 3949 int error; 3950 3951 KASSERT((flags & SBL_VALID) == flags, 3952 ("soiolock: invalid flags %#x", flags)); 3953 3954 if ((flags & SBL_WAIT) != 0) { 3955 if ((flags & SBL_NOINTR) != 0) { 3956 sx_xlock(sx); 3957 } else { 3958 error = sx_xlock_sig(sx); 3959 if (error != 0) 3960 return (error); 3961 } 3962 } else if (!sx_try_xlock(sx)) { 3963 return (EWOULDBLOCK); 3964 } 3965 3966 if (__predict_false(SOLISTENING(so))) { 3967 sx_xunlock(sx); 3968 return (ENOTCONN); 3969 } 3970 return (0); 3971 } 3972 3973 void 3974 soiounlock(struct sx *sx) 3975 { 3976 sx_xunlock(sx); 3977 } 3978 3979 /* 3980 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3981 */ 3982 struct sockaddr * 3983 sodupsockaddr(const struct sockaddr *sa, int mflags) 3984 { 3985 struct sockaddr *sa2; 3986 3987 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3988 if (sa2) 3989 bcopy(sa, sa2, sa->sa_len); 3990 return sa2; 3991 } 3992 3993 /* 3994 * Register per-socket destructor. 3995 */ 3996 void 3997 sodtor_set(struct socket *so, so_dtor_t *func) 3998 { 3999 4000 SOCK_LOCK_ASSERT(so); 4001 so->so_dtor = func; 4002 } 4003 4004 /* 4005 * Register per-socket buffer upcalls. 
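 * The caller must hold the lock of the socket buffer being modified, as
 * the assertion below requires; a registration sketch ('my_upcall' and
 * 'arg' being a hypothetical so_upcall_t and its argument) looks like:
 *
 *	SOCK_RECVBUF_LOCK(so);
 *	soupcall_set(so, SO_RCV, my_upcall, arg);
 *	SOCK_RECVBUF_UNLOCK(so);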
4006 */ 4007 void 4008 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) 4009 { 4010 struct sockbuf *sb; 4011 4012 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4013 4014 switch (which) { 4015 case SO_RCV: 4016 sb = &so->so_rcv; 4017 break; 4018 case SO_SND: 4019 sb = &so->so_snd; 4020 break; 4021 } 4022 SOCK_BUF_LOCK_ASSERT(so, which); 4023 sb->sb_upcall = func; 4024 sb->sb_upcallarg = arg; 4025 sb->sb_flags |= SB_UPCALL; 4026 } 4027 4028 void 4029 soupcall_clear(struct socket *so, sb_which which) 4030 { 4031 struct sockbuf *sb; 4032 4033 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 4034 4035 switch (which) { 4036 case SO_RCV: 4037 sb = &so->so_rcv; 4038 break; 4039 case SO_SND: 4040 sb = &so->so_snd; 4041 break; 4042 } 4043 SOCK_BUF_LOCK_ASSERT(so, which); 4044 KASSERT(sb->sb_upcall != NULL, 4045 ("%s: so %p no upcall to clear", __func__, so)); 4046 sb->sb_upcall = NULL; 4047 sb->sb_upcallarg = NULL; 4048 sb->sb_flags &= ~SB_UPCALL; 4049 } 4050 4051 void 4052 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) 4053 { 4054 4055 SOLISTEN_LOCK_ASSERT(so); 4056 so->sol_upcall = func; 4057 so->sol_upcallarg = arg; 4058 } 4059 4060 static void 4061 so_rdknl_lock(void *arg) 4062 { 4063 struct socket *so = arg; 4064 4065 retry: 4066 if (SOLISTENING(so)) { 4067 SOLISTEN_LOCK(so); 4068 } else { 4069 SOCK_RECVBUF_LOCK(so); 4070 if (__predict_false(SOLISTENING(so))) { 4071 SOCK_RECVBUF_UNLOCK(so); 4072 goto retry; 4073 } 4074 } 4075 } 4076 4077 static void 4078 so_rdknl_unlock(void *arg) 4079 { 4080 struct socket *so = arg; 4081 4082 if (SOLISTENING(so)) 4083 SOLISTEN_UNLOCK(so); 4084 else 4085 SOCK_RECVBUF_UNLOCK(so); 4086 } 4087 4088 static void 4089 so_rdknl_assert_lock(void *arg, int what) 4090 { 4091 struct socket *so = arg; 4092 4093 if (what == LA_LOCKED) { 4094 if (SOLISTENING(so)) 4095 SOLISTEN_LOCK_ASSERT(so); 4096 else 4097 SOCK_RECVBUF_LOCK_ASSERT(so); 4098 } else { 4099 if (SOLISTENING(so)) 4100 SOLISTEN_UNLOCK_ASSERT(so); 4101 else 4102 SOCK_RECVBUF_UNLOCK_ASSERT(so); 4103 } 4104 } 4105 4106 static void 4107 so_wrknl_lock(void *arg) 4108 { 4109 struct socket *so = arg; 4110 4111 retry: 4112 if (SOLISTENING(so)) { 4113 SOLISTEN_LOCK(so); 4114 } else { 4115 SOCK_SENDBUF_LOCK(so); 4116 if (__predict_false(SOLISTENING(so))) { 4117 SOCK_SENDBUF_UNLOCK(so); 4118 goto retry; 4119 } 4120 } 4121 } 4122 4123 static void 4124 so_wrknl_unlock(void *arg) 4125 { 4126 struct socket *so = arg; 4127 4128 if (SOLISTENING(so)) 4129 SOLISTEN_UNLOCK(so); 4130 else 4131 SOCK_SENDBUF_UNLOCK(so); 4132 } 4133 4134 static void 4135 so_wrknl_assert_lock(void *arg, int what) 4136 { 4137 struct socket *so = arg; 4138 4139 if (what == LA_LOCKED) { 4140 if (SOLISTENING(so)) 4141 SOLISTEN_LOCK_ASSERT(so); 4142 else 4143 SOCK_SENDBUF_LOCK_ASSERT(so); 4144 } else { 4145 if (SOLISTENING(so)) 4146 SOLISTEN_UNLOCK_ASSERT(so); 4147 else 4148 SOCK_SENDBUF_UNLOCK_ASSERT(so); 4149 } 4150 } 4151 4152 /* 4153 * Create an external-format (``xsocket'') structure using the information in 4154 * the kernel-format socket structure pointed to by so. This is done to 4155 * reduce the spew of irrelevant information over this interface, to isolate 4156 * user code from changes in the kernel structure, and potentially to provide 4157 * information-hiding if we decide that some of this information should be 4158 * hidden from users. 
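 *
 * A typical consumer is a pcblist-style sysctl handler that exports one
 * xsocket per socket, roughly as in this sketch (pcb iteration and
 * locking omitted; 'req' is the sysctl request):
 *
 *	struct xsocket xso;
 *
 *	sotoxsocket(so, &xso);
 *	error = SYSCTL_OUT(req, &xso, sizeof(xso));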
4159 */ 4160 void 4161 sotoxsocket(struct socket *so, struct xsocket *xso) 4162 { 4163 4164 bzero(xso, sizeof(*xso)); 4165 xso->xso_len = sizeof *xso; 4166 xso->xso_so = (uintptr_t)so; 4167 xso->so_type = so->so_type; 4168 xso->so_options = so->so_options; 4169 xso->so_linger = so->so_linger; 4170 xso->so_state = so->so_state; 4171 xso->so_pcb = (uintptr_t)so->so_pcb; 4172 xso->xso_protocol = so->so_proto->pr_protocol; 4173 xso->xso_family = so->so_proto->pr_domain->dom_family; 4174 xso->so_timeo = so->so_timeo; 4175 xso->so_error = so->so_error; 4176 xso->so_uid = so->so_cred->cr_uid; 4177 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 4178 if (SOLISTENING(so)) { 4179 xso->so_qlen = so->sol_qlen; 4180 xso->so_incqlen = so->sol_incqlen; 4181 xso->so_qlimit = so->sol_qlimit; 4182 xso->so_oobmark = 0; 4183 } else { 4184 xso->so_state |= so->so_qstate; 4185 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; 4186 xso->so_oobmark = so->so_oobmark; 4187 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 4188 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 4189 } 4190 } 4191 4192 struct sockbuf * 4193 so_sockbuf_rcv(struct socket *so) 4194 { 4195 4196 return (&so->so_rcv); 4197 } 4198 4199 struct sockbuf * 4200 so_sockbuf_snd(struct socket *so) 4201 { 4202 4203 return (&so->so_snd); 4204 } 4205 4206 int 4207 so_state_get(const struct socket *so) 4208 { 4209 4210 return (so->so_state); 4211 } 4212 4213 void 4214 so_state_set(struct socket *so, int val) 4215 { 4216 4217 so->so_state = val; 4218 } 4219 4220 int 4221 so_options_get(const struct socket *so) 4222 { 4223 4224 return (so->so_options); 4225 } 4226 4227 void 4228 so_options_set(struct socket *so, int val) 4229 { 4230 4231 so->so_options = val; 4232 } 4233 4234 int 4235 so_error_get(const struct socket *so) 4236 { 4237 4238 return (so->so_error); 4239 } 4240 4241 void 4242 so_error_set(struct socket *so, int val) 4243 { 4244 4245 so->so_error = val; 4246 } 4247 4248 int 4249 so_linger_get(const struct socket *so) 4250 { 4251 4252 return (so->so_linger); 4253 } 4254 4255 void 4256 so_linger_set(struct socket *so, int val) 4257 { 4258 4259 KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), 4260 ("%s: val %d out of range", __func__, val)); 4261 4262 so->so_linger = val; 4263 } 4264 4265 struct protosw * 4266 so_protosw_get(const struct socket *so) 4267 { 4268 4269 return (so->so_proto); 4270 } 4271 4272 void 4273 so_protosw_set(struct socket *so, struct protosw *val) 4274 { 4275 4276 so->so_proto = val; 4277 } 4278 4279 void 4280 so_sorwakeup(struct socket *so) 4281 { 4282 4283 sorwakeup(so); 4284 } 4285 4286 void 4287 so_sowwakeup(struct socket *so) 4288 { 4289 4290 sowwakeup(so); 4291 } 4292 4293 void 4294 so_sorwakeup_locked(struct socket *so) 4295 { 4296 4297 sorwakeup_locked(so); 4298 } 4299 4300 void 4301 so_sowwakeup_locked(struct socket *so) 4302 { 4303 4304 sowwakeup_locked(so); 4305 } 4306 4307 void 4308 so_lock(struct socket *so) 4309 { 4310 4311 SOCK_LOCK(so); 4312 } 4313 4314 void 4315 so_unlock(struct socket *so) 4316 { 4317 4318 SOCK_UNLOCK(so); 4319 } 4320
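/*
 * Usage sketch for the accessor and wakeup wrappers above: code that
 * prefers not to reach into struct socket directly (for example, an
 * offload or protocol module) can combine them along these lines.
 *
 *	if ((so_state_get(so) & SS_ISCONNECTED) != 0 &&
 *	    so_error_get(so) == 0)
 *		so_sorwakeup(so);
 */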