/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released.  It is called when a reference to the
 * socket is removed, and is a socket layer private interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref(), and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs the general rule is that callers do not set
 * curvnet.  Exceptions to this rule include soabort(), sodisconnect(),
 * sofree(), sorele(), sonewconn() and sorflush(), which are usually called
 * from a pre-set VNET context.  sopoll() currently does not need a VNET
 * context to be set.
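 *
 * As a rough illustration of the public interfaces above, an in-kernel
 * consumer pairs socreate() with soclose().  The sketch below is
 * illustrative only and is not part of this file; 'sin' stands for a
 * caller-supplied struct sockaddr_in:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    curthread->td_ucred, curthread);
 *	if (error == 0) {
 *		error = sobind(so, (struct sockaddr *)&sin, curthread);
 *		...
 *		soclose(so);
 *	}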
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/hhook.h>
#include <sys/kernel.h>
#include <sys/khelp.h>
#include <sys/ktls.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <sys/jail.h>
#include <sys/syslog.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>

#include <net/vnet.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);
static void	so_rdknl_lock(void *);
static void	so_rdknl_unlock(void *);
static void	so_rdknl_assert_lock(void *, int);
static void	so_wrknl_lock(void *);
static void	so_wrknl_unlock(void *);
static void	so_wrknl_assert_lock(void *, int);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_soempty(struct knote *kn, long hint);
static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
fo_kqfilter_t	soo_kqfilter;

static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};
static struct filterops soempty_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_soempty,
};

so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
#define	V_socket_hhh		VNET(socket_hhh)

/*
 * Limit on the number of connections in the listen queue waiting
 * for accept(2).
 * NB: The original sysctl somaxconn is still available but hidden
 * to prevent confusion about the actual purpose of this number.
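 *
 * For example, the limit can be inspected or raised at runtime through the
 * renamed sysctl defined below, e.g. `sysctl kern.ipc.soacceptqueue=1024'
 * (illustrative value; the default is SOMAXCONN).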
 */
static u_int somaxconn = SOMAXCONN;

static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	/*
	 * The purpose of the UINT_MAX / 3 limit is so that the formula
	 *	3 * so_qlimit / 2
	 * used below will not overflow.
	 */

	if (val < 1 || val > UINT_MAX / 3)
		return (EINVAL);

	somaxconn = val;
	return (0);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
    sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size");
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0,
    sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size (compat)");

static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPC");

/*
 * Initialize the socket subsystem and set up the socket
 * memory allocator.
 */
static uma_zone_t socket_zone;
int	maxsockets;

static void
socket_zone_change(void *tag)
{

	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
}

static void
socket_hhook_register(int subtype)
{

	if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
	    &V_socket_hhh[subtype],
	    HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
		printf("%s: WARNING: unable to register hook\n", __func__);
}

static void
socket_hhook_deregister(int subtype)
{

	if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
		printf("%s: WARNING: unable to deregister hook\n", __func__);
}

static void
socket_init(void *tag)
{

	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);

static void
socket_vnet_init(const void *unused __unused)
{
	int i;

	/* We expect a contiguous range */
	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
		socket_hhook_register(i);
}
VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
    socket_vnet_init, NULL);

static void
socket_vnet_uninit(const void *unused __unused)
{
	int i;

	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
		socket_hhook_deregister(i);
}
VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
    socket_vnet_uninit, NULL);

/*
 * Initialise maxsockets.  This SYSINIT must be run after
 * tunable_mbinit().
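 *
 * maxsockets is also exported as the kern.ipc.maxsockets loader tunable and
 * sysctl (see sysctl_maxsockets() below); for example, a larger limit could
 * be requested at boot by placing a line such as
 * kern.ipc.maxsockets="262144" in loader.conf(5) (illustrative value).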
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, maxfiles);
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
	if (error == 0 && req->newptr && newmaxsockets != maxsockets) {
		if (newmaxsockets > maxsockets &&
		    newmaxsockets <= maxfiles) {
			maxsockets = newmaxsockets;
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
    CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}

	/*
	 * The socket locking protocol allows to lock 2 sockets at a time,
	 * however, the first one must be a listening socket.  WITNESS lacks
	 * a feature to change class of an existing lock, so we use DUPOK.
	 */
	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
	mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
	mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
	so->so_rcv.sb_sel = &so->so_rdsel;
	so->so_snd.sb_sel = &so->so_wrsel;
	sx_init(&so->so_snd_sx, "so_snd_sx");
	sx_init(&so->so_rcv_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_snd.sb_aiojobq);
	TAILQ_INIT(&so->so_rcv.sb_aiojobq);
	TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
	TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
#ifdef VIMAGE
	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet = vnet;
#endif
	/* We shouldn't need the so_global_mtx */
	if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
		/* Do we need more comprehensive error returns? */
		uma_zfree(socket_zone, so);
		return (NULL);
	}
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	vnet->vnet_sockcnt++;
#endif
	mtx_unlock(&so_global_mtx);

	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
#ifdef MAC
	mac_socket_destroy(so);
#endif
	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);

	khelp_destroy_osd(&so->osd);
	if (SOLISTENING(so)) {
		if (so->sol_accept_filter != NULL)
			accept_filt_setopt(so, NULL);
	} else {
		if (so->so_rcv.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
		if (so->so_snd.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
		sx_destroy(&so->so_snd_sx);
		sx_destroy(&so->so_rcv_sx);
		mtx_destroy(&so->so_snd_mtx);
		mtx_destroy(&so->so_rcv_mtx);
	}
	crfree(so->so_cred);
	mtx_destroy(&so->so_lock);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1 and a file descriptor
 * reference.  The socket should be closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	/*
	 * XXX: divert(4) historically abused PF_INET.  Keep this
	 * compatibility shim until all applications have been updated.
	 */
	if (__predict_false(dom == PF_INET && type == SOCK_RAW &&
	    proto == IPPROTO_DIVERT)) {
		dom = PF_DIVERT;
		printf("%s uses obsolete way to create divert(4) socket\n",
		    td->td_proc->p_comm);
	}

	prp = pffindproto(dom, type, proto);
	if (prp == NULL) {
		/* No support for domain. */
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		/* No support for socket type. */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}

	MPASS(prp->pr_attach);

	if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0)
		return (ECAPMODE);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	if ((prp->pr_flags & PR_SOCKBUF) == 0) {
		so->so_snd.sb_mtx = &so->so_snd_mtx;
		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
	}
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = prp->pr_attach(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		sodealloc(so);
		return (error);
	}
	soref(so);
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

static int sooverprio = LOG_DEBUG;
SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW,
    &sooverprio, 0,
    "Log priority for listen socket overflows: 0..7 or -1 to disable");

static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");

/*
 * When an attempt at a new connection is noted on a socket which supports
 * accept(2), the protocol has two options:
 * 1) Call legacy sonewconn() function, which would call protocol attach
 *    method, same as used for socket(2).
 * 2) Call solisten_clone(), do attach that is specific to a cloned
 *    connection, and then call solisten_enqueue().
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
solisten_clone(struct socket *head)
{
	struct sbuf descrsb;
	struct socket *so;
	int len, overcount;
	u_int qlen;
	const char localprefix[] = "local:";
	char descrbuf[SUNPATHLEN + sizeof(localprefix)];
#if defined(INET6)
	char addrbuf[INET6_ADDRSTRLEN];
#elif defined(INET)
	char addrbuf[INET_ADDRSTRLEN];
#endif
	bool dolog, over;

	SOLISTEN_LOCK(head);
	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over) {
#else
	if (over) {
#endif
		head->sol_overcount++;
		dolog = (sooverprio >= 0) &&
		    !!ratecheck(&head->sol_lastover, &overinterval);

		/*
		 * If we're going to log, copy the overflow count and queue
		 * length from the listen socket before dropping the lock.
		 * Also, reset the overflow count.
		 */
		if (dolog) {
			overcount = head->sol_overcount;
			head->sol_overcount = 0;
			qlen = head->sol_qlen;
		}
		SOLISTEN_UNLOCK(head);

		if (dolog) {
			/*
			 * Try to print something descriptive about the
			 * socket for the error message.
			 */
			sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
			    SBUF_FIXEDLEN);
			switch (head->so_proto->pr_domain->dom_family) {
#if defined(INET) || defined(INET6)
#ifdef INET
			case AF_INET:
#endif
#ifdef INET6
			case AF_INET6:
				if (head->so_proto->pr_domain->dom_family ==
				    AF_INET6 ||
				    (sotoinpcb(head)->inp_inc.inc_flags &
				    INC_ISIPV6)) {
					ip6_sprintf(addrbuf,
					    &sotoinpcb(head)->inp_inc.inc6_laddr);
					sbuf_printf(&descrsb, "[%s]", addrbuf);
				} else
#endif
				{
#ifdef INET
					inet_ntoa_r(
					    sotoinpcb(head)->inp_inc.inc_laddr,
					    addrbuf);
					sbuf_cat(&descrsb, addrbuf);
#endif
				}
				sbuf_printf(&descrsb, ":%hu (proto %u)",
				    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
				    head->so_proto->pr_protocol);
				break;
#endif /* INET || INET6 */
			case AF_UNIX:
				sbuf_cat(&descrsb, localprefix);
				if (sotounpcb(head)->unp_addr != NULL)
					len =
					    sotounpcb(head)->unp_addr->sun_len -
					    offsetof(struct sockaddr_un,
					    sun_path);
				else
					len = 0;
				if (len > 0)
					sbuf_bcat(&descrsb,
					    sotounpcb(head)->unp_addr->sun_path,
					    len);
				else
					sbuf_cat(&descrsb, "(unknown)");
				break;
			}

			/*
			 * If we can't print something more specific, at least
			 * print the domain name.
			 */
			if (sbuf_finish(&descrsb) != 0 ||
			    sbuf_len(&descrsb) <= 0) {
				sbuf_clear(&descrsb);
				sbuf_cat(&descrsb,
				    head->so_proto->pr_domain->dom_name ?:
				    "unknown");
				sbuf_finish(&descrsb);
			}
			KASSERT(sbuf_len(&descrsb) > 0,
			    ("%s: sbuf creation failed", __func__));
			/*
			 * Preserve the historic listen queue overflow log
			 * message, that starts with "sonewconn:".  It has
			 * been known to sysadmins for years and also test
			 * sys/kern/sonewconn_overflow checks for it.
			 */
			if (head->so_cred == 0) {
				log(LOG_PRI(sooverprio),
				    "sonewconn: pcb %p (%s): "
				    "Listen queue overflow: %i already in "
				    "queue awaiting acceptance (%d "
				    "occurrences)\n", head->so_pcb,
				    sbuf_data(&descrsb),
				    qlen, overcount);
			} else {
				log(LOG_PRI(sooverprio),
				    "sonewconn: pcb %p (%s): "
				    "Listen queue overflow: "
				    "%i already in queue awaiting acceptance "
				    "(%d occurrences), euid %d, rgid %d, jail %s\n",
				    head->so_pcb, sbuf_data(&descrsb), qlen,
				    overcount, head->so_cred->cr_uid,
				    head->so_cred->cr_rgid,
				    head->so_cred->cr_prison ?
				    head->so_cred->cr_prison->pr_name :
				    "not_jailed");
			}
			sbuf_delete(&descrsb);

			overcount = 0;
		}

		return (NULL);
	}
	SOLISTEN_UNLOCK(head);
	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
	    __func__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_listen = head;
	so->so_type = head->so_type;
	/*
	 * POSIX is ambiguous on what options an accept(2)ed socket should
	 * inherit from the listener.  Words "create a new socket" may be
	 * interpreted as not inheriting anything.  Best programming practice
	 * for application developers is to not rely on such inheritance.
	 * FreeBSD had historically inherited all so_options excluding
	 * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options,
	 * including those completely irrelevant to a new born socket.  For
	 * compatibility with older versions we will inherit a list of
	 * meaningful options.
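	 *
	 * For example, an option such as SO_REUSEADDR set on the listener is
	 * not in the mask below and therefore is not carried over to the
	 * accepted socket; an application that wants it there must set it
	 * explicitly with setsockopt(2).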
	 */
	so->so_options = head->so_options & (SO_KEEPALIVE | SO_DONTROUTE |
	    SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE);
	so->so_linger = head->so_linger;
	so->so_state = head->so_state;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
	so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
	if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
		so->so_snd.sb_mtx = &so->so_snd_mtx;
		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
	}

	return (so);
}

/* Connstatus may be 0 or SS_ISCONNECTED. */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;

	if ((so = solisten_clone(head)) == NULL)
		return (NULL);

	if (so->so_proto->pr_attach(so, 0, NULL) != 0) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}

	(void)solisten_enqueue(so, connstatus);

	return (so);
}

/*
 * Enqueue socket cloned by solisten_clone() to the listen queue of the
 * listener it has been cloned from.
 *
 * Return 'true' if socket landed on complete queue, otherwise 'false'.
 */
bool
solisten_enqueue(struct socket *so, int connstatus)
{
	struct socket *head = so->so_listen;

	MPASS(refcount_load(&so->so_count) == 0);
	refcount_init(&so->so_count, 1);

	SOLISTEN_LOCK(head);
	if (head->sol_accept_filter != NULL)
		connstatus = 0;
	so->so_state |= connstatus;
	soref(head);	/* A socket on (in)complete queue refs head. */
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
		so->so_qstate = SQ_COMP;
		head->sol_qlen++;
		solisten_wakeup(head);	/* unlocks */
		return (true);
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->sol_incqlen > head->sol_qlimit) {
			struct socket *sp;

			sp = TAILQ_FIRST(&head->sol_incomp);
			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
			head->sol_incqlen--;
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			sorele_locked(head);	/* does SOLISTEN_UNLOCK, head stays */
			soabort(sp);
			SOLISTEN_LOCK(head);
		}
		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
		so->so_qstate = SQ_INCOMP;
		head->sol_incqlen++;
		SOLISTEN_UNLOCK(head);
		return (false);
	}
}

#if defined(SCTP) || defined(SCTP_SUPPORT)
/*
 * Socket part of sctp_peeloff().  Detach a new socket from an
 * association.  The new socket is returned with a reference.
 *
 * XXXGL: reduce copy-paste with solisten_clone().
 */
struct socket *
sopeeloff(struct socket *head)
{
	struct socket *so;

	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_type = head->so_type;
	so->so_options = head->so_options;
	so->so_linger = head->so_linger;
	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((*so->so_proto->pr_attach)(so, 0, NULL)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;

	soref(so);

	return (so);
}
#endif	/* SCTP */

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_bind(so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

int
sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_bindat(fd, so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Call backs are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_listen(so, backlog, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * Prepare for a call to solisten_proto().  Acquire all socket buffer locks
 * in order to interlock with socket I/O.
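 *
 * A protocol's pr_listen method is expected to use these helpers roughly as
 * in the following sketch (illustrative only; real protocols interleave
 * their own locking and protocol-specific checks):
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0 && <protocol state permits listening>)
 *		solisten_proto(so, backlog);
 *	else if (error == 0)
 *		solisten_proto_abort(so);
 *	SOCK_UNLOCK(so);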
 */
int
solisten_proto_check(struct socket *so)
{
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) != 0)
		return (EINVAL);

	/*
	 * Sleeping is not permitted here, so simply fail if userspace is
	 * attempting to transmit or receive on the socket.  This kind of
	 * transient failure is not ideal, but it should occur only if
	 * userspace is misusing the socket interfaces.
	 */
	if (!sx_try_xlock(&so->so_snd_sx))
		return (EAGAIN);
	if (!sx_try_xlock(&so->so_rcv_sx)) {
		sx_xunlock(&so->so_snd_sx);
		return (EAGAIN);
	}
	mtx_lock(&so->so_snd_mtx);
	mtx_lock(&so->so_rcv_mtx);

	/* Interlock with soo_aio_queue() and KTLS. */
	if (!SOLISTENING(so)) {
		bool ktls;

#ifdef KERN_TLS
		ktls = so->so_snd.sb_tls_info != NULL ||
		    so->so_rcv.sb_tls_info != NULL;
#else
		ktls = false;
#endif
		if (ktls ||
		    (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 ||
		    (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) {
			solisten_proto_abort(so);
			return (EINVAL);
		}
	}

	return (0);
}

/*
 * Undo the setup done by solisten_proto_check().
 */
void
solisten_proto_abort(struct socket *so)
{
	mtx_unlock(&so->so_snd_mtx);
	mtx_unlock(&so->so_rcv_mtx);
	sx_xunlock(&so->so_snd_sx);
	sx_xunlock(&so->so_rcv_sx);
}

void
solisten_proto(struct socket *so, int backlog)
{
	int sbrcv_lowat, sbsnd_lowat;
	u_int sbrcv_hiwat, sbsnd_hiwat;
	short sbrcv_flags, sbsnd_flags;
	sbintime_t sbrcv_timeo, sbsnd_timeo;

	SOCK_LOCK_ASSERT(so);
	KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) == 0,
	    ("%s: bad socket state %p", __func__, so));

	if (SOLISTENING(so))
		goto listening;

	/*
	 * Change this socket to listening state.
	 */
	sbrcv_lowat = so->so_rcv.sb_lowat;
	sbsnd_lowat = so->so_snd.sb_lowat;
	sbrcv_hiwat = so->so_rcv.sb_hiwat;
	sbsnd_hiwat = so->so_snd.sb_hiwat;
	sbrcv_flags = so->so_rcv.sb_flags;
	sbsnd_flags = so->so_snd.sb_flags;
	sbrcv_timeo = so->so_rcv.sb_timeo;
	sbsnd_timeo = so->so_snd.sb_timeo;

	sbdestroy(so, SO_SND);
	sbdestroy(so, SO_RCV);

#ifdef INVARIANTS
	bzero(&so->so_rcv,
	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
#endif

	so->sol_sbrcv_lowat = sbrcv_lowat;
	so->sol_sbsnd_lowat = sbsnd_lowat;
	so->sol_sbrcv_hiwat = sbrcv_hiwat;
	so->sol_sbsnd_hiwat = sbsnd_hiwat;
	so->sol_sbrcv_flags = sbrcv_flags;
	so->sol_sbsnd_flags = sbsnd_flags;
	so->sol_sbrcv_timeo = sbrcv_timeo;
	so->sol_sbsnd_timeo = sbsnd_timeo;

	so->sol_qlen = so->sol_incqlen = 0;
	TAILQ_INIT(&so->sol_incomp);
	TAILQ_INIT(&so->sol_comp);

	so->sol_accept_filter = NULL;
	so->sol_accept_filter_arg = NULL;
	so->sol_accept_filter_str = NULL;

	so->sol_upcall = NULL;
	so->sol_upcallarg = NULL;

	so->so_options |= SO_ACCEPTCONN;

listening:
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->sol_qlimit = backlog;

	mtx_unlock(&so->so_snd_mtx);
	mtx_unlock(&so->so_rcv_mtx);
	sx_xunlock(&so->so_snd_sx);
	sx_xunlock(&so->so_rcv_sx);
}

/*
 * Wakeup listeners/subsystems once we have a complete connection.
 * Enters with lock, returns unlocked.
 */
void
solisten_wakeup(struct socket *sol)
{

	if (sol->sol_upcall != NULL)
		(void)sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
	else {
		selwakeuppri(&sol->so_rdsel, PSOCK);
		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
	}
	SOLISTEN_UNLOCK(sol);
	wakeup_one(&sol->sol_comp);
	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
		pgsigio(&sol->so_sigio, SIGIO, 0);
}

/*
 * Return single connection off a listening socket queue.  Main consumer of
 * the function is kern_accept4().  Some modules that do their own accept
 * management also use the function.  The socket reference held by the
 * listen queue is handed to the caller.
 *
 * Listening socket must be locked on entry and is returned unlocked on
 * return.
 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
 */
int
solisten_dequeue(struct socket *head, struct socket **ret, int flags)
{
	struct socket *so;
	int error;

	SOLISTEN_LOCK_ASSERT(head);

	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
	    head->so_error == 0) {
		error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			SOLISTEN_UNLOCK(head);
			return (error);
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
		error = EWOULDBLOCK;
	else
		error = 0;
	if (error) {
		SOLISTEN_UNLOCK(head);
		return (error);
	}
	so = TAILQ_FIRST(&head->sol_comp);
	SOCK_LOCK(so);
	KASSERT(so->so_qstate == SQ_COMP,
	    ("%s: so %p not SQ_COMP", __func__, so));
	head->sol_qlen--;
	so->so_qstate = SQ_NONE;
	so->so_listen = NULL;
	TAILQ_REMOVE(&head->sol_comp, so, so_list);
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	SOCK_UNLOCK(so);
	sorele_locked(head);

	*ret = so;
	return (0);
}

/*
 * Free socket upon release of the very last reference.
 */
static void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;

	SOCK_LOCK_ASSERT(so);
	KASSERT(refcount_load(&so->so_count) == 0,
	    ("%s: so %p has references", __func__, so));
	KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
	    ("%s: so %p is on listen queue", __func__, so));

	SOCK_UNLOCK(so);

	if (so->so_dtor != NULL)
		so->so_dtor(so);

	VNET_SO_ASSERT(so);
	if (pr->pr_detach != NULL)
		pr->pr_detach(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 */
	if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) {
		sbdestroy(so, SO_SND);
		sbdestroy(so, SO_RCV);
	}
	seldrain(&so->so_rdsel);
	seldrain(&so->so_wrsel);
	knlist_destroy(&so->so_rdsel.si_note);
	knlist_destroy(&so->so_wrsel.si_note);
	sodealloc(so);
}

/*
 * Release a reference on a socket while holding the socket lock.
 * Unlocks the socket lock before returning.
 */
void
sorele_locked(struct socket *so)
{
	SOCK_LOCK_ASSERT(so);
	if (refcount_release(&so->so_count))
		sofree(so);
	else
		SOCK_UNLOCK(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket structure
 * will not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	struct accept_queue lqueue;
	int error = 0;
	bool listening, last __diagused;

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}

		if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_close != NULL)
		so->so_proto->pr_close(so);

	SOCK_LOCK(so);
	if ((listening = SOLISTENING(so))) {
		struct socket *sp;

		TAILQ_INIT(&lqueue);
		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);

		so->sol_qlen = so->sol_incqlen = 0;

		TAILQ_FOREACH(sp, &lqueue, so_list) {
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			last = refcount_release(&so->so_count);
			KASSERT(!last, ("%s: released last reference for %p",
			    __func__, so));
		}
	}
	sorele_locked(so);
	if (listening) {
		struct socket *sp, *tsp;

		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp)
			soabort(sp);
	}
	CURVNET_RESTORE();
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on.  Likely this thread holds the last
 * reference on the socket and soabort() will proceed with sofree().  But
 * it might not be the last, as the sockets on the listen queues are seen
 * from the protocol side.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 *
 * Usually socket should have a single reference left, but this is not a
 * requirement.  In the past, when we have had named references for file
 * descriptor and protocol, we asserted that none of them are being held.
 */
void
soabort(struct socket *so)
{

	VNET_SO_ASSERT(so);

	if (so->so_proto->pr_abort != NULL)
		so->so_proto->pr_abort(so);
	SOCK_LOCK(so);
	sorele_locked(so);
}

int
soaccept(struct socket *so, struct sockaddr *sa)
{
#ifdef INVARIANTS
	u_char len = sa->sa_len;
#endif
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_accept(so, sa);
	KASSERT(sa->sa_len <= len,
	    ("%s: protocol %p sockaddr overflow", __func__, so->so_proto));
	CURVNET_RESTORE();
	return (error);
}

int
sopeeraddr(struct socket *so, struct sockaddr *sa)
{
#ifdef INVARIANTS
	u_char len = sa->sa_len;
#endif
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_peeraddr(so, sa);
	KASSERT(sa->sa_len <= len,
	    ("%s: protocol %p sockaddr overflow", __func__, so->so_proto));
	CURVNET_RESTORE();

	return (error);
}

int
sosockaddr(struct socket *so, struct sockaddr *sa)
{
#ifdef INVARIANTS
	u_char len = sa->sa_len;
#endif
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_sockaddr(so, sa);
	KASSERT(sa->sa_len <= len,
	    ("%s: protocol %p sockaddr overflow", __func__, so->so_proto));
	CURVNET_RESTORE();

	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (soconnectat(AT_FDCWD, so, nam, td));
}

int
soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 *
	 * Note, this check is racy and may need to be re-evaluated at the
	 * protocol layer.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		if (fd == AT_FDCWD) {
			error = so->so_proto->pr_connect(so, nam, td);
		} else {
			error = so->so_proto->pr_connectat(fd, so, nam, td);
		}
	}
	CURVNET_RESTORE();

	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	CURVNET_SET(so1->so_vnet);
	error = so1->so_proto->pr_connect2(so1, so2);
	CURVNET_RESTORE();
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	VNET_SO_ASSERT(so);
	error = so->so_proto->pr_disconnect(so);
	return (error);
}

int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if (!(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB :
	    /*
	     * If the user set MSG_EOF, the protocol understands this flag
	     * and there is nothing left to send, then use PRU_SEND_EOF
	     * instead of PRU_SEND.
	     */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
		/* If there is more to send set PRUS_MORETOCOME */
		(flags & MSG_MORETOCOME) ||
		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
	int pr_send_flag;
#ifdef KERN_TLS
	struct ktls_session *tls;
	int tls_enq_cnt, tls_send_flag;
	uint8_t tls_rtype;

	tls = NULL;
	tls_rtype = TLS_RLTYPE_APP;
#endif
	if (uio != NULL)
		resid = uio->uio_resid;
	else if ((top->m_flags & M_PKTHDR) != 0)
		resid = top->m_pkthdr.len;
	else
		resid = m_length(top, NULL);
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out;

#ifdef KERN_TLS
	tls_send_flag = 0;
	tls = ktls_hold(so->so_snd.sb_tls_info);
	if (tls != NULL) {
		if (tls->mode == TCP_TLS_MODE_SW)
			tls_send_flag = PRUS_NOTREADY;

		if (control != NULL) {
			struct cmsghdr *cm = mtod(control, struct cmsghdr *);

			if (clen >= sizeof(*cm) &&
			    cm->cmsg_type == TLS_SET_RECORD_TYPE) {
				tls_rtype = *((uint8_t *)CMSG_DATA(cm));
				clen = 0;
				m_freem(control);
				control = NULL;
				atomic = 1;
			}
		}

		if (resid == 0 && !ktls_permit_empty_frames(tls)) {
			error = EINVAL;
			goto release;
		}
	}
#endif

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if (!(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(so, SO_SND);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
#ifdef KERN_TLS
				if (tls != NULL) {
					ktls_frame(top, tls, &tls_enq_cnt,
					    tls_rtype);
					tls_rtype = TLS_RLTYPE_APP;
				}
#endif
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If resid is 0, which can happen
				 * only if we have control to send, then
				 * a single empty mbuf is returned.  This
				 * is a workaround to prevent protocol send
				 * methods from panicking.
				 */
#ifdef KERN_TLS
				if (tls != NULL) {
					top = m_uiotombuf(uio, M_WAITOK, space,
					    tls->params.max_frame_len,
					    M_EXTPG |
					    ((flags & MSG_EOR) ? M_EOR : 0));
					if (top != NULL) {
						ktls_frame(top, tls,
						    &tls_enq_cnt, tls_rtype);
					}
					tls_rtype = TLS_RLTYPE_APP;
				} else
#endif
					top = m_uiotombuf(uio, M_WAITOK, space,
					    (atomic ? max_hdr : 0),
					    (atomic ? M_PKTHDR : 0) |
					    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT;	/* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);

			pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag and there is nothing
			     * left to send, then use PRU_SEND_EOF instead
			     * of PRU_SEND.
			     */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
				/* If there is more to send set PRUS_MORETOCOME. */
				(flags & MSG_MORETOCOME) ||
				(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

#ifdef KERN_TLS
			pr_send_flag |= tls_send_flag;
#endif

			error = so->so_proto->pr_send(so, pr_send_flag, top,
			    addr, control, td);

			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}

#ifdef KERN_TLS
			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
				if (error != 0) {
					m_freem(top);
					top = NULL;
				} else {
					soref(so);
					ktls_enqueue(top, so, tls_enq_cnt);
				}
			}
#endif
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	SOCK_IO_SEND_UNLOCK(so);
out:
#ifdef KERN_TLS
	if (tls != NULL)
		ktls_free(tls);
#endif
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send to a socket from a kernel thread.
 *
 * XXXGL: in almost all cases uio is NULL and the mbuf is supplied.
 * Exception is nfs/bootp_subr.c.  It is arguable that the VNET context needs
 * to be set at all.  This function should just boil down to a static inline
 * calling the protocol method.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_sosend(so, addr, uio,
	    top, control, flags, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * send(2), write(2) or aio_write(2) on a socket.
1911 */ 1912 int 1913 sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1914 struct mbuf *control, int flags, struct proc *userproc) 1915 { 1916 struct thread *td; 1917 ssize_t len; 1918 int error; 1919 1920 td = uio->uio_td; 1921 len = uio->uio_resid; 1922 CURVNET_SET(so->so_vnet); 1923 error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, 1924 td); 1925 CURVNET_RESTORE(); 1926 if (error != 0) { 1927 /* 1928 * Clear transient errors for stream protocols if they made 1929 * some progress. Make exclusion for aio(4) that would 1930 * schedule a new write in case of EWOULDBLOCK and clear 1931 * error itself. See soaio_process_job(). 1932 */ 1933 if (uio->uio_resid != len && 1934 (so->so_proto->pr_flags & PR_ATOMIC) == 0 && 1935 userproc == NULL && 1936 (error == ERESTART || error == EINTR || 1937 error == EWOULDBLOCK)) 1938 error = 0; 1939 /* Generation of SIGPIPE can be controlled per socket. */ 1940 if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && 1941 (flags & MSG_NOSIGNAL) == 0) { 1942 if (userproc != NULL) { 1943 /* aio(4) job */ 1944 PROC_LOCK(userproc); 1945 kern_psignal(userproc, SIGPIPE); 1946 PROC_UNLOCK(userproc); 1947 } else { 1948 PROC_LOCK(td->td_proc); 1949 tdsignal(td, SIGPIPE); 1950 PROC_UNLOCK(td->td_proc); 1951 } 1952 } 1953 } 1954 return (error); 1955 } 1956 1957 /* 1958 * The part of soreceive() that implements reading non-inline out-of-band 1959 * data from a socket. For more complete comments, see soreceive(), from 1960 * which this code originated. 1961 * 1962 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1963 * unable to return an mbuf chain to the caller. 1964 */ 1965 static int 1966 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1967 { 1968 struct protosw *pr = so->so_proto; 1969 struct mbuf *m; 1970 int error; 1971 1972 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1973 VNET_SO_ASSERT(so); 1974 1975 m = m_get(M_WAITOK, MT_DATA); 1976 error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); 1977 if (error) 1978 goto bad; 1979 do { 1980 error = uiomove(mtod(m, void *), 1981 (int) min(uio->uio_resid, m->m_len), uio); 1982 m = m_free(m); 1983 } while (uio->uio_resid && error == 0 && m); 1984 bad: 1985 if (m != NULL) 1986 m_freem(m); 1987 return (error); 1988 } 1989 1990 /* 1991 * Following replacement or removal of the first mbuf on the first mbuf chain 1992 * of a socket buffer, push necessary state changes back into the socket 1993 * buffer so that other consumers see the values consistently. 'nextrecord' 1994 * is the callers locally stored value of the original value of 1995 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1996 * NOTE: 'nextrecord' may be NULL. 1997 */ 1998 static __inline void 1999 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 2000 { 2001 2002 SOCKBUF_LOCK_ASSERT(sb); 2003 /* 2004 * First, update for the new value of nextrecord. If necessary, make 2005 * it the first record. 2006 */ 2007 if (sb->sb_mb != NULL) 2008 sb->sb_mb->m_nextpkt = nextrecord; 2009 else 2010 sb->sb_mb = nextrecord; 2011 2012 /* 2013 * Now update any dependent socket buffer fields to reflect the new 2014 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 2015 * addition of a second clause that takes care of the case where 2016 * sb_mb has been updated, but remains the last record. 
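 * That is: if sb_mb is now NULL, clear sb_mbtail and sb_lastrecord; if the
 * new first record has no successor, it also becomes sb_lastrecord.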
2017 */ 2018 if (sb->sb_mb == NULL) { 2019 sb->sb_mbtail = NULL; 2020 sb->sb_lastrecord = NULL; 2021 } else if (sb->sb_mb->m_nextpkt == NULL) 2022 sb->sb_lastrecord = sb->sb_mb; 2023 } 2024 2025 /* 2026 * Implement receive operations on a socket. We depend on the way that 2027 * records are added to the sockbuf by sbappend. In particular, each record 2028 * (mbufs linked through m_next) must begin with an address if the protocol 2029 * so specifies, followed by an optional mbuf or mbufs containing ancillary 2030 * data, and then zero or more mbufs of data. In order to allow parallelism 2031 * between network receive and copying to user space, as well as avoid 2032 * sleeping with a mutex held, we release the socket buffer mutex during the 2033 * user space copy. Although the sockbuf is locked, new data may still be 2034 * appended, and thus we must maintain consistency of the sockbuf during that 2035 * time. 2036 * 2037 * The caller may receive the data as a single mbuf chain by supplying an 2038 * mbuf **mp0 for use in returning the chain. The uio is then used only for 2039 * the count in uio_resid. 2040 */ 2041 int 2042 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 2043 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2044 { 2045 struct mbuf *m, **mp; 2046 int flags, error, offset; 2047 ssize_t len; 2048 struct protosw *pr = so->so_proto; 2049 struct mbuf *nextrecord; 2050 int moff, type = 0; 2051 ssize_t orig_resid = uio->uio_resid; 2052 bool report_real_len = false; 2053 2054 mp = mp0; 2055 if (psa != NULL) 2056 *psa = NULL; 2057 if (controlp != NULL) 2058 *controlp = NULL; 2059 if (flagsp != NULL) { 2060 report_real_len = *flagsp & MSG_TRUNC; 2061 *flagsp &= ~MSG_TRUNC; 2062 flags = *flagsp &~ MSG_EOR; 2063 } else 2064 flags = 0; 2065 if (flags & MSG_OOB) 2066 return (soreceive_rcvoob(so, uio, flags)); 2067 if (mp != NULL) 2068 *mp = NULL; 2069 2070 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 2071 if (error) 2072 return (error); 2073 2074 restart: 2075 SOCKBUF_LOCK(&so->so_rcv); 2076 m = so->so_rcv.sb_mb; 2077 /* 2078 * If we have less data than requested, block awaiting more (subject 2079 * to any timeout) if: 2080 * 1. the current count is less than the low water mark, or 2081 * 2. 
MSG_DONTWAIT is not set 2082 */ 2083 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 2084 sbavail(&so->so_rcv) < uio->uio_resid) && 2085 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 2086 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 2087 KASSERT(m != NULL || !sbavail(&so->so_rcv), 2088 ("receive: m == %p sbavail == %u", 2089 m, sbavail(&so->so_rcv))); 2090 if (so->so_error || so->so_rerror) { 2091 if (m != NULL) 2092 goto dontblock; 2093 if (so->so_error) 2094 error = so->so_error; 2095 else 2096 error = so->so_rerror; 2097 if ((flags & MSG_PEEK) == 0) { 2098 if (so->so_error) 2099 so->so_error = 0; 2100 else 2101 so->so_rerror = 0; 2102 } 2103 SOCKBUF_UNLOCK(&so->so_rcv); 2104 goto release; 2105 } 2106 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2107 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2108 if (m != NULL) 2109 goto dontblock; 2110 #ifdef KERN_TLS 2111 else if (so->so_rcv.sb_tlsdcc == 0 && 2112 so->so_rcv.sb_tlscc == 0) { 2113 #else 2114 else { 2115 #endif 2116 SOCKBUF_UNLOCK(&so->so_rcv); 2117 goto release; 2118 } 2119 } 2120 for (; m != NULL; m = m->m_next) 2121 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2122 m = so->so_rcv.sb_mb; 2123 goto dontblock; 2124 } 2125 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2126 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2127 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2128 SOCKBUF_UNLOCK(&so->so_rcv); 2129 error = ENOTCONN; 2130 goto release; 2131 } 2132 if (uio->uio_resid == 0 && !report_real_len) { 2133 SOCKBUF_UNLOCK(&so->so_rcv); 2134 goto release; 2135 } 2136 if ((so->so_state & SS_NBIO) || 2137 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2138 SOCKBUF_UNLOCK(&so->so_rcv); 2139 error = EWOULDBLOCK; 2140 goto release; 2141 } 2142 SBLASTRECORDCHK(&so->so_rcv); 2143 SBLASTMBUFCHK(&so->so_rcv); 2144 error = sbwait(so, SO_RCV); 2145 SOCKBUF_UNLOCK(&so->so_rcv); 2146 if (error) 2147 goto release; 2148 goto restart; 2149 } 2150 dontblock: 2151 /* 2152 * From this point onward, we maintain 'nextrecord' as a cache of the 2153 * pointer to the next record in the socket buffer. We must keep the 2154 * various socket buffer pointers and local stack versions of the 2155 * pointers in sync, pushing out modifications before dropping the 2156 * socket buffer mutex, and re-reading them when picking it up. 2157 * 2158 * Otherwise, we will race with the network stack appending new data 2159 * or records onto the socket buffer by using inconsistent/stale 2160 * versions of the field, possibly resulting in socket buffer 2161 * corruption. 2162 * 2163 * By holding the high-level sblock(), we prevent simultaneous 2164 * readers from pulling off the front of the socket buffer. 2165 */ 2166 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2167 if (uio->uio_td) 2168 uio->uio_td->td_ru.ru_msgrcv++; 2169 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2170 SBLASTRECORDCHK(&so->so_rcv); 2171 SBLASTMBUFCHK(&so->so_rcv); 2172 nextrecord = m->m_nextpkt; 2173 if (pr->pr_flags & PR_ADDR) { 2174 KASSERT(m->m_type == MT_SONAME, 2175 ("m->m_type == %d", m->m_type)); 2176 orig_resid = 0; 2177 if (psa != NULL) 2178 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2179 M_NOWAIT); 2180 if (flags & MSG_PEEK) { 2181 m = m->m_next; 2182 } else { 2183 sbfree(&so->so_rcv, m); 2184 so->so_rcv.sb_mb = m_free(m); 2185 m = so->so_rcv.sb_mb; 2186 sockbuf_pushsync(&so->so_rcv, nextrecord); 2187 } 2188 } 2189 2190 /* 2191 * Process one or more MT_CONTROL mbufs present before any data mbufs 2192 * in the first mbuf chain on the socket buffer. 
If MSG_PEEK, we 2193 * just copy the data; if !MSG_PEEK, we call into the protocol to 2194 * perform externalization (or freeing if controlp == NULL). 2195 */ 2196 if (m != NULL && m->m_type == MT_CONTROL) { 2197 struct mbuf *cm = NULL, *cmn; 2198 struct mbuf **cme = &cm; 2199 #ifdef KERN_TLS 2200 struct cmsghdr *cmsg; 2201 struct tls_get_record tgr; 2202 2203 /* 2204 * For MSG_TLSAPPDATA, check for an alert record. 2205 * If found, return ENXIO without removing 2206 * it from the receive queue. This allows a subsequent 2207 * call without MSG_TLSAPPDATA to receive it. 2208 * Note that, for TLS, there should only be a single 2209 * control mbuf with the TLS_GET_RECORD message in it. 2210 */ 2211 if (flags & MSG_TLSAPPDATA) { 2212 cmsg = mtod(m, struct cmsghdr *); 2213 if (cmsg->cmsg_type == TLS_GET_RECORD && 2214 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2215 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2216 if (__predict_false(tgr.tls_type == 2217 TLS_RLTYPE_ALERT)) { 2218 SOCKBUF_UNLOCK(&so->so_rcv); 2219 error = ENXIO; 2220 goto release; 2221 } 2222 } 2223 } 2224 #endif 2225 2226 do { 2227 if (flags & MSG_PEEK) { 2228 if (controlp != NULL) { 2229 *controlp = m_copym(m, 0, m->m_len, 2230 M_NOWAIT); 2231 controlp = &(*controlp)->m_next; 2232 } 2233 m = m->m_next; 2234 } else { 2235 sbfree(&so->so_rcv, m); 2236 so->so_rcv.sb_mb = m->m_next; 2237 m->m_next = NULL; 2238 *cme = m; 2239 cme = &(*cme)->m_next; 2240 m = so->so_rcv.sb_mb; 2241 } 2242 } while (m != NULL && m->m_type == MT_CONTROL); 2243 if ((flags & MSG_PEEK) == 0) 2244 sockbuf_pushsync(&so->so_rcv, nextrecord); 2245 while (cm != NULL) { 2246 cmn = cm->m_next; 2247 cm->m_next = NULL; 2248 if (pr->pr_domain->dom_externalize != NULL) { 2249 SOCKBUF_UNLOCK(&so->so_rcv); 2250 VNET_SO_ASSERT(so); 2251 error = (*pr->pr_domain->dom_externalize) 2252 (cm, controlp, flags); 2253 SOCKBUF_LOCK(&so->so_rcv); 2254 } else if (controlp != NULL) 2255 *controlp = cm; 2256 else 2257 m_freem(cm); 2258 if (controlp != NULL) { 2259 while (*controlp != NULL) 2260 controlp = &(*controlp)->m_next; 2261 } 2262 cm = cmn; 2263 } 2264 if (m != NULL) 2265 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 2266 else 2267 nextrecord = so->so_rcv.sb_mb; 2268 orig_resid = 0; 2269 } 2270 if (m != NULL) { 2271 if ((flags & MSG_PEEK) == 0) { 2272 KASSERT(m->m_nextpkt == nextrecord, 2273 ("soreceive: post-control, nextrecord !sync")); 2274 if (nextrecord == NULL) { 2275 KASSERT(so->so_rcv.sb_mb == m, 2276 ("soreceive: post-control, sb_mb!=m")); 2277 KASSERT(so->so_rcv.sb_lastrecord == m, 2278 ("soreceive: post-control, lastrecord!=m")); 2279 } 2280 } 2281 type = m->m_type; 2282 if (type == MT_OOBDATA) 2283 flags |= MSG_OOB; 2284 } else { 2285 if ((flags & MSG_PEEK) == 0) { 2286 KASSERT(so->so_rcv.sb_mb == nextrecord, 2287 ("soreceive: sb_mb != nextrecord")); 2288 if (so->so_rcv.sb_mb == NULL) { 2289 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2290 ("soreceive: sb_lastercord != NULL")); 2291 } 2292 } 2293 } 2294 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2295 SBLASTRECORDCHK(&so->so_rcv); 2296 SBLASTMBUFCHK(&so->so_rcv); 2297 2298 /* 2299 * Now continue to read any data mbufs off of the head of the socket 2300 * buffer until the read request is satisfied. Note that 'type' is 2301 * used to store the type of any mbuf reads that have happened so far 2302 * such that soreceive() can stop reading if the type changes, which 2303 * causes soreceive() to return only one of regular data and inline 2304 * out-of-band data in a single socket receive operation. 
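 * The loop also stops at mbufs that are not yet available (M_NOTAVAIL), when
 * uio_resid is exhausted, or when an error occurs.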
2305 */ 2306 moff = 0; 2307 offset = 0; 2308 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 2309 && error == 0) { 2310 /* 2311 * If the type of mbuf has changed since the last mbuf 2312 * examined ('type'), end the receive operation. 2313 */ 2314 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2315 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 2316 if (type != m->m_type) 2317 break; 2318 } else if (type == MT_OOBDATA) 2319 break; 2320 else 2321 KASSERT(m->m_type == MT_DATA, 2322 ("m->m_type == %d", m->m_type)); 2323 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 2324 len = uio->uio_resid; 2325 if (so->so_oobmark && len > so->so_oobmark - offset) 2326 len = so->so_oobmark - offset; 2327 if (len > m->m_len - moff) 2328 len = m->m_len - moff; 2329 /* 2330 * If mp is set, just pass back the mbufs. Otherwise copy 2331 * them out via the uio, then free. Sockbuf must be 2332 * consistent here (points to current mbuf, it points to next 2333 * record) when we drop priority; we must note any additions 2334 * to the sockbuf when we block interrupts again. 2335 */ 2336 if (mp == NULL) { 2337 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2338 SBLASTRECORDCHK(&so->so_rcv); 2339 SBLASTMBUFCHK(&so->so_rcv); 2340 SOCKBUF_UNLOCK(&so->so_rcv); 2341 if ((m->m_flags & M_EXTPG) != 0) 2342 error = m_unmapped_uiomove(m, moff, uio, 2343 (int)len); 2344 else 2345 error = uiomove(mtod(m, char *) + moff, 2346 (int)len, uio); 2347 SOCKBUF_LOCK(&so->so_rcv); 2348 if (error) { 2349 /* 2350 * The MT_SONAME mbuf has already been removed 2351 * from the record, so it is necessary to 2352 * remove the data mbufs, if any, to preserve 2353 * the invariant in the case of PR_ADDR that 2354 * requires MT_SONAME mbufs at the head of 2355 * each record. 2356 */ 2357 if (pr->pr_flags & PR_ATOMIC && 2358 ((flags & MSG_PEEK) == 0)) 2359 (void)sbdroprecord_locked(&so->so_rcv); 2360 SOCKBUF_UNLOCK(&so->so_rcv); 2361 goto release; 2362 } 2363 } else 2364 uio->uio_resid -= len; 2365 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2366 if (len == m->m_len - moff) { 2367 if (m->m_flags & M_EOR) 2368 flags |= MSG_EOR; 2369 if (flags & MSG_PEEK) { 2370 m = m->m_next; 2371 moff = 0; 2372 } else { 2373 nextrecord = m->m_nextpkt; 2374 sbfree(&so->so_rcv, m); 2375 if (mp != NULL) { 2376 m->m_nextpkt = NULL; 2377 *mp = m; 2378 mp = &m->m_next; 2379 so->so_rcv.sb_mb = m = m->m_next; 2380 *mp = NULL; 2381 } else { 2382 so->so_rcv.sb_mb = m_free(m); 2383 m = so->so_rcv.sb_mb; 2384 } 2385 sockbuf_pushsync(&so->so_rcv, nextrecord); 2386 SBLASTRECORDCHK(&so->so_rcv); 2387 SBLASTMBUFCHK(&so->so_rcv); 2388 } 2389 } else { 2390 if (flags & MSG_PEEK) 2391 moff += len; 2392 else { 2393 if (mp != NULL) { 2394 if (flags & MSG_DONTWAIT) { 2395 *mp = m_copym(m, 0, len, 2396 M_NOWAIT); 2397 if (*mp == NULL) { 2398 /* 2399 * m_copym() couldn't 2400 * allocate an mbuf. 2401 * Adjust uio_resid back 2402 * (it was adjusted 2403 * down by len bytes, 2404 * which we didn't end 2405 * up "copying" over). 
2406 */ 2407 uio->uio_resid += len; 2408 break; 2409 } 2410 } else { 2411 SOCKBUF_UNLOCK(&so->so_rcv); 2412 *mp = m_copym(m, 0, len, 2413 M_WAITOK); 2414 SOCKBUF_LOCK(&so->so_rcv); 2415 } 2416 } 2417 sbcut_locked(&so->so_rcv, len); 2418 } 2419 } 2420 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2421 if (so->so_oobmark) { 2422 if ((flags & MSG_PEEK) == 0) { 2423 so->so_oobmark -= len; 2424 if (so->so_oobmark == 0) { 2425 so->so_rcv.sb_state |= SBS_RCVATMARK; 2426 break; 2427 } 2428 } else { 2429 offset += len; 2430 if (offset == so->so_oobmark) 2431 break; 2432 } 2433 } 2434 if (flags & MSG_EOR) 2435 break; 2436 /* 2437 * If the MSG_WAITALL flag is set (for non-atomic socket), we 2438 * must not quit until "uio->uio_resid == 0" or an error 2439 * termination. If a signal/timeout occurs, return with a 2440 * short count but without error. Keep sockbuf locked 2441 * against other readers. 2442 */ 2443 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 2444 !sosendallatonce(so) && nextrecord == NULL) { 2445 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2446 if (so->so_error || so->so_rerror || 2447 so->so_rcv.sb_state & SBS_CANTRCVMORE) 2448 break; 2449 /* 2450 * Notify the protocol that some data has been 2451 * drained before blocking. 2452 */ 2453 if (pr->pr_flags & PR_WANTRCVD) { 2454 SOCKBUF_UNLOCK(&so->so_rcv); 2455 VNET_SO_ASSERT(so); 2456 pr->pr_rcvd(so, flags); 2457 SOCKBUF_LOCK(&so->so_rcv); 2458 if (__predict_false(so->so_rcv.sb_mb == NULL && 2459 (so->so_error || so->so_rerror || 2460 so->so_rcv.sb_state & SBS_CANTRCVMORE))) 2461 break; 2462 } 2463 SBLASTRECORDCHK(&so->so_rcv); 2464 SBLASTMBUFCHK(&so->so_rcv); 2465 /* 2466 * We could receive some data while was notifying 2467 * the protocol. Skip blocking in this case. 2468 */ 2469 if (so->so_rcv.sb_mb == NULL) { 2470 error = sbwait(so, SO_RCV); 2471 if (error) { 2472 SOCKBUF_UNLOCK(&so->so_rcv); 2473 goto release; 2474 } 2475 } 2476 m = so->so_rcv.sb_mb; 2477 if (m != NULL) 2478 nextrecord = m->m_nextpkt; 2479 } 2480 } 2481 2482 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2483 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 2484 if (report_real_len) 2485 uio->uio_resid -= m_length(m, NULL) - moff; 2486 flags |= MSG_TRUNC; 2487 if ((flags & MSG_PEEK) == 0) 2488 (void) sbdroprecord_locked(&so->so_rcv); 2489 } 2490 if ((flags & MSG_PEEK) == 0) { 2491 if (m == NULL) { 2492 /* 2493 * First part is an inline SB_EMPTY_FIXUP(). Second 2494 * part makes sure sb_lastrecord is up-to-date if 2495 * there is still data in the socket buffer. 2496 */ 2497 so->so_rcv.sb_mb = nextrecord; 2498 if (so->so_rcv.sb_mb == NULL) { 2499 so->so_rcv.sb_mbtail = NULL; 2500 so->so_rcv.sb_lastrecord = NULL; 2501 } else if (nextrecord->m_nextpkt == NULL) 2502 so->so_rcv.sb_lastrecord = nextrecord; 2503 } 2504 SBLASTRECORDCHK(&so->so_rcv); 2505 SBLASTMBUFCHK(&so->so_rcv); 2506 /* 2507 * If soreceive() is being done from the socket callback, 2508 * then don't need to generate ACK to peer to update window, 2509 * since ACK will be generated on return to TCP. 
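 * Otherwise, if the protocol set PR_WANTRCVD, call pr_rcvd() so that it can
 * observe the drained data and update the advertised receive window.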
2510 */ 2511 if (!(flags & MSG_SOCALLBCK) && 2512 (pr->pr_flags & PR_WANTRCVD)) { 2513 SOCKBUF_UNLOCK(&so->so_rcv); 2514 VNET_SO_ASSERT(so); 2515 pr->pr_rcvd(so, flags); 2516 SOCKBUF_LOCK(&so->so_rcv); 2517 } 2518 } 2519 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2520 if (orig_resid == uio->uio_resid && orig_resid && 2521 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 2522 SOCKBUF_UNLOCK(&so->so_rcv); 2523 goto restart; 2524 } 2525 SOCKBUF_UNLOCK(&so->so_rcv); 2526 2527 if (flagsp != NULL) 2528 *flagsp |= flags; 2529 release: 2530 SOCK_IO_RECV_UNLOCK(so); 2531 return (error); 2532 } 2533 2534 /* 2535 * Optimized version of soreceive() for stream (TCP) sockets. 2536 */ 2537 int 2538 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 2539 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2540 { 2541 int len = 0, error = 0, flags, oresid; 2542 struct sockbuf *sb; 2543 struct mbuf *m, *n = NULL; 2544 2545 /* We only do stream sockets. */ 2546 if (so->so_type != SOCK_STREAM) 2547 return (EINVAL); 2548 if (psa != NULL) 2549 *psa = NULL; 2550 if (flagsp != NULL) 2551 flags = *flagsp &~ MSG_EOR; 2552 else 2553 flags = 0; 2554 if (controlp != NULL) 2555 *controlp = NULL; 2556 if (flags & MSG_OOB) 2557 return (soreceive_rcvoob(so, uio, flags)); 2558 if (mp0 != NULL) 2559 *mp0 = NULL; 2560 2561 sb = &so->so_rcv; 2562 2563 #ifdef KERN_TLS 2564 /* 2565 * KTLS store TLS records as records with a control message to 2566 * describe the framing. 2567 * 2568 * We check once here before acquiring locks to optimize the 2569 * common case. 2570 */ 2571 if (sb->sb_tls_info != NULL) 2572 return (soreceive_generic(so, psa, uio, mp0, controlp, 2573 flagsp)); 2574 #endif 2575 2576 /* Prevent other readers from entering the socket. */ 2577 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 2578 if (error) 2579 return (error); 2580 SOCKBUF_LOCK(sb); 2581 2582 #ifdef KERN_TLS 2583 if (sb->sb_tls_info != NULL) { 2584 SOCKBUF_UNLOCK(sb); 2585 SOCK_IO_RECV_UNLOCK(so); 2586 return (soreceive_generic(so, psa, uio, mp0, controlp, 2587 flagsp)); 2588 } 2589 #endif 2590 2591 /* Easy one, no space to copyout anything. */ 2592 if (uio->uio_resid == 0) { 2593 error = EINVAL; 2594 goto out; 2595 } 2596 oresid = uio->uio_resid; 2597 2598 /* We will never ever get anything unless we are or were connected. */ 2599 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 2600 error = ENOTCONN; 2601 goto out; 2602 } 2603 2604 restart: 2605 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2606 2607 /* Abort if socket has reported problems. */ 2608 if (so->so_error) { 2609 if (sbavail(sb) > 0) 2610 goto deliver; 2611 if (oresid > uio->uio_resid) 2612 goto out; 2613 error = so->so_error; 2614 if (!(flags & MSG_PEEK)) 2615 so->so_error = 0; 2616 goto out; 2617 } 2618 2619 /* Door is closed. Deliver what is left, if any. */ 2620 if (sb->sb_state & SBS_CANTRCVMORE) { 2621 if (sbavail(sb) > 0) 2622 goto deliver; 2623 else 2624 goto out; 2625 } 2626 2627 /* Socket buffer is empty and we shall not block. */ 2628 if (sbavail(sb) == 0 && 2629 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 2630 error = EAGAIN; 2631 goto out; 2632 } 2633 2634 /* Socket buffer got some data that we shall deliver now. 
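 * We deliver if the caller is non-blocking, or if enough data is buffered to
 * satisfy the low watermark, the request, or the high watermark, unless
 * MSG_WAITALL requires waiting for the full request.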
*/ 2635 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 2636 ((so->so_state & SS_NBIO) || 2637 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 2638 sbavail(sb) >= sb->sb_lowat || 2639 sbavail(sb) >= uio->uio_resid || 2640 sbavail(sb) >= sb->sb_hiwat) ) { 2641 goto deliver; 2642 } 2643 2644 /* On MSG_WAITALL we must wait until all data or error arrives. */ 2645 if ((flags & MSG_WAITALL) && 2646 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 2647 goto deliver; 2648 2649 /* 2650 * Wait and block until (more) data comes in. 2651 * NB: Drops the sockbuf lock during wait. 2652 */ 2653 error = sbwait(so, SO_RCV); 2654 if (error) 2655 goto out; 2656 goto restart; 2657 2658 deliver: 2659 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2660 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 2661 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 2662 2663 /* Statistics. */ 2664 if (uio->uio_td) 2665 uio->uio_td->td_ru.ru_msgrcv++; 2666 2667 /* Fill uio until full or current end of socket buffer is reached. */ 2668 len = min(uio->uio_resid, sbavail(sb)); 2669 if (mp0 != NULL) { 2670 /* Dequeue as many mbufs as possible. */ 2671 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 2672 if (*mp0 == NULL) 2673 *mp0 = sb->sb_mb; 2674 else 2675 m_cat(*mp0, sb->sb_mb); 2676 for (m = sb->sb_mb; 2677 m != NULL && m->m_len <= len; 2678 m = m->m_next) { 2679 KASSERT(!(m->m_flags & M_NOTAVAIL), 2680 ("%s: m %p not available", __func__, m)); 2681 len -= m->m_len; 2682 uio->uio_resid -= m->m_len; 2683 sbfree(sb, m); 2684 n = m; 2685 } 2686 n->m_next = NULL; 2687 sb->sb_mb = m; 2688 sb->sb_lastrecord = sb->sb_mb; 2689 if (sb->sb_mb == NULL) 2690 SB_EMPTY_FIXUP(sb); 2691 } 2692 /* Copy the remainder. */ 2693 if (len > 0) { 2694 KASSERT(sb->sb_mb != NULL, 2695 ("%s: len > 0 && sb->sb_mb empty", __func__)); 2696 2697 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 2698 if (m == NULL) 2699 len = 0; /* Don't flush data from sockbuf. */ 2700 else 2701 uio->uio_resid -= len; 2702 if (*mp0 != NULL) 2703 m_cat(*mp0, m); 2704 else 2705 *mp0 = m; 2706 if (*mp0 == NULL) { 2707 error = ENOBUFS; 2708 goto out; 2709 } 2710 } 2711 } else { 2712 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2713 SOCKBUF_UNLOCK(sb); 2714 error = m_mbuftouio(uio, sb->sb_mb, len); 2715 SOCKBUF_LOCK(sb); 2716 if (error) 2717 goto out; 2718 } 2719 SBLASTRECORDCHK(sb); 2720 SBLASTMBUFCHK(sb); 2721 2722 /* 2723 * Remove the delivered data from the socket buffer unless we 2724 * were only peeking. 2725 */ 2726 if (!(flags & MSG_PEEK)) { 2727 if (len > 0) 2728 sbdrop_locked(sb, len); 2729 2730 /* Notify protocol that we drained some data. */ 2731 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2732 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2733 !(flags & MSG_SOCALLBCK))) { 2734 SOCKBUF_UNLOCK(sb); 2735 VNET_SO_ASSERT(so); 2736 so->so_proto->pr_rcvd(so, flags); 2737 SOCKBUF_LOCK(sb); 2738 } 2739 } 2740 2741 /* 2742 * For MSG_WAITALL we may have to loop again and wait for 2743 * more data to come in. 2744 */ 2745 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2746 goto restart; 2747 out: 2748 SBLASTRECORDCHK(sb); 2749 SBLASTMBUFCHK(sb); 2750 SOCKBUF_UNLOCK(sb); 2751 SOCK_IO_RECV_UNLOCK(so); 2752 return (error); 2753 } 2754 2755 /* 2756 * Optimized version of soreceive() for simple datagram cases from userspace. 2757 * Unlike in the stream case, we're able to drop a datagram if copyout() 2758 * fails, and because we handle datagrams atomically, we don't need to use a 2759 * sleep lock to prevent I/O interlacing. 
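 * More complicated cases (mp0 != NULL, MSG_PEEK, MSG_OOB or MSG_TRUNC) are
 * handed off to soreceive_generic().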
2760 */ 2761 int 2762 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 2763 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2764 { 2765 struct mbuf *m, *m2; 2766 int flags, error; 2767 ssize_t len; 2768 struct protosw *pr = so->so_proto; 2769 struct mbuf *nextrecord; 2770 2771 if (psa != NULL) 2772 *psa = NULL; 2773 if (controlp != NULL) 2774 *controlp = NULL; 2775 if (flagsp != NULL) 2776 flags = *flagsp &~ MSG_EOR; 2777 else 2778 flags = 0; 2779 2780 /* 2781 * For any complicated cases, fall back to the full 2782 * soreceive_generic(). 2783 */ 2784 if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) 2785 return (soreceive_generic(so, psa, uio, mp0, controlp, 2786 flagsp)); 2787 2788 /* 2789 * Enforce restrictions on use. 2790 */ 2791 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 2792 ("soreceive_dgram: wantrcvd")); 2793 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 2794 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 2795 ("soreceive_dgram: SBS_RCVATMARK")); 2796 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 2797 ("soreceive_dgram: P_CONNREQUIRED")); 2798 2799 /* 2800 * Loop blocking while waiting for a datagram. 2801 */ 2802 SOCKBUF_LOCK(&so->so_rcv); 2803 while ((m = so->so_rcv.sb_mb) == NULL) { 2804 KASSERT(sbavail(&so->so_rcv) == 0, 2805 ("soreceive_dgram: sb_mb NULL but sbavail %u", 2806 sbavail(&so->so_rcv))); 2807 if (so->so_error) { 2808 error = so->so_error; 2809 so->so_error = 0; 2810 SOCKBUF_UNLOCK(&so->so_rcv); 2811 return (error); 2812 } 2813 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 2814 uio->uio_resid == 0) { 2815 SOCKBUF_UNLOCK(&so->so_rcv); 2816 return (0); 2817 } 2818 if ((so->so_state & SS_NBIO) || 2819 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2820 SOCKBUF_UNLOCK(&so->so_rcv); 2821 return (EWOULDBLOCK); 2822 } 2823 SBLASTRECORDCHK(&so->so_rcv); 2824 SBLASTMBUFCHK(&so->so_rcv); 2825 error = sbwait(so, SO_RCV); 2826 if (error) { 2827 SOCKBUF_UNLOCK(&so->so_rcv); 2828 return (error); 2829 } 2830 } 2831 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2832 2833 if (uio->uio_td) 2834 uio->uio_td->td_ru.ru_msgrcv++; 2835 SBLASTRECORDCHK(&so->so_rcv); 2836 SBLASTMBUFCHK(&so->so_rcv); 2837 nextrecord = m->m_nextpkt; 2838 if (nextrecord == NULL) { 2839 KASSERT(so->so_rcv.sb_lastrecord == m, 2840 ("soreceive_dgram: lastrecord != m")); 2841 } 2842 2843 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 2844 ("soreceive_dgram: m_nextpkt != nextrecord")); 2845 2846 /* 2847 * Pull 'm' and its chain off the front of the packet queue. 2848 */ 2849 so->so_rcv.sb_mb = NULL; 2850 sockbuf_pushsync(&so->so_rcv, nextrecord); 2851 2852 /* 2853 * Walk 'm's chain and free that many bytes from the socket buffer. 2854 */ 2855 for (m2 = m; m2 != NULL; m2 = m2->m_next) 2856 sbfree(&so->so_rcv, m2); 2857 2858 /* 2859 * Do a few last checks before we let go of the lock. 2860 */ 2861 SBLASTRECORDCHK(&so->so_rcv); 2862 SBLASTMBUFCHK(&so->so_rcv); 2863 SOCKBUF_UNLOCK(&so->so_rcv); 2864 2865 if (pr->pr_flags & PR_ADDR) { 2866 KASSERT(m->m_type == MT_SONAME, 2867 ("m->m_type == %d", m->m_type)); 2868 if (psa != NULL) 2869 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2870 M_NOWAIT); 2871 m = m_free(m); 2872 } 2873 if (m == NULL) { 2874 /* XXXRW: Can this happen? */ 2875 return (0); 2876 } 2877 2878 /* 2879 * Packet to copyout() is now in 'm' and it is disconnected from the 2880 * queue. 2881 * 2882 * Process one or more MT_CONTROL mbufs present before any data mbufs 2883 * in the first mbuf chain on the socket buffer. 
We call into the 2884 * protocol to perform externalization (or freeing if controlp == 2885 * NULL). In some cases there can be only MT_CONTROL mbufs without 2886 * MT_DATA mbufs. 2887 */ 2888 if (m->m_type == MT_CONTROL) { 2889 struct mbuf *cm = NULL, *cmn; 2890 struct mbuf **cme = &cm; 2891 2892 do { 2893 m2 = m->m_next; 2894 m->m_next = NULL; 2895 *cme = m; 2896 cme = &(*cme)->m_next; 2897 m = m2; 2898 } while (m != NULL && m->m_type == MT_CONTROL); 2899 while (cm != NULL) { 2900 cmn = cm->m_next; 2901 cm->m_next = NULL; 2902 if (pr->pr_domain->dom_externalize != NULL) { 2903 error = (*pr->pr_domain->dom_externalize) 2904 (cm, controlp, flags); 2905 } else if (controlp != NULL) 2906 *controlp = cm; 2907 else 2908 m_freem(cm); 2909 if (controlp != NULL) { 2910 while (*controlp != NULL) 2911 controlp = &(*controlp)->m_next; 2912 } 2913 cm = cmn; 2914 } 2915 } 2916 KASSERT(m == NULL || m->m_type == MT_DATA, 2917 ("soreceive_dgram: !data")); 2918 while (m != NULL && uio->uio_resid > 0) { 2919 len = uio->uio_resid; 2920 if (len > m->m_len) 2921 len = m->m_len; 2922 error = uiomove(mtod(m, char *), (int)len, uio); 2923 if (error) { 2924 m_freem(m); 2925 return (error); 2926 } 2927 if (len == m->m_len) 2928 m = m_free(m); 2929 else { 2930 m->m_data += len; 2931 m->m_len -= len; 2932 } 2933 } 2934 if (m != NULL) { 2935 flags |= MSG_TRUNC; 2936 m_freem(m); 2937 } 2938 if (flagsp != NULL) 2939 *flagsp |= flags; 2940 return (0); 2941 } 2942 2943 int 2944 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2945 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2946 { 2947 int error; 2948 2949 CURVNET_SET(so->so_vnet); 2950 error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); 2951 CURVNET_RESTORE(); 2952 return (error); 2953 } 2954 2955 int 2956 soshutdown(struct socket *so, enum shutdown_how how) 2957 { 2958 int error; 2959 2960 CURVNET_SET(so->so_vnet); 2961 error = so->so_proto->pr_shutdown(so, how); 2962 CURVNET_RESTORE(); 2963 2964 return (error); 2965 } 2966 2967 /* 2968 * Used by several pr_shutdown implementations that use generic socket buffers. 2969 */ 2970 void 2971 sorflush(struct socket *so) 2972 { 2973 int error; 2974 2975 VNET_SO_ASSERT(so); 2976 2977 /* 2978 * Dislodge threads currently blocked in receive and wait to acquire 2979 * a lock against other simultaneous readers before clearing the 2980 * socket buffer. Don't let our acquire be interrupted by a signal 2981 * despite any existing socket disposition on interruptable waiting. 2982 * 2983 * The SOCK_IO_RECV_LOCK() is important here as there some pr_soreceive 2984 * methods that read the top of the socket buffer without acquisition 2985 * of the socket buffer mutex, assuming that top of the buffer 2986 * exclusively belongs to the read(2) syscall. This is handy when 2987 * performing MSG_PEEK. 2988 */ 2989 socantrcvmore(so); 2990 2991 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); 2992 if (error != 0) { 2993 KASSERT(SOLISTENING(so), 2994 ("%s: soiolock(%p) failed", __func__, so)); 2995 return; 2996 } 2997 2998 sbrelease(so, SO_RCV); 2999 SOCK_IO_RECV_UNLOCK(so); 3000 3001 } 3002 3003 /* 3004 * Wrapper for Socket established helper hook. 3005 * Parameters: socket, context of the hook point, hook id. 
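 * Returns the status stored in the hook data by the handlers, since hhook
 * functions themselves return void.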
3006 */ 3007 static int inline 3008 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 3009 { 3010 struct socket_hhook_data hhook_data = { 3011 .so = so, 3012 .hctx = hctx, 3013 .m = NULL, 3014 .status = 0 3015 }; 3016 3017 CURVNET_SET(so->so_vnet); 3018 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 3019 CURVNET_RESTORE(); 3020 3021 /* Ugly but needed, since hhooks return void for now */ 3022 return (hhook_data.status); 3023 } 3024 3025 /* 3026 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 3027 * additional variant to handle the case where the option value needs to be 3028 * some kind of integer, but not a specific size. In addition to their use 3029 * here, these functions are also called by the protocol-level pr_ctloutput() 3030 * routines. 3031 */ 3032 int 3033 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 3034 { 3035 size_t valsize; 3036 3037 /* 3038 * If the user gives us more than we wanted, we ignore it, but if we 3039 * don't get the minimum length the caller wants, we return EINVAL. 3040 * On success, sopt->sopt_valsize is set to however much we actually 3041 * retrieved. 3042 */ 3043 if ((valsize = sopt->sopt_valsize) < minlen) 3044 return EINVAL; 3045 if (valsize > len) 3046 sopt->sopt_valsize = valsize = len; 3047 3048 if (sopt->sopt_td != NULL) 3049 return (copyin(sopt->sopt_val, buf, valsize)); 3050 3051 bcopy(sopt->sopt_val, buf, valsize); 3052 return (0); 3053 } 3054 3055 /* 3056 * Kernel version of setsockopt(2). 3057 * 3058 * XXX: optlen is size_t, not socklen_t 3059 */ 3060 int 3061 so_setsockopt(struct socket *so, int level, int optname, void *optval, 3062 size_t optlen) 3063 { 3064 struct sockopt sopt; 3065 3066 sopt.sopt_level = level; 3067 sopt.sopt_name = optname; 3068 sopt.sopt_dir = SOPT_SET; 3069 sopt.sopt_val = optval; 3070 sopt.sopt_valsize = optlen; 3071 sopt.sopt_td = NULL; 3072 return (sosetopt(so, &sopt)); 3073 } 3074 3075 int 3076 sosetopt(struct socket *so, struct sockopt *sopt) 3077 { 3078 int error, optval; 3079 struct linger l; 3080 struct timeval tv; 3081 sbintime_t val, *valp; 3082 uint32_t val32; 3083 #ifdef MAC 3084 struct mac extmac; 3085 #endif 3086 3087 CURVNET_SET(so->so_vnet); 3088 error = 0; 3089 if (sopt->sopt_level != SOL_SOCKET) { 3090 if (so->so_proto->pr_ctloutput != NULL) 3091 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3092 else 3093 error = ENOPROTOOPT; 3094 } else { 3095 switch (sopt->sopt_name) { 3096 case SO_ACCEPTFILTER: 3097 error = accept_filt_setopt(so, sopt); 3098 if (error) 3099 goto bad; 3100 break; 3101 3102 case SO_LINGER: 3103 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 3104 if (error) 3105 goto bad; 3106 if (l.l_linger < 0 || 3107 l.l_linger > USHRT_MAX || 3108 l.l_linger > (INT_MAX / hz)) { 3109 error = EDOM; 3110 goto bad; 3111 } 3112 SOCK_LOCK(so); 3113 so->so_linger = l.l_linger; 3114 if (l.l_onoff) 3115 so->so_options |= SO_LINGER; 3116 else 3117 so->so_options &= ~SO_LINGER; 3118 SOCK_UNLOCK(so); 3119 break; 3120 3121 case SO_DEBUG: 3122 case SO_KEEPALIVE: 3123 case SO_DONTROUTE: 3124 case SO_USELOOPBACK: 3125 case SO_BROADCAST: 3126 case SO_REUSEADDR: 3127 case SO_REUSEPORT: 3128 case SO_REUSEPORT_LB: 3129 case SO_OOBINLINE: 3130 case SO_TIMESTAMP: 3131 case SO_BINTIME: 3132 case SO_NOSIGPIPE: 3133 case SO_NO_DDP: 3134 case SO_NO_OFFLOAD: 3135 case SO_RERROR: 3136 error = sooptcopyin(sopt, &optval, sizeof optval, 3137 sizeof optval); 3138 if (error) 3139 goto bad; 3140 SOCK_LOCK(so); 3141 if (optval) 3142 so->so_options |= 
sopt->sopt_name; 3143 else 3144 so->so_options &= ~sopt->sopt_name; 3145 SOCK_UNLOCK(so); 3146 break; 3147 3148 case SO_SETFIB: 3149 error = sooptcopyin(sopt, &optval, sizeof optval, 3150 sizeof optval); 3151 if (error) 3152 goto bad; 3153 3154 if (optval < 0 || optval >= rt_numfibs) { 3155 error = EINVAL; 3156 goto bad; 3157 } 3158 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 3159 (so->so_proto->pr_domain->dom_family == PF_INET6) || 3160 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 3161 so->so_fibnum = optval; 3162 else 3163 so->so_fibnum = 0; 3164 break; 3165 3166 case SO_USER_COOKIE: 3167 error = sooptcopyin(sopt, &val32, sizeof val32, 3168 sizeof val32); 3169 if (error) 3170 goto bad; 3171 so->so_user_cookie = val32; 3172 break; 3173 3174 case SO_SNDBUF: 3175 case SO_RCVBUF: 3176 case SO_SNDLOWAT: 3177 case SO_RCVLOWAT: 3178 error = so->so_proto->pr_setsbopt(so, sopt); 3179 if (error) 3180 goto bad; 3181 break; 3182 3183 case SO_SNDTIMEO: 3184 case SO_RCVTIMEO: 3185 #ifdef COMPAT_FREEBSD32 3186 if (SV_CURPROC_FLAG(SV_ILP32)) { 3187 struct timeval32 tv32; 3188 3189 error = sooptcopyin(sopt, &tv32, sizeof tv32, 3190 sizeof tv32); 3191 CP(tv32, tv, tv_sec); 3192 CP(tv32, tv, tv_usec); 3193 } else 3194 #endif 3195 error = sooptcopyin(sopt, &tv, sizeof tv, 3196 sizeof tv); 3197 if (error) 3198 goto bad; 3199 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 3200 tv.tv_usec >= 1000000) { 3201 error = EDOM; 3202 goto bad; 3203 } 3204 if (tv.tv_sec > INT32_MAX) 3205 val = SBT_MAX; 3206 else 3207 val = tvtosbt(tv); 3208 SOCK_LOCK(so); 3209 valp = sopt->sopt_name == SO_SNDTIMEO ? 3210 (SOLISTENING(so) ? &so->sol_sbsnd_timeo : 3211 &so->so_snd.sb_timeo) : 3212 (SOLISTENING(so) ? &so->sol_sbrcv_timeo : 3213 &so->so_rcv.sb_timeo); 3214 *valp = val; 3215 SOCK_UNLOCK(so); 3216 break; 3217 3218 case SO_LABEL: 3219 #ifdef MAC 3220 error = sooptcopyin(sopt, &extmac, sizeof extmac, 3221 sizeof extmac); 3222 if (error) 3223 goto bad; 3224 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 3225 so, &extmac); 3226 #else 3227 error = EOPNOTSUPP; 3228 #endif 3229 break; 3230 3231 case SO_TS_CLOCK: 3232 error = sooptcopyin(sopt, &optval, sizeof optval, 3233 sizeof optval); 3234 if (error) 3235 goto bad; 3236 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 3237 error = EINVAL; 3238 goto bad; 3239 } 3240 so->so_ts_clock = optval; 3241 break; 3242 3243 case SO_MAX_PACING_RATE: 3244 error = sooptcopyin(sopt, &val32, sizeof(val32), 3245 sizeof(val32)); 3246 if (error) 3247 goto bad; 3248 so->so_max_pacing_rate = val32; 3249 break; 3250 3251 default: 3252 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3253 error = hhook_run_socket(so, sopt, 3254 HHOOK_SOCKET_OPT); 3255 else 3256 error = ENOPROTOOPT; 3257 break; 3258 } 3259 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 3260 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 3261 } 3262 bad: 3263 CURVNET_RESTORE(); 3264 return (error); 3265 } 3266 3267 /* 3268 * Helper routine for getsockopt. 3269 */ 3270 int 3271 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 3272 { 3273 int error; 3274 size_t valsize; 3275 3276 error = 0; 3277 3278 /* 3279 * Documented get behavior is that we always return a value, possibly 3280 * truncated to fit in the user's buffer. Traditional behavior is 3281 * that we always tell the user precisely how much we copied, rather 3282 * than something useful like the total amount we had available for 3283 * her. Note that this interface is not idempotent; the entire 3284 * answer must be generated ahead of time. 
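 * On return, sopt->sopt_valsize is updated to the number of bytes actually
 * copied out.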
3285 */ 3286 valsize = min(len, sopt->sopt_valsize); 3287 sopt->sopt_valsize = valsize; 3288 if (sopt->sopt_val != NULL) { 3289 if (sopt->sopt_td != NULL) 3290 error = copyout(buf, sopt->sopt_val, valsize); 3291 else 3292 bcopy(buf, sopt->sopt_val, valsize); 3293 } 3294 return (error); 3295 } 3296 3297 int 3298 sogetopt(struct socket *so, struct sockopt *sopt) 3299 { 3300 int error, optval; 3301 struct linger l; 3302 struct timeval tv; 3303 #ifdef MAC 3304 struct mac extmac; 3305 #endif 3306 3307 CURVNET_SET(so->so_vnet); 3308 error = 0; 3309 if (sopt->sopt_level != SOL_SOCKET) { 3310 if (so->so_proto->pr_ctloutput != NULL) 3311 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3312 else 3313 error = ENOPROTOOPT; 3314 CURVNET_RESTORE(); 3315 return (error); 3316 } else { 3317 switch (sopt->sopt_name) { 3318 case SO_ACCEPTFILTER: 3319 error = accept_filt_getopt(so, sopt); 3320 break; 3321 3322 case SO_LINGER: 3323 SOCK_LOCK(so); 3324 l.l_onoff = so->so_options & SO_LINGER; 3325 l.l_linger = so->so_linger; 3326 SOCK_UNLOCK(so); 3327 error = sooptcopyout(sopt, &l, sizeof l); 3328 break; 3329 3330 case SO_USELOOPBACK: 3331 case SO_DONTROUTE: 3332 case SO_DEBUG: 3333 case SO_KEEPALIVE: 3334 case SO_REUSEADDR: 3335 case SO_REUSEPORT: 3336 case SO_REUSEPORT_LB: 3337 case SO_BROADCAST: 3338 case SO_OOBINLINE: 3339 case SO_ACCEPTCONN: 3340 case SO_TIMESTAMP: 3341 case SO_BINTIME: 3342 case SO_NOSIGPIPE: 3343 case SO_NO_DDP: 3344 case SO_NO_OFFLOAD: 3345 case SO_RERROR: 3346 optval = so->so_options & sopt->sopt_name; 3347 integer: 3348 error = sooptcopyout(sopt, &optval, sizeof optval); 3349 break; 3350 3351 case SO_DOMAIN: 3352 optval = so->so_proto->pr_domain->dom_family; 3353 goto integer; 3354 3355 case SO_TYPE: 3356 optval = so->so_type; 3357 goto integer; 3358 3359 case SO_PROTOCOL: 3360 optval = so->so_proto->pr_protocol; 3361 goto integer; 3362 3363 case SO_ERROR: 3364 SOCK_LOCK(so); 3365 if (so->so_error) { 3366 optval = so->so_error; 3367 so->so_error = 0; 3368 } else { 3369 optval = so->so_rerror; 3370 so->so_rerror = 0; 3371 } 3372 SOCK_UNLOCK(so); 3373 goto integer; 3374 3375 case SO_SNDBUF: 3376 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 3377 so->so_snd.sb_hiwat; 3378 goto integer; 3379 3380 case SO_RCVBUF: 3381 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 3382 so->so_rcv.sb_hiwat; 3383 goto integer; 3384 3385 case SO_SNDLOWAT: 3386 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 3387 so->so_snd.sb_lowat; 3388 goto integer; 3389 3390 case SO_RCVLOWAT: 3391 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 3392 so->so_rcv.sb_lowat; 3393 goto integer; 3394 3395 case SO_SNDTIMEO: 3396 case SO_RCVTIMEO: 3397 SOCK_LOCK(so); 3398 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 3399 (SOLISTENING(so) ? so->sol_sbsnd_timeo : 3400 so->so_snd.sb_timeo) : 3401 (SOLISTENING(so) ? so->sol_sbrcv_timeo : 3402 so->so_rcv.sb_timeo)); 3403 SOCK_UNLOCK(so); 3404 #ifdef COMPAT_FREEBSD32 3405 if (SV_CURPROC_FLAG(SV_ILP32)) { 3406 struct timeval32 tv32; 3407 3408 CP(tv, tv32, tv_sec); 3409 CP(tv, tv32, tv_usec); 3410 error = sooptcopyout(sopt, &tv32, sizeof tv32); 3411 } else 3412 #endif 3413 error = sooptcopyout(sopt, &tv, sizeof tv); 3414 break; 3415 3416 case SO_LABEL: 3417 #ifdef MAC 3418 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3419 sizeof(extmac)); 3420 if (error) 3421 goto bad; 3422 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 3423 so, &extmac); 3424 if (error) 3425 goto bad; 3426 /* Don't copy out extmac, it is unchanged. 
*/ 3427 #else 3428 error = EOPNOTSUPP; 3429 #endif 3430 break; 3431 3432 case SO_PEERLABEL: 3433 #ifdef MAC 3434 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3435 sizeof(extmac)); 3436 if (error) 3437 goto bad; 3438 error = mac_getsockopt_peerlabel( 3439 sopt->sopt_td->td_ucred, so, &extmac); 3440 if (error) 3441 goto bad; 3442 /* Don't copy out extmac, it is unchanged. */ 3443 #else 3444 error = EOPNOTSUPP; 3445 #endif 3446 break; 3447 3448 case SO_LISTENQLIMIT: 3449 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 3450 goto integer; 3451 3452 case SO_LISTENQLEN: 3453 optval = SOLISTENING(so) ? so->sol_qlen : 0; 3454 goto integer; 3455 3456 case SO_LISTENINCQLEN: 3457 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 3458 goto integer; 3459 3460 case SO_TS_CLOCK: 3461 optval = so->so_ts_clock; 3462 goto integer; 3463 3464 case SO_MAX_PACING_RATE: 3465 optval = so->so_max_pacing_rate; 3466 goto integer; 3467 3468 default: 3469 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3470 error = hhook_run_socket(so, sopt, 3471 HHOOK_SOCKET_OPT); 3472 else 3473 error = ENOPROTOOPT; 3474 break; 3475 } 3476 } 3477 #ifdef MAC 3478 bad: 3479 #endif 3480 CURVNET_RESTORE(); 3481 return (error); 3482 } 3483 3484 int 3485 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 3486 { 3487 struct mbuf *m, *m_prev; 3488 int sopt_size = sopt->sopt_valsize; 3489 3490 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3491 if (m == NULL) 3492 return ENOBUFS; 3493 if (sopt_size > MLEN) { 3494 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 3495 if ((m->m_flags & M_EXT) == 0) { 3496 m_free(m); 3497 return ENOBUFS; 3498 } 3499 m->m_len = min(MCLBYTES, sopt_size); 3500 } else { 3501 m->m_len = min(MLEN, sopt_size); 3502 } 3503 sopt_size -= m->m_len; 3504 *mp = m; 3505 m_prev = m; 3506 3507 while (sopt_size) { 3508 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3509 if (m == NULL) { 3510 m_freem(*mp); 3511 return ENOBUFS; 3512 } 3513 if (sopt_size > MLEN) { 3514 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 3515 M_NOWAIT); 3516 if ((m->m_flags & M_EXT) == 0) { 3517 m_freem(m); 3518 m_freem(*mp); 3519 return ENOBUFS; 3520 } 3521 m->m_len = min(MCLBYTES, sopt_size); 3522 } else { 3523 m->m_len = min(MLEN, sopt_size); 3524 } 3525 sopt_size -= m->m_len; 3526 m_prev->m_next = m; 3527 m_prev = m; 3528 } 3529 return (0); 3530 } 3531 3532 int 3533 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 3534 { 3535 struct mbuf *m0 = m; 3536 3537 if (sopt->sopt_val == NULL) 3538 return (0); 3539 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3540 if (sopt->sopt_td != NULL) { 3541 int error; 3542 3543 error = copyin(sopt->sopt_val, mtod(m, char *), 3544 m->m_len); 3545 if (error != 0) { 3546 m_freem(m0); 3547 return(error); 3548 } 3549 } else 3550 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 3551 sopt->sopt_valsize -= m->m_len; 3552 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3553 m = m->m_next; 3554 } 3555 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 3556 panic("ip6_sooptmcopyin"); 3557 return (0); 3558 } 3559 3560 int 3561 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 3562 { 3563 struct mbuf *m0 = m; 3564 size_t valsize = 0; 3565 3566 if (sopt->sopt_val == NULL) 3567 return (0); 3568 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3569 if (sopt->sopt_td != NULL) { 3570 int error; 3571 3572 error = copyout(mtod(m, char *), sopt->sopt_val, 3573 m->m_len); 3574 if (error != 0) { 3575 m_freem(m0); 3576 return(error); 3577 } 3578 } else 3579 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 3580 sopt->sopt_valsize -= m->m_len; 3581 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3582 valsize += m->m_len; 3583 m = m->m_next; 3584 } 3585 if (m != NULL) { 3586 /* enough soopt buffer should be given from user-land */ 3587 m_freem(m0); 3588 return(EINVAL); 3589 } 3590 sopt->sopt_valsize = valsize; 3591 return (0); 3592 } 3593 3594 /* 3595 * sohasoutofband(): protocol notifies socket layer of the arrival of new 3596 * out-of-band data, which will then notify socket consumers. 3597 */ 3598 void 3599 sohasoutofband(struct socket *so) 3600 { 3601 3602 if (so->so_sigio != NULL) 3603 pgsigio(&so->so_sigio, SIGURG, 0); 3604 selwakeuppri(&so->so_rdsel, PSOCK); 3605 } 3606 3607 int 3608 sopoll(struct socket *so, int events, struct ucred *active_cred, 3609 struct thread *td) 3610 { 3611 3612 /* 3613 * We do not need to set or assert curvnet as long as everyone uses 3614 * sopoll_generic(). 
3615 */ 3616 return (so->so_proto->pr_sopoll(so, events, active_cred, td)); 3617 } 3618 3619 int 3620 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 3621 struct thread *td) 3622 { 3623 int revents; 3624 3625 SOCK_LOCK(so); 3626 if (SOLISTENING(so)) { 3627 if (!(events & (POLLIN | POLLRDNORM))) 3628 revents = 0; 3629 else if (!TAILQ_EMPTY(&so->sol_comp)) 3630 revents = events & (POLLIN | POLLRDNORM); 3631 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 3632 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 3633 else { 3634 selrecord(td, &so->so_rdsel); 3635 revents = 0; 3636 } 3637 } else { 3638 revents = 0; 3639 SOCK_SENDBUF_LOCK(so); 3640 SOCK_RECVBUF_LOCK(so); 3641 if (events & (POLLIN | POLLRDNORM)) 3642 if (soreadabledata(so)) 3643 revents |= events & (POLLIN | POLLRDNORM); 3644 if (events & (POLLOUT | POLLWRNORM)) 3645 if (sowriteable(so)) 3646 revents |= events & (POLLOUT | POLLWRNORM); 3647 if (events & (POLLPRI | POLLRDBAND)) 3648 if (so->so_oobmark || 3649 (so->so_rcv.sb_state & SBS_RCVATMARK)) 3650 revents |= events & (POLLPRI | POLLRDBAND); 3651 if ((events & POLLINIGNEOF) == 0) { 3652 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3653 revents |= events & (POLLIN | POLLRDNORM); 3654 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 3655 revents |= POLLHUP; 3656 } 3657 } 3658 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 3659 revents |= events & POLLRDHUP; 3660 if (revents == 0) { 3661 if (events & 3662 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { 3663 selrecord(td, &so->so_rdsel); 3664 so->so_rcv.sb_flags |= SB_SEL; 3665 } 3666 if (events & (POLLOUT | POLLWRNORM)) { 3667 selrecord(td, &so->so_wrsel); 3668 so->so_snd.sb_flags |= SB_SEL; 3669 } 3670 } 3671 SOCK_RECVBUF_UNLOCK(so); 3672 SOCK_SENDBUF_UNLOCK(so); 3673 } 3674 SOCK_UNLOCK(so); 3675 return (revents); 3676 } 3677 3678 int 3679 soo_kqfilter(struct file *fp, struct knote *kn) 3680 { 3681 struct socket *so = kn->kn_fp->f_data; 3682 struct sockbuf *sb; 3683 sb_which which; 3684 struct knlist *knl; 3685 3686 switch (kn->kn_filter) { 3687 case EVFILT_READ: 3688 kn->kn_fop = &soread_filtops; 3689 knl = &so->so_rdsel.si_note; 3690 sb = &so->so_rcv; 3691 which = SO_RCV; 3692 break; 3693 case EVFILT_WRITE: 3694 kn->kn_fop = &sowrite_filtops; 3695 knl = &so->so_wrsel.si_note; 3696 sb = &so->so_snd; 3697 which = SO_SND; 3698 break; 3699 case EVFILT_EMPTY: 3700 kn->kn_fop = &soempty_filtops; 3701 knl = &so->so_wrsel.si_note; 3702 sb = &so->so_snd; 3703 which = SO_SND; 3704 break; 3705 default: 3706 return (EINVAL); 3707 } 3708 3709 SOCK_LOCK(so); 3710 if (SOLISTENING(so)) { 3711 knlist_add(knl, kn, 1); 3712 } else { 3713 SOCK_BUF_LOCK(so, which); 3714 knlist_add(knl, kn, 1); 3715 sb->sb_flags |= SB_KNOTE; 3716 SOCK_BUF_UNLOCK(so, which); 3717 } 3718 SOCK_UNLOCK(so); 3719 return (0); 3720 } 3721 3722 static void 3723 filt_sordetach(struct knote *kn) 3724 { 3725 struct socket *so = kn->kn_fp->f_data; 3726 3727 so_rdknl_lock(so); 3728 knlist_remove(&so->so_rdsel.si_note, kn, 1); 3729 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 3730 so->so_rcv.sb_flags &= ~SB_KNOTE; 3731 so_rdknl_unlock(so); 3732 } 3733 3734 /*ARGSUSED*/ 3735 static int 3736 filt_soread(struct knote *kn, long hint) 3737 { 3738 struct socket *so; 3739 3740 so = kn->kn_fp->f_data; 3741 3742 if (SOLISTENING(so)) { 3743 SOCK_LOCK_ASSERT(so); 3744 kn->kn_data = so->sol_qlen; 3745 if (so->so_error) { 3746 kn->kn_flags |= EV_EOF; 3747 kn->kn_fflags = so->so_error; 3748 return (1); 3749 } 3750 return 
(!TAILQ_EMPTY(&so->sol_comp)); 3751 } 3752 3753 SOCK_RECVBUF_LOCK_ASSERT(so); 3754 3755 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 3756 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3757 kn->kn_flags |= EV_EOF; 3758 kn->kn_fflags = so->so_error; 3759 return (1); 3760 } else if (so->so_error || so->so_rerror) 3761 return (1); 3762 3763 if (kn->kn_sfflags & NOTE_LOWAT) { 3764 if (kn->kn_data >= kn->kn_sdata) 3765 return (1); 3766 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 3767 return (1); 3768 3769 /* This hook returning non-zero indicates an event, not error */ 3770 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 3771 } 3772 3773 static void 3774 filt_sowdetach(struct knote *kn) 3775 { 3776 struct socket *so = kn->kn_fp->f_data; 3777 3778 so_wrknl_lock(so); 3779 knlist_remove(&so->so_wrsel.si_note, kn, 1); 3780 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 3781 so->so_snd.sb_flags &= ~SB_KNOTE; 3782 so_wrknl_unlock(so); 3783 } 3784 3785 /*ARGSUSED*/ 3786 static int 3787 filt_sowrite(struct knote *kn, long hint) 3788 { 3789 struct socket *so; 3790 3791 so = kn->kn_fp->f_data; 3792 3793 if (SOLISTENING(so)) 3794 return (0); 3795 3796 SOCK_SENDBUF_LOCK_ASSERT(so); 3797 kn->kn_data = sbspace(&so->so_snd); 3798 3799 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 3800 3801 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3802 kn->kn_flags |= EV_EOF; 3803 kn->kn_fflags = so->so_error; 3804 return (1); 3805 } else if (so->so_error) /* temporary udp error */ 3806 return (1); 3807 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3808 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3809 return (0); 3810 else if (kn->kn_sfflags & NOTE_LOWAT) 3811 return (kn->kn_data >= kn->kn_sdata); 3812 else 3813 return (kn->kn_data >= so->so_snd.sb_lowat); 3814 } 3815 3816 static int 3817 filt_soempty(struct knote *kn, long hint) 3818 { 3819 struct socket *so; 3820 3821 so = kn->kn_fp->f_data; 3822 3823 if (SOLISTENING(so)) 3824 return (1); 3825 3826 SOCK_SENDBUF_LOCK_ASSERT(so); 3827 kn->kn_data = sbused(&so->so_snd); 3828 3829 if (kn->kn_data == 0) 3830 return (1); 3831 else 3832 return (0); 3833 } 3834 3835 int 3836 socheckuid(struct socket *so, uid_t uid) 3837 { 3838 3839 if (so == NULL) 3840 return (EPERM); 3841 if (so->so_cred->cr_uid != uid) 3842 return (EPERM); 3843 return (0); 3844 } 3845 3846 /* 3847 * These functions are used by protocols to notify the socket layer (and its 3848 * consumers) of state changes in the sockets driven by protocol-side events. 3849 */ 3850 3851 /* 3852 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3853 * 3854 * Normal sequence from the active (originating) side is that 3855 * soisconnecting() is called during processing of connect() call, resulting 3856 * in an eventual call to soisconnected() if/when the connection is 3857 * established. When the connection is torn down soisdisconnecting() is 3858 * called during processing of disconnect() call, and soisdisconnected() is 3859 * called when the connection to the peer is totally severed. The semantics 3860 * of these routines are such that connectionless protocols can call 3861 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3862 * calls when setting up a ``connection'' takes no time. 3863 * 3864 * From the passive side, a socket is created with two queues of sockets: 3865 * so_incomp for connections in progress and so_comp for connections already 3866 * made and awaiting user acceptance. 
As a protocol is preparing incoming 3867 * connections, it creates a socket structure queued on so_incomp by calling 3868 * sonewconn(). When the connection is established, soisconnected() is 3869 * called, and transfers the socket structure to so_comp, making it available 3870 * to accept(). 3871 * 3872 * If a socket is closed with sockets on either so_incomp or so_comp, these 3873 * sockets are dropped. 3874 * 3875 * If higher-level protocols are implemented in the kernel, the wakeups done 3876 * here will sometimes cause software-interrupt process scheduling. 3877 */ 3878 void 3879 soisconnecting(struct socket *so) 3880 { 3881 3882 SOCK_LOCK(so); 3883 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3884 so->so_state |= SS_ISCONNECTING; 3885 SOCK_UNLOCK(so); 3886 } 3887 3888 void 3889 soisconnected(struct socket *so) 3890 { 3891 bool last __diagused; 3892 3893 SOCK_LOCK(so); 3894 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); 3895 so->so_state |= SS_ISCONNECTED; 3896 3897 if (so->so_qstate == SQ_INCOMP) { 3898 struct socket *head = so->so_listen; 3899 int ret; 3900 3901 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 3902 /* 3903 * Promoting a socket from incomplete queue to complete, we 3904 * need to go through reverse order of locking. We first do 3905 * trylock, and if that doesn't succeed, we go the hard way 3906 * leaving a reference and rechecking consistency after proper 3907 * locking. 3908 */ 3909 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 3910 soref(head); 3911 SOCK_UNLOCK(so); 3912 SOLISTEN_LOCK(head); 3913 SOCK_LOCK(so); 3914 if (__predict_false(head != so->so_listen)) { 3915 /* 3916 * The socket went off the listen queue, 3917 * should be lost race to close(2) of sol. 3918 * The socket is about to soabort(). 3919 */ 3920 SOCK_UNLOCK(so); 3921 sorele_locked(head); 3922 return; 3923 } 3924 last = refcount_release(&head->so_count); 3925 KASSERT(!last, ("%s: released last reference for %p", 3926 __func__, head)); 3927 } 3928 again: 3929 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3930 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 3931 head->sol_incqlen--; 3932 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 3933 head->sol_qlen++; 3934 so->so_qstate = SQ_COMP; 3935 SOCK_UNLOCK(so); 3936 solisten_wakeup(head); /* unlocks */ 3937 } else { 3938 SOCK_RECVBUF_LOCK(so); 3939 soupcall_set(so, SO_RCV, 3940 head->sol_accept_filter->accf_callback, 3941 head->sol_accept_filter_arg); 3942 so->so_options &= ~SO_ACCEPTFILTER; 3943 ret = head->sol_accept_filter->accf_callback(so, 3944 head->sol_accept_filter_arg, M_NOWAIT); 3945 if (ret == SU_ISCONNECTED) { 3946 soupcall_clear(so, SO_RCV); 3947 SOCK_RECVBUF_UNLOCK(so); 3948 goto again; 3949 } 3950 SOCK_RECVBUF_UNLOCK(so); 3951 SOCK_UNLOCK(so); 3952 SOLISTEN_UNLOCK(head); 3953 } 3954 return; 3955 } 3956 SOCK_UNLOCK(so); 3957 wakeup(&so->so_timeo); 3958 sorwakeup(so); 3959 sowwakeup(so); 3960 } 3961 3962 void 3963 soisdisconnecting(struct socket *so) 3964 { 3965 3966 SOCK_LOCK(so); 3967 so->so_state &= ~SS_ISCONNECTING; 3968 so->so_state |= SS_ISDISCONNECTING; 3969 3970 if (!SOLISTENING(so)) { 3971 SOCK_RECVBUF_LOCK(so); 3972 socantrcvmore_locked(so); 3973 SOCK_SENDBUF_LOCK(so); 3974 socantsendmore_locked(so); 3975 } 3976 SOCK_UNLOCK(so); 3977 wakeup(&so->so_timeo); 3978 } 3979 3980 void 3981 soisdisconnected(struct socket *so) 3982 { 3983 3984 SOCK_LOCK(so); 3985 3986 /* 3987 * There is at least one reader of so_state that does not 3988 * acquire socket lock, namely soreceive_generic(). 

void
soisdisconnected(struct socket *so)
{

	SOCK_LOCK(so);

	/*
	 * There is at least one reader of so_state that does not acquire
	 * the socket lock, namely soreceive_generic().  Ensure that it
	 * never observes a state in which all of the flags tracking the
	 * connection status are clear, by ordering the two updates with
	 * the release semantics of the thread fence.
	 */
	so->so_state |= SS_ISDISCONNECTED;
	atomic_thread_fence_rel();
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);

	if (!SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		SOCK_RECVBUF_LOCK(so);
		socantrcvmore_locked(so);
		SOCK_SENDBUF_LOCK(so);
		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
		socantsendmore_locked(so);
	} else
		SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

/*
 * Serialize I/O on a socket using the given sx(9) lock.  Fails with
 * ENOTCONN if the socket has been converted to a listening socket.
 */
int
soiolock(struct socket *so, struct sx *sx, int flags)
{
	int error;

	KASSERT((flags & SBL_VALID) == flags,
	    ("soiolock: invalid flags %#x", flags));

	if ((flags & SBL_WAIT) != 0) {
		if ((flags & SBL_NOINTR) != 0) {
			sx_xlock(sx);
		} else {
			error = sx_xlock_sig(sx);
			if (error != 0)
				return (error);
		}
	} else if (!sx_try_xlock(sx)) {
		return (EWOULDBLOCK);
	}

	if (__predict_false(SOLISTENING(so))) {
		sx_xunlock(sx);
		return (ENOTCONN);
	}
	return (0);
}

void
soiounlock(struct sx *sx)
{
	sx_xunlock(sx);
}

/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
sodupsockaddr(const struct sockaddr *sa, int mflags)
{
	struct sockaddr *sa2;

	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return (sa2);
}

/*
 * Register a per-socket destructor.
 */
void
sodtor_set(struct socket *so, so_dtor_t *func)
{

	SOCK_LOCK_ASSERT(so);
	so->so_dtor = func;
}
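
/*
 * Example (sketch): sodupsockaddr() is used when a caller needs its own
 * copy of an address that it will free independently; the copy must be
 * released with free(9) using the same M_SONAME type ("sa" is assumed to be
 * a valid sockaddr supplied by a protocol):
 *
 *	struct sockaddr *copy;
 *
 *	copy = sodupsockaddr(sa, M_NOWAIT);
 *	if (copy == NULL)
 *		return (ENOMEM);
 *	...
 *	free(copy, M_SONAME);
 */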

/*
 * Register per-socket buffer upcalls.
 */
void
soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg)
{
	struct sockbuf *sb;

	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

	switch (which) {
	case SO_RCV:
		sb = &so->so_rcv;
		break;
	case SO_SND:
		sb = &so->so_snd;
		break;
	}
	SOCK_BUF_LOCK_ASSERT(so, which);
	sb->sb_upcall = func;
	sb->sb_upcallarg = arg;
	sb->sb_flags |= SB_UPCALL;
}

void
soupcall_clear(struct socket *so, sb_which which)
{
	struct sockbuf *sb;

	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

	switch (which) {
	case SO_RCV:
		sb = &so->so_rcv;
		break;
	case SO_SND:
		sb = &so->so_snd;
		break;
	}
	SOCK_BUF_LOCK_ASSERT(so, which);
	KASSERT(sb->sb_upcall != NULL,
	    ("%s: so %p no upcall to clear", __func__, so));
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~SB_UPCALL;
}

void
solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
{

	SOLISTEN_LOCK_ASSERT(so);
	so->sol_upcall = func;
	so->sol_upcallarg = arg;
}

static void
so_rdknl_lock(void *arg)
{
	struct socket *so = arg;

retry:
	if (SOLISTENING(so)) {
		SOLISTEN_LOCK(so);
	} else {
		SOCK_RECVBUF_LOCK(so);
		if (__predict_false(SOLISTENING(so))) {
			SOCK_RECVBUF_UNLOCK(so);
			goto retry;
		}
	}
}

static void
so_rdknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOLISTEN_UNLOCK(so);
	else
		SOCK_RECVBUF_UNLOCK(so);
}

static void
so_rdknl_assert_lock(void *arg, int what)
{
	struct socket *so = arg;

	if (what == LA_LOCKED) {
		if (SOLISTENING(so))
			SOLISTEN_LOCK_ASSERT(so);
		else
			SOCK_RECVBUF_LOCK_ASSERT(so);
	} else {
		if (SOLISTENING(so))
			SOLISTEN_UNLOCK_ASSERT(so);
		else
			SOCK_RECVBUF_UNLOCK_ASSERT(so);
	}
}

static void
so_wrknl_lock(void *arg)
{
	struct socket *so = arg;

retry:
	if (SOLISTENING(so)) {
		SOLISTEN_LOCK(so);
	} else {
		SOCK_SENDBUF_LOCK(so);
		if (__predict_false(SOLISTENING(so))) {
			SOCK_SENDBUF_UNLOCK(so);
			goto retry;
		}
	}
}

static void
so_wrknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOLISTEN_UNLOCK(so);
	else
		SOCK_SENDBUF_UNLOCK(so);
}

static void
so_wrknl_assert_lock(void *arg, int what)
{
	struct socket *so = arg;

	if (what == LA_LOCKED) {
		if (SOLISTENING(so))
			SOLISTEN_LOCK_ASSERT(so);
		else
			SOCK_SENDBUF_LOCK_ASSERT(so);
	} else {
		if (SOLISTENING(so))
			SOLISTEN_UNLOCK_ASSERT(so);
		else
			SOCK_SENDBUF_UNLOCK_ASSERT(so);
	}
}
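
/*
 * Example (sketch): an in-kernel consumer that wants to be notified when
 * data arrives on a non-listening socket can register a receive-buffer
 * upcall.  The relevant buffer lock must be held around soupcall_set() (see
 * the SOCK_BUF_LOCK_ASSERT() above), and the callback is typically invoked
 * with that lock held.  "my_rcv_upcall" and "arg" are hypothetical:
 *
 *	static int
 *	my_rcv_upcall(struct socket *so, void *arg, int waitflag)
 *	{
 *		// wake up or schedule the consumer here
 *		return (SU_OK);
 *	}
 *
 *	SOCK_RECVBUF_LOCK(so);
 *	soupcall_set(so, SO_RCV, my_rcv_upcall, arg);
 *	SOCK_RECVBUF_UNLOCK(so);
 */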

/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to
 * provide information-hiding if we decide that some of this information
 * should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{

	bzero(xso, sizeof(*xso));
	xso->xso_len = sizeof(*xso);
	xso->xso_so = (uintptr_t)so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (uintptr_t)so->so_pcb;
	xso->xso_protocol = so->so_proto->pr_protocol;
	xso->xso_family = so->so_proto->pr_domain->dom_family;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_uid = so->so_cred->cr_uid;
	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
	if (SOLISTENING(so)) {
		xso->so_qlen = so->sol_qlen;
		xso->so_incqlen = so->sol_incqlen;
		xso->so_qlimit = so->sol_qlimit;
		xso->so_oobmark = 0;
	} else {
		xso->so_state |= so->so_qstate;
		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
		xso->so_oobmark = so->so_oobmark;
		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	}
}

struct sockbuf *
so_sockbuf_rcv(struct socket *so)
{

	return (&so->so_rcv);
}

struct sockbuf *
so_sockbuf_snd(struct socket *so)
{

	return (&so->so_snd);
}

int
so_state_get(const struct socket *so)
{

	return (so->so_state);
}

void
so_state_set(struct socket *so, int val)
{

	so->so_state = val;
}

int
so_options_get(const struct socket *so)
{

	return (so->so_options);
}

void
so_options_set(struct socket *so, int val)
{

	so->so_options = val;
}

int
so_error_get(const struct socket *so)
{

	return (so->so_error);
}

void
so_error_set(struct socket *so, int val)
{

	so->so_error = val;
}

int
so_linger_get(const struct socket *so)
{

	return (so->so_linger);
}

void
so_linger_set(struct socket *so, int val)
{

	KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
	    ("%s: val %d out of range", __func__, val));

	so->so_linger = val;
}

struct protosw *
so_protosw_get(const struct socket *so)
{

	return (so->so_proto);
}

void
so_protosw_set(struct socket *so, struct protosw *val)
{

	so->so_proto = val;
}

void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}

void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}

void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}

void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}

void
so_lock(struct socket *so)
{

	SOCK_LOCK(so);
}

void
so_unlock(struct socket *so)
{

	SOCK_UNLOCK(so);
}
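
/*
 * Example (sketch): the so_*_get()/so_*_set() accessors above let code
 * outside the socket layer manipulate socket state without reaching into
 * struct socket directly, and sotoxsocket() is the matching export path to
 * userland.  A pcblist-style sysctl handler typically does something
 * roughly like the following for each socket it reports:
 *
 *	struct xsocket xso;
 *
 *	sotoxsocket(so, &xso);
 *	error = SYSCTL_OUT(req, &xso, sizeof(xso));
 *
 * so that tools such as netstat(1) can inspect sockets without depending on
 * the kernel-internal layout of struct socket.
 */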