/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used to attempt to
 * free a socket whenever a reference is removed.  This is a socket layer
 * private interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs, the general rule is that callers do not set
 * curvnet.  Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
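 *
 * As a concrete illustration of that rule, the public entry points in this
 * file (sobind(), solisten(), sosend(), ...) bracket their calls into the
 * protocol with the socket's own vnet context, e.g. as in sobind():
 *
 *	CURVNET_SET(so->so_vnet);
 *	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 *	CURVNET_RESTORE();
 *
 * so a consumer normally does not need to enter a VNET context of its own.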
103 */ 104 105 #include <sys/cdefs.h> 106 __FBSDID("$FreeBSD$"); 107 108 #include "opt_inet.h" 109 #include "opt_inet6.h" 110 #include "opt_kern_tls.h" 111 #include "opt_sctp.h" 112 113 #include <sys/param.h> 114 #include <sys/systm.h> 115 #include <sys/capsicum.h> 116 #include <sys/fcntl.h> 117 #include <sys/limits.h> 118 #include <sys/lock.h> 119 #include <sys/mac.h> 120 #include <sys/malloc.h> 121 #include <sys/mbuf.h> 122 #include <sys/mutex.h> 123 #include <sys/domain.h> 124 #include <sys/file.h> /* for struct knote */ 125 #include <sys/hhook.h> 126 #include <sys/kernel.h> 127 #include <sys/khelp.h> 128 #include <sys/ktls.h> 129 #include <sys/event.h> 130 #include <sys/eventhandler.h> 131 #include <sys/poll.h> 132 #include <sys/proc.h> 133 #include <sys/protosw.h> 134 #include <sys/sbuf.h> 135 #include <sys/socket.h> 136 #include <sys/socketvar.h> 137 #include <sys/resourcevar.h> 138 #include <net/route.h> 139 #include <sys/signalvar.h> 140 #include <sys/stat.h> 141 #include <sys/sx.h> 142 #include <sys/sysctl.h> 143 #include <sys/taskqueue.h> 144 #include <sys/uio.h> 145 #include <sys/un.h> 146 #include <sys/unpcb.h> 147 #include <sys/jail.h> 148 #include <sys/syslog.h> 149 #include <netinet/in.h> 150 #include <netinet/in_pcb.h> 151 #include <netinet/tcp.h> 152 153 #include <net/vnet.h> 154 155 #include <security/mac/mac_framework.h> 156 157 #include <vm/uma.h> 158 159 #ifdef COMPAT_FREEBSD32 160 #include <sys/mount.h> 161 #include <sys/sysent.h> 162 #include <compat/freebsd32/freebsd32.h> 163 #endif 164 165 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 166 int flags); 167 static void so_rdknl_lock(void *); 168 static void so_rdknl_unlock(void *); 169 static void so_rdknl_assert_lock(void *, int); 170 static void so_wrknl_lock(void *); 171 static void so_wrknl_unlock(void *); 172 static void so_wrknl_assert_lock(void *, int); 173 174 static void filt_sordetach(struct knote *kn); 175 static int filt_soread(struct knote *kn, long hint); 176 static void filt_sowdetach(struct knote *kn); 177 static int filt_sowrite(struct knote *kn, long hint); 178 static int filt_soempty(struct knote *kn, long hint); 179 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); 180 fo_kqfilter_t soo_kqfilter; 181 182 static struct filterops soread_filtops = { 183 .f_isfd = 1, 184 .f_detach = filt_sordetach, 185 .f_event = filt_soread, 186 }; 187 static struct filterops sowrite_filtops = { 188 .f_isfd = 1, 189 .f_detach = filt_sowdetach, 190 .f_event = filt_sowrite, 191 }; 192 static struct filterops soempty_filtops = { 193 .f_isfd = 1, 194 .f_detach = filt_sowdetach, 195 .f_event = filt_soempty, 196 }; 197 198 so_gen_t so_gencnt; /* generation count for sockets */ 199 200 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 201 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 202 203 #define VNET_SO_ASSERT(so) \ 204 VNET_ASSERT(curvnet != NULL, \ 205 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 206 207 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 208 #define V_socket_hhh VNET(socket_hhh) 209 210 /* 211 * Limit on the number of connections in the listen queue waiting 212 * for accept(2). 213 * NB: The original sysctl somaxconn is still available but hidden 214 * to prevent confusion about the actual purpose of this number. 
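 *
 * The active limit is exposed below as the sysctl kern.ipc.soacceptqueue,
 * with kern.ipc.somaxconn retained only as a hidden compatibility alias,
 * so it can be inspected or raised at run time with, for example:
 *
 *	sysctl kern.ipc.soacceptqueue=1024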
215 */ 216 static u_int somaxconn = SOMAXCONN; 217 218 static int 219 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 220 { 221 int error; 222 int val; 223 224 val = somaxconn; 225 error = sysctl_handle_int(oidp, &val, 0, req); 226 if (error || !req->newptr ) 227 return (error); 228 229 /* 230 * The purpose of the UINT_MAX / 3 limit, is so that the formula 231 * 3 * so_qlimit / 2 232 * below, will not overflow. 233 */ 234 235 if (val < 1 || val > UINT_MAX / 3) 236 return (EINVAL); 237 238 somaxconn = val; 239 return (0); 240 } 241 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, 242 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int), 243 sysctl_somaxconn, "I", 244 "Maximum listen socket pending connection accept queue size"); 245 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 246 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0, 247 sizeof(int), sysctl_somaxconn, "I", 248 "Maximum listen socket pending connection accept queue size (compat)"); 249 250 static int numopensockets; 251 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 252 &numopensockets, 0, "Number of open sockets"); 253 254 /* 255 * accept_mtx locks down per-socket fields relating to accept queues. See 256 * socketvar.h for an annotation of the protected fields of struct socket. 257 */ 258 struct mtx accept_mtx; 259 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 260 261 /* 262 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 263 * so_gencnt field. 264 */ 265 static struct mtx so_global_mtx; 266 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 267 268 /* 269 * General IPC sysctl name space, used by sockets and a variety of other IPC 270 * types. 271 */ 272 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 273 "IPC"); 274 275 /* 276 * Initialize the socket subsystem and set up the socket 277 * memory allocator. 
278 */ 279 static uma_zone_t socket_zone; 280 int maxsockets; 281 282 static void 283 socket_zone_change(void *tag) 284 { 285 286 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 287 } 288 289 static void 290 socket_hhook_register(int subtype) 291 { 292 293 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 294 &V_socket_hhh[subtype], 295 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 296 printf("%s: WARNING: unable to register hook\n", __func__); 297 } 298 299 static void 300 socket_hhook_deregister(int subtype) 301 { 302 303 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 304 printf("%s: WARNING: unable to deregister hook\n", __func__); 305 } 306 307 static void 308 socket_init(void *tag) 309 { 310 311 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 312 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 313 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 314 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 315 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 316 EVENTHANDLER_PRI_FIRST); 317 } 318 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 319 320 static void 321 socket_vnet_init(const void *unused __unused) 322 { 323 int i; 324 325 /* We expect a contiguous range */ 326 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 327 socket_hhook_register(i); 328 } 329 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 330 socket_vnet_init, NULL); 331 332 static void 333 socket_vnet_uninit(const void *unused __unused) 334 { 335 int i; 336 337 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 338 socket_hhook_deregister(i); 339 } 340 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 341 socket_vnet_uninit, NULL); 342 343 /* 344 * Initialise maxsockets. This SYSINIT must be run after 345 * tunable_mbinit(). 346 */ 347 static void 348 init_maxsockets(void *ignored) 349 { 350 351 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 352 maxsockets = imax(maxsockets, maxfiles); 353 } 354 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 355 356 /* 357 * Sysctl to get and set the maximum global sockets limit. Notify protocols 358 * of the change so that they can update their dependent limits as required. 359 */ 360 static int 361 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 362 { 363 int error, newmaxsockets; 364 365 newmaxsockets = maxsockets; 366 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 367 if (error == 0 && req->newptr) { 368 if (newmaxsockets > maxsockets && 369 newmaxsockets <= maxfiles) { 370 maxsockets = newmaxsockets; 371 EVENTHANDLER_INVOKE(maxsockets_change); 372 } else 373 error = EINVAL; 374 } 375 return (error); 376 } 377 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, 378 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0, 379 sysctl_maxsockets, "IU", 380 "Maximum number of sockets available"); 381 382 /* 383 * Socket operation routines. These routines are called by the routines in 384 * sys_socket.c or from a system process, and implement the semantics of 385 * socket operations by switching out to the protocol specific routines. 386 */ 387 388 /* 389 * Get a socket structure from our zone, and initialize it. Note that it 390 * would probably be better to allocate socket and PCB at the same time, but 391 * I'm not convinced that all the protocols can be easily modified to do 392 * this. 393 * 394 * soalloc() returns a socket with a ref count of 0. 
395 */ 396 static struct socket * 397 soalloc(struct vnet *vnet) 398 { 399 struct socket *so; 400 401 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 402 if (so == NULL) 403 return (NULL); 404 #ifdef MAC 405 if (mac_socket_init(so, M_NOWAIT) != 0) { 406 uma_zfree(socket_zone, so); 407 return (NULL); 408 } 409 #endif 410 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { 411 uma_zfree(socket_zone, so); 412 return (NULL); 413 } 414 415 /* 416 * The socket locking protocol allows to lock 2 sockets at a time, 417 * however, the first one must be a listening socket. WITNESS lacks 418 * a feature to change class of an existing lock, so we use DUPOK. 419 */ 420 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); 421 so->so_snd.sb_mtx = &so->so_snd_mtx; 422 so->so_rcv.sb_mtx = &so->so_rcv_mtx; 423 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); 424 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); 425 so->so_rcv.sb_sel = &so->so_rdsel; 426 so->so_snd.sb_sel = &so->so_wrsel; 427 sx_init(&so->so_snd_sx, "so_snd_sx"); 428 sx_init(&so->so_rcv_sx, "so_rcv_sx"); 429 TAILQ_INIT(&so->so_snd.sb_aiojobq); 430 TAILQ_INIT(&so->so_rcv.sb_aiojobq); 431 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); 432 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); 433 #ifdef VIMAGE 434 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 435 __func__, __LINE__, so)); 436 so->so_vnet = vnet; 437 #endif 438 /* We shouldn't need the so_global_mtx */ 439 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { 440 /* Do we need more comprehensive error returns? */ 441 uma_zfree(socket_zone, so); 442 return (NULL); 443 } 444 mtx_lock(&so_global_mtx); 445 so->so_gencnt = ++so_gencnt; 446 ++numopensockets; 447 #ifdef VIMAGE 448 vnet->vnet_sockcnt++; 449 #endif 450 mtx_unlock(&so_global_mtx); 451 452 return (so); 453 } 454 455 /* 456 * Free the storage associated with a socket at the socket layer, tear down 457 * locks, labels, etc. All protocol state is assumed already to have been 458 * torn down (and possibly never set up) by the caller. 459 */ 460 static void 461 sodealloc(struct socket *so) 462 { 463 464 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 465 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 466 467 mtx_lock(&so_global_mtx); 468 so->so_gencnt = ++so_gencnt; 469 --numopensockets; /* Could be below, but faster here. */ 470 #ifdef VIMAGE 471 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 472 __func__, __LINE__, so)); 473 so->so_vnet->vnet_sockcnt--; 474 #endif 475 mtx_unlock(&so_global_mtx); 476 #ifdef MAC 477 mac_socket_destroy(so); 478 #endif 479 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); 480 481 khelp_destroy_osd(&so->osd); 482 if (SOLISTENING(so)) { 483 if (so->sol_accept_filter != NULL) 484 accept_filt_setopt(so, NULL); 485 } else { 486 if (so->so_rcv.sb_hiwat) 487 (void)chgsbsize(so->so_cred->cr_uidinfo, 488 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 489 if (so->so_snd.sb_hiwat) 490 (void)chgsbsize(so->so_cred->cr_uidinfo, 491 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 492 sx_destroy(&so->so_snd_sx); 493 sx_destroy(&so->so_rcv_sx); 494 SOCKBUF_LOCK_DESTROY(&so->so_snd); 495 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 496 } 497 crfree(so->so_cred); 498 mtx_destroy(&so->so_lock); 499 uma_zfree(socket_zone, so); 500 } 501 502 /* 503 * socreate returns a socket with a ref count of 1. The socket should be 504 * closed with soclose(). 
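 *
 * A minimal sketch of in-kernel consumer usage (error handling trimmed;
 * the choice of a UDP socket is arbitrary, and td is assumed to be the
 * calling thread):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... use the socket via sobind(), sosend(), soreceive(), ...
 *	soclose(so);
 *
 * soclose() releases the single reference that socreate() returned.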
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL) {
		/* No support for domain. */
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		/* No support for socket type. */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0)
		return (ECAPMODE);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_lock);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_lock);
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		sodealloc(so);
		return (error);
	}
	soref(so);
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
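 *
 * A very rough sketch of the protocol-side calling pattern (TCP's syncache
 * does approximately this when setting up a newly accepted connection;
 * details elided):
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		goto drop;
 *	... initialize the new connection's protocol state ...
 *	soisconnected(so);
 *
 * With connstatus 0 the new socket starts on the incomplete queue;
 * soisconnected() later moves it to the complete queue and wakes up
 * accept() on the listen socket.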
595 */ 596 struct socket * 597 sonewconn(struct socket *head, int connstatus) 598 { 599 struct sbuf descrsb; 600 struct socket *so; 601 int len, overcount; 602 u_int qlen; 603 const char localprefix[] = "local:"; 604 char descrbuf[SUNPATHLEN + sizeof(localprefix)]; 605 #if defined(INET6) 606 char addrbuf[INET6_ADDRSTRLEN]; 607 #elif defined(INET) 608 char addrbuf[INET_ADDRSTRLEN]; 609 #endif 610 bool dolog, over; 611 612 SOLISTEN_LOCK(head); 613 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 614 #ifdef REGRESSION 615 if (regression_sonewconn_earlytest && over) { 616 #else 617 if (over) { 618 #endif 619 head->sol_overcount++; 620 dolog = !!ratecheck(&head->sol_lastover, &overinterval); 621 622 /* 623 * If we're going to log, copy the overflow count and queue 624 * length from the listen socket before dropping the lock. 625 * Also, reset the overflow count. 626 */ 627 if (dolog) { 628 overcount = head->sol_overcount; 629 head->sol_overcount = 0; 630 qlen = head->sol_qlen; 631 } 632 SOLISTEN_UNLOCK(head); 633 634 if (dolog) { 635 /* 636 * Try to print something descriptive about the 637 * socket for the error message. 638 */ 639 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), 640 SBUF_FIXEDLEN); 641 switch (head->so_proto->pr_domain->dom_family) { 642 #if defined(INET) || defined(INET6) 643 #ifdef INET 644 case AF_INET: 645 #endif 646 #ifdef INET6 647 case AF_INET6: 648 if (head->so_proto->pr_domain->dom_family == 649 AF_INET6 || 650 (sotoinpcb(head)->inp_inc.inc_flags & 651 INC_ISIPV6)) { 652 ip6_sprintf(addrbuf, 653 &sotoinpcb(head)->inp_inc.inc6_laddr); 654 sbuf_printf(&descrsb, "[%s]", addrbuf); 655 } else 656 #endif 657 { 658 #ifdef INET 659 inet_ntoa_r( 660 sotoinpcb(head)->inp_inc.inc_laddr, 661 addrbuf); 662 sbuf_cat(&descrsb, addrbuf); 663 #endif 664 } 665 sbuf_printf(&descrsb, ":%hu (proto %u)", 666 ntohs(sotoinpcb(head)->inp_inc.inc_lport), 667 head->so_proto->pr_protocol); 668 break; 669 #endif /* INET || INET6 */ 670 case AF_UNIX: 671 sbuf_cat(&descrsb, localprefix); 672 if (sotounpcb(head)->unp_addr != NULL) 673 len = 674 sotounpcb(head)->unp_addr->sun_len - 675 offsetof(struct sockaddr_un, 676 sun_path); 677 else 678 len = 0; 679 if (len > 0) 680 sbuf_bcat(&descrsb, 681 sotounpcb(head)->unp_addr->sun_path, 682 len); 683 else 684 sbuf_cat(&descrsb, "(unknown)"); 685 break; 686 } 687 688 /* 689 * If we can't print something more specific, at least 690 * print the domain name. 
691 */ 692 if (sbuf_finish(&descrsb) != 0 || 693 sbuf_len(&descrsb) <= 0) { 694 sbuf_clear(&descrsb); 695 sbuf_cat(&descrsb, 696 head->so_proto->pr_domain->dom_name ?: 697 "unknown"); 698 sbuf_finish(&descrsb); 699 } 700 KASSERT(sbuf_len(&descrsb) > 0, 701 ("%s: sbuf creation failed", __func__)); 702 log(LOG_DEBUG, 703 "%s: pcb %p (%s): Listen queue overflow: " 704 "%i already in queue awaiting acceptance " 705 "(%d occurrences)\n", 706 __func__, head->so_pcb, sbuf_data(&descrsb), 707 qlen, overcount); 708 sbuf_delete(&descrsb); 709 710 overcount = 0; 711 } 712 713 return (NULL); 714 } 715 SOLISTEN_UNLOCK(head); 716 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 717 __func__, head)); 718 so = soalloc(head->so_vnet); 719 if (so == NULL) { 720 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 721 "limit reached or out of memory\n", 722 __func__, head->so_pcb); 723 return (NULL); 724 } 725 so->so_listen = head; 726 so->so_type = head->so_type; 727 so->so_options = head->so_options & ~SO_ACCEPTCONN; 728 so->so_linger = head->so_linger; 729 so->so_state = head->so_state | SS_NOFDREF; 730 so->so_fibnum = head->so_fibnum; 731 so->so_proto = head->so_proto; 732 so->so_cred = crhold(head->so_cred); 733 #ifdef MAC 734 mac_socket_newconn(head, so); 735 #endif 736 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 737 so_rdknl_assert_lock); 738 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 739 so_wrknl_assert_lock); 740 VNET_SO_ASSERT(head); 741 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { 742 sodealloc(so); 743 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 744 __func__, head->so_pcb); 745 return (NULL); 746 } 747 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 748 sodealloc(so); 749 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 750 __func__, head->so_pcb); 751 return (NULL); 752 } 753 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; 754 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 755 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 756 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 757 so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; 758 so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; 759 760 SOLISTEN_LOCK(head); 761 if (head->sol_accept_filter != NULL) 762 connstatus = 0; 763 so->so_state |= connstatus; 764 soref(head); /* A socket on (in)complete queue refs head. */ 765 if (connstatus) { 766 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 767 so->so_qstate = SQ_COMP; 768 head->sol_qlen++; 769 solisten_wakeup(head); /* unlocks */ 770 } else { 771 /* 772 * Keep removing sockets from the head until there's room for 773 * us to insert on the tail. In pre-locking revisions, this 774 * was a simple if(), but as we could be racing with other 775 * threads and soabort() requires dropping locks, we must 776 * loop waiting for the condition to be true. 
777 */ 778 while (head->sol_incqlen > head->sol_qlimit) { 779 struct socket *sp; 780 781 sp = TAILQ_FIRST(&head->sol_incomp); 782 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 783 head->sol_incqlen--; 784 SOCK_LOCK(sp); 785 sp->so_qstate = SQ_NONE; 786 sp->so_listen = NULL; 787 SOCK_UNLOCK(sp); 788 sorele(head); /* does SOLISTEN_UNLOCK, head stays */ 789 soabort(sp); 790 SOLISTEN_LOCK(head); 791 } 792 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 793 so->so_qstate = SQ_INCOMP; 794 head->sol_incqlen++; 795 SOLISTEN_UNLOCK(head); 796 } 797 return (so); 798 } 799 800 #if defined(SCTP) || defined(SCTP_SUPPORT) 801 /* 802 * Socket part of sctp_peeloff(). Detach a new socket from an 803 * association. The new socket is returned with a reference. 804 */ 805 struct socket * 806 sopeeloff(struct socket *head) 807 { 808 struct socket *so; 809 810 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 811 __func__, __LINE__, head)); 812 so = soalloc(head->so_vnet); 813 if (so == NULL) { 814 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 815 "limit reached or out of memory\n", 816 __func__, head->so_pcb); 817 return (NULL); 818 } 819 so->so_type = head->so_type; 820 so->so_options = head->so_options; 821 so->so_linger = head->so_linger; 822 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 823 so->so_fibnum = head->so_fibnum; 824 so->so_proto = head->so_proto; 825 so->so_cred = crhold(head->so_cred); 826 #ifdef MAC 827 mac_socket_newconn(head, so); 828 #endif 829 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 830 so_rdknl_assert_lock); 831 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 832 so_wrknl_assert_lock); 833 VNET_SO_ASSERT(head); 834 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 835 sodealloc(so); 836 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 837 __func__, head->so_pcb); 838 return (NULL); 839 } 840 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 841 sodealloc(so); 842 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 843 __func__, head->so_pcb); 844 return (NULL); 845 } 846 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 847 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 848 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 849 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 850 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 851 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 852 853 soref(so); 854 855 return (so); 856 } 857 #endif /* SCTP */ 858 859 int 860 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 861 { 862 int error; 863 864 CURVNET_SET(so->so_vnet); 865 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); 866 CURVNET_RESTORE(); 867 return (error); 868 } 869 870 int 871 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 872 { 873 int error; 874 875 CURVNET_SET(so->so_vnet); 876 error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); 877 CURVNET_RESTORE(); 878 return (error); 879 } 880 881 /* 882 * solisten() transitions a socket from a non-listening state to a listening 883 * state, but can also be used to update the listen queue depth on an 884 * existing listen socket. The protocol will call back into the sockets 885 * layer using solisten_proto_check() and solisten_proto() to check and set 886 * socket-layer listen state. Call backs are used so that the protocol can 887 * acquire both protocol and socket layer locks in whatever order is required 888 * by the protocol. 
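 *
 * A condensed sketch of the shape a protocol's pru_listen method ends up
 * with (modelled loosely on the TCP case; the protocol's own PCB locking
 * and preparation, e.g. binding a port, are elided):
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0) {
 *		... protocol-specific preparation ...
 *		if (that preparation failed)
 *			solisten_proto_abort(so);
 *		else
 *			solisten_proto(so, backlog);
 *	}
 *	SOCK_UNLOCK(so);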
889 * 890 * Protocol implementors are advised to hold the socket lock across the 891 * socket-layer test and set to avoid races at the socket layer. 892 */ 893 int 894 solisten(struct socket *so, int backlog, struct thread *td) 895 { 896 int error; 897 898 CURVNET_SET(so->so_vnet); 899 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); 900 CURVNET_RESTORE(); 901 return (error); 902 } 903 904 /* 905 * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in 906 * order to interlock with socket I/O. 907 */ 908 int 909 solisten_proto_check(struct socket *so) 910 { 911 SOCK_LOCK_ASSERT(so); 912 913 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 914 SS_ISDISCONNECTING)) != 0) 915 return (EINVAL); 916 917 /* 918 * Sleeping is not permitted here, so simply fail if userspace is 919 * attempting to transmit or receive on the socket. This kind of 920 * transient failure is not ideal, but it should occur only if userspace 921 * is misusing the socket interfaces. 922 */ 923 if (!sx_try_xlock(&so->so_snd_sx)) 924 return (EAGAIN); 925 if (!sx_try_xlock(&so->so_rcv_sx)) { 926 sx_xunlock(&so->so_snd_sx); 927 return (EAGAIN); 928 } 929 mtx_lock(&so->so_snd_mtx); 930 mtx_lock(&so->so_rcv_mtx); 931 return (0); 932 } 933 934 /* 935 * Undo the setup done by solisten_proto_check(). 936 */ 937 void 938 solisten_proto_abort(struct socket *so) 939 { 940 mtx_unlock(&so->so_snd_mtx); 941 mtx_unlock(&so->so_rcv_mtx); 942 sx_xunlock(&so->so_snd_sx); 943 sx_xunlock(&so->so_rcv_sx); 944 } 945 946 void 947 solisten_proto(struct socket *so, int backlog) 948 { 949 int sbrcv_lowat, sbsnd_lowat; 950 u_int sbrcv_hiwat, sbsnd_hiwat; 951 short sbrcv_flags, sbsnd_flags; 952 sbintime_t sbrcv_timeo, sbsnd_timeo; 953 954 SOCK_LOCK_ASSERT(so); 955 KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 956 SS_ISDISCONNECTING)) == 0, 957 ("%s: bad socket state %p", __func__, so)); 958 959 if (SOLISTENING(so)) 960 goto listening; 961 962 /* 963 * Change this socket to listening state. 964 */ 965 sbrcv_lowat = so->so_rcv.sb_lowat; 966 sbsnd_lowat = so->so_snd.sb_lowat; 967 sbrcv_hiwat = so->so_rcv.sb_hiwat; 968 sbsnd_hiwat = so->so_snd.sb_hiwat; 969 sbrcv_flags = so->so_rcv.sb_flags; 970 sbsnd_flags = so->so_snd.sb_flags; 971 sbrcv_timeo = so->so_rcv.sb_timeo; 972 sbsnd_timeo = so->so_snd.sb_timeo; 973 974 sbdestroy(&so->so_snd, so); 975 sbdestroy(&so->so_rcv, so); 976 977 #ifdef INVARIANTS 978 bzero(&so->so_rcv, 979 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 980 #endif 981 982 so->sol_sbrcv_lowat = sbrcv_lowat; 983 so->sol_sbsnd_lowat = sbsnd_lowat; 984 so->sol_sbrcv_hiwat = sbrcv_hiwat; 985 so->sol_sbsnd_hiwat = sbsnd_hiwat; 986 so->sol_sbrcv_flags = sbrcv_flags; 987 so->sol_sbsnd_flags = sbsnd_flags; 988 so->sol_sbrcv_timeo = sbrcv_timeo; 989 so->sol_sbsnd_timeo = sbsnd_timeo; 990 991 so->sol_qlen = so->sol_incqlen = 0; 992 TAILQ_INIT(&so->sol_incomp); 993 TAILQ_INIT(&so->sol_comp); 994 995 so->sol_accept_filter = NULL; 996 so->sol_accept_filter_arg = NULL; 997 so->sol_accept_filter_str = NULL; 998 999 so->sol_upcall = NULL; 1000 so->sol_upcallarg = NULL; 1001 1002 so->so_options |= SO_ACCEPTCONN; 1003 1004 listening: 1005 if (backlog < 0 || backlog > somaxconn) 1006 backlog = somaxconn; 1007 so->sol_qlimit = backlog; 1008 1009 mtx_unlock(&so->so_snd_mtx); 1010 mtx_unlock(&so->so_rcv_mtx); 1011 sx_xunlock(&so->so_snd_sx); 1012 sx_xunlock(&so->so_rcv_sx); 1013 } 1014 1015 /* 1016 * Wakeup listeners/subsystems once we have a complete connection. 
1017 * Enters with lock, returns unlocked. 1018 */ 1019 void 1020 solisten_wakeup(struct socket *sol) 1021 { 1022 1023 if (sol->sol_upcall != NULL) 1024 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 1025 else { 1026 selwakeuppri(&sol->so_rdsel, PSOCK); 1027 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 1028 } 1029 SOLISTEN_UNLOCK(sol); 1030 wakeup_one(&sol->sol_comp); 1031 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) 1032 pgsigio(&sol->so_sigio, SIGIO, 0); 1033 } 1034 1035 /* 1036 * Return single connection off a listening socket queue. Main consumer of 1037 * the function is kern_accept4(). Some modules, that do their own accept 1038 * management also use the function. 1039 * 1040 * Listening socket must be locked on entry and is returned unlocked on 1041 * return. 1042 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 1043 */ 1044 int 1045 solisten_dequeue(struct socket *head, struct socket **ret, int flags) 1046 { 1047 struct socket *so; 1048 int error; 1049 1050 SOLISTEN_LOCK_ASSERT(head); 1051 1052 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && 1053 head->so_error == 0) { 1054 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, 1055 "accept", 0); 1056 if (error != 0) { 1057 SOLISTEN_UNLOCK(head); 1058 return (error); 1059 } 1060 } 1061 if (head->so_error) { 1062 error = head->so_error; 1063 head->so_error = 0; 1064 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) 1065 error = EWOULDBLOCK; 1066 else 1067 error = 0; 1068 if (error) { 1069 SOLISTEN_UNLOCK(head); 1070 return (error); 1071 } 1072 so = TAILQ_FIRST(&head->sol_comp); 1073 SOCK_LOCK(so); 1074 KASSERT(so->so_qstate == SQ_COMP, 1075 ("%s: so %p not SQ_COMP", __func__, so)); 1076 soref(so); 1077 head->sol_qlen--; 1078 so->so_qstate = SQ_NONE; 1079 so->so_listen = NULL; 1080 TAILQ_REMOVE(&head->sol_comp, so, so_list); 1081 if (flags & ACCEPT4_INHERIT) 1082 so->so_state |= (head->so_state & SS_NBIO); 1083 else 1084 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 1085 SOCK_UNLOCK(so); 1086 sorele(head); 1087 1088 *ret = so; 1089 return (0); 1090 } 1091 1092 /* 1093 * Evaluate the reference count and named references on a socket; if no 1094 * references remain, free it. This should be called whenever a reference is 1095 * released, such as in sorele(), but also when named reference flags are 1096 * cleared in socket or protocol code. 1097 * 1098 * sofree() will free the socket if: 1099 * 1100 * - There are no outstanding file descriptor references or related consumers 1101 * (so_count == 0). 1102 * 1103 * - The socket has been closed by user space, if ever open (SS_NOFDREF). 1104 * 1105 * - The protocol does not have an outstanding strong reference on the socket 1106 * (SS_PROTOREF). 1107 * 1108 * - The socket is not in a completed connection queue, so a process has been 1109 * notified that it is present. If it is removed, the user process may 1110 * block in accept() despite select() saying the socket was ready. 
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;

	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
		SOCK_UNLOCK(so);
		return;
	}

	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
		struct socket *sol;

		sol = so->so_listen;
		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));

		/*
		 * To solve the race between close of a listening socket and
		 * a socket on its incomplete queue, we need to lock both.
		 * The order is first listening socket, then regular.
		 * Since there is neither a file descriptor reference nor a
		 * protocol reference left, this function and the listening
		 * socket are the only pointers to so.  To preserve so and
		 * sol, we reference both and then relock.
		 * After relocking, the socket cannot have moved to so_comp,
		 * since it no longer has a PCB, but it may have been removed
		 * from so_incomp.  If that happens, we share responsibility
		 * for freeing the socket, but soclose() has already removed
		 * it from the queue.
		 */
		soref(sol);
		soref(so);
		SOCK_UNLOCK(so);
		SOLISTEN_LOCK(sol);
		SOCK_LOCK(so);
		if (so->so_qstate == SQ_INCOMP) {
			KASSERT(so->so_listen == sol,
			    ("%s: so %p migrated out of sol %p",
			    __func__, so, sol));
			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
			sol->sol_incqlen--;
			/* This is guaranteed not to be the last. */
			refcount_release(&sol->so_count);
			so->so_qstate = SQ_NONE;
			so->so_listen = NULL;
		} else
			KASSERT(so->so_listen == NULL,
			    ("%s: so %p not on (in)comp with so_listen",
			    __func__, so));
		sorele(sol);
		KASSERT(so->so_count == 1,
		    ("%s: so %p count %u", __func__, so, so->so_count));
		so->so_count = 0;
	}
	if (SOLISTENING(so))
		so->so_error = ECONNABORTED;
	SOCK_UNLOCK(so);

	if (so->so_dtor != NULL)
		so->so_dtor(so);

	VNET_SO_ASSERT(so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct calls to
	 * dom_dispose() and sbdestroy() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	if (!SOLISTENING(so)) {
		sbdestroy(&so->so_snd, so);
		sbdestroy(&so->so_rcv, so);
	}
	seldrain(&so->so_rdsel);
	seldrain(&so->so_wrsel);
	knlist_destroy(&so->so_rdsel.si_note);
	knlist_destroy(&so->so_wrsel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.
The actual socket structure will 1213 * not be freed until the ref count reaches zero. 1214 */ 1215 int 1216 soclose(struct socket *so) 1217 { 1218 struct accept_queue lqueue; 1219 int error = 0; 1220 1221 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); 1222 1223 CURVNET_SET(so->so_vnet); 1224 funsetown(&so->so_sigio); 1225 if (so->so_state & SS_ISCONNECTED) { 1226 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1227 error = sodisconnect(so); 1228 if (error) { 1229 if (error == ENOTCONN) 1230 error = 0; 1231 goto drop; 1232 } 1233 } 1234 1235 if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { 1236 if ((so->so_state & SS_ISDISCONNECTING) && 1237 (so->so_state & SS_NBIO)) 1238 goto drop; 1239 while (so->so_state & SS_ISCONNECTED) { 1240 error = tsleep(&so->so_timeo, 1241 PSOCK | PCATCH, "soclos", 1242 so->so_linger * hz); 1243 if (error) 1244 break; 1245 } 1246 } 1247 } 1248 1249 drop: 1250 if (so->so_proto->pr_usrreqs->pru_close != NULL) 1251 (*so->so_proto->pr_usrreqs->pru_close)(so); 1252 1253 SOCK_LOCK(so); 1254 if (SOLISTENING(so)) { 1255 struct socket *sp; 1256 1257 TAILQ_INIT(&lqueue); 1258 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 1259 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 1260 1261 so->sol_qlen = so->sol_incqlen = 0; 1262 1263 TAILQ_FOREACH(sp, &lqueue, so_list) { 1264 SOCK_LOCK(sp); 1265 sp->so_qstate = SQ_NONE; 1266 sp->so_listen = NULL; 1267 SOCK_UNLOCK(sp); 1268 /* Guaranteed not to be the last. */ 1269 refcount_release(&so->so_count); 1270 } 1271 } 1272 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 1273 so->so_state |= SS_NOFDREF; 1274 sorele(so); 1275 if (SOLISTENING(so)) { 1276 struct socket *sp, *tsp; 1277 1278 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { 1279 SOCK_LOCK(sp); 1280 if (sp->so_count == 0) { 1281 SOCK_UNLOCK(sp); 1282 soabort(sp); 1283 } else 1284 /* sp is now in sofree() */ 1285 SOCK_UNLOCK(sp); 1286 } 1287 } 1288 CURVNET_RESTORE(); 1289 return (error); 1290 } 1291 1292 /* 1293 * soabort() is used to abruptly tear down a connection, such as when a 1294 * resource limit is reached (listen queue depth exceeded), or if a listen 1295 * socket is closed while there are sockets waiting to be accepted. 1296 * 1297 * This interface is tricky, because it is called on an unreferenced socket, 1298 * and must be called only by a thread that has actually removed the socket 1299 * from the listen queue it was on, or races with other threads are risked. 1300 * 1301 * This interface will call into the protocol code, so must not be called 1302 * with any socket locks held. Protocols do call it while holding their own 1303 * recursible protocol mutexes, but this is something that should be subject 1304 * to review in the future. 1305 */ 1306 void 1307 soabort(struct socket *so) 1308 { 1309 1310 /* 1311 * In as much as is possible, assert that no references to this 1312 * socket are held. This is not quite the same as asserting that the 1313 * current thread is responsible for arranging for no references, but 1314 * is as close as we can get for now. 
1315 */ 1316 KASSERT(so->so_count == 0, ("soabort: so_count")); 1317 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 1318 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 1319 VNET_SO_ASSERT(so); 1320 1321 if (so->so_proto->pr_usrreqs->pru_abort != NULL) 1322 (*so->so_proto->pr_usrreqs->pru_abort)(so); 1323 SOCK_LOCK(so); 1324 sofree(so); 1325 } 1326 1327 int 1328 soaccept(struct socket *so, struct sockaddr **nam) 1329 { 1330 int error; 1331 1332 SOCK_LOCK(so); 1333 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 1334 so->so_state &= ~SS_NOFDREF; 1335 SOCK_UNLOCK(so); 1336 1337 CURVNET_SET(so->so_vnet); 1338 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 1339 CURVNET_RESTORE(); 1340 return (error); 1341 } 1342 1343 int 1344 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 1345 { 1346 1347 return (soconnectat(AT_FDCWD, so, nam, td)); 1348 } 1349 1350 int 1351 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 1352 { 1353 int error; 1354 1355 CURVNET_SET(so->so_vnet); 1356 /* 1357 * If protocol is connection-based, can only connect once. 1358 * Otherwise, if connected, try to disconnect first. This allows 1359 * user to disconnect by connecting to, e.g., a null address. 1360 */ 1361 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 1362 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 1363 (error = sodisconnect(so)))) { 1364 error = EISCONN; 1365 } else { 1366 /* 1367 * Prevent accumulated error from previous connection from 1368 * biting us. 1369 */ 1370 so->so_error = 0; 1371 if (fd == AT_FDCWD) { 1372 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, 1373 nam, td); 1374 } else { 1375 error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, 1376 so, nam, td); 1377 } 1378 } 1379 CURVNET_RESTORE(); 1380 1381 return (error); 1382 } 1383 1384 int 1385 soconnect2(struct socket *so1, struct socket *so2) 1386 { 1387 int error; 1388 1389 CURVNET_SET(so1->so_vnet); 1390 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); 1391 CURVNET_RESTORE(); 1392 return (error); 1393 } 1394 1395 int 1396 sodisconnect(struct socket *so) 1397 { 1398 int error; 1399 1400 if ((so->so_state & SS_ISCONNECTED) == 0) 1401 return (ENOTCONN); 1402 if (so->so_state & SS_ISDISCONNECTING) 1403 return (EALREADY); 1404 VNET_SO_ASSERT(so); 1405 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 1406 return (error); 1407 } 1408 1409 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 1410 1411 int 1412 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 1413 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1414 { 1415 long space; 1416 ssize_t resid; 1417 int clen = 0, error, dontroute; 1418 1419 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 1420 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 1421 ("sosend_dgram: !PR_ATOMIC")); 1422 1423 if (uio != NULL) 1424 resid = uio->uio_resid; 1425 else 1426 resid = top->m_pkthdr.len; 1427 /* 1428 * In theory resid should be unsigned. However, space must be 1429 * signed, as it might be less than 0 if we over-committed, and we 1430 * must use a signed comparison of space and resid. On the other 1431 * hand, a negative resid causes us to loop sending 0-length 1432 * segments to the protocol. 
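 *
 * For example, if the send buffer is over-committed and the space
 * computation below works out to space == -512 while the caller asks to
 * send resid == 100 bytes, the signed comparison "resid > space" is still
 * true and the request fails cleanly with EMSGSIZE instead of looping on
 * 0-length sends.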
1433 */ 1434 if (resid < 0) { 1435 error = EINVAL; 1436 goto out; 1437 } 1438 1439 dontroute = 1440 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 1441 if (td != NULL) 1442 td->td_ru.ru_msgsnd++; 1443 if (control != NULL) 1444 clen = control->m_len; 1445 1446 SOCKBUF_LOCK(&so->so_snd); 1447 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1448 SOCKBUF_UNLOCK(&so->so_snd); 1449 error = EPIPE; 1450 goto out; 1451 } 1452 if (so->so_error) { 1453 error = so->so_error; 1454 so->so_error = 0; 1455 SOCKBUF_UNLOCK(&so->so_snd); 1456 goto out; 1457 } 1458 if ((so->so_state & SS_ISCONNECTED) == 0) { 1459 /* 1460 * `sendto' and `sendmsg' is allowed on a connection-based 1461 * socket if it supports implied connect. Return ENOTCONN if 1462 * not connected and no address is supplied. 1463 */ 1464 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1465 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1466 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1467 !(resid == 0 && clen != 0)) { 1468 SOCKBUF_UNLOCK(&so->so_snd); 1469 error = ENOTCONN; 1470 goto out; 1471 } 1472 } else if (addr == NULL) { 1473 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1474 error = ENOTCONN; 1475 else 1476 error = EDESTADDRREQ; 1477 SOCKBUF_UNLOCK(&so->so_snd); 1478 goto out; 1479 } 1480 } 1481 1482 /* 1483 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1484 * problem and need fixing. 1485 */ 1486 space = sbspace(&so->so_snd); 1487 if (flags & MSG_OOB) 1488 space += 1024; 1489 space -= clen; 1490 SOCKBUF_UNLOCK(&so->so_snd); 1491 if (resid > space) { 1492 error = EMSGSIZE; 1493 goto out; 1494 } 1495 if (uio == NULL) { 1496 resid = 0; 1497 if (flags & MSG_EOR) 1498 top->m_flags |= M_EOR; 1499 } else { 1500 /* 1501 * Copy the data from userland into a mbuf chain. 1502 * If no data is to be copied in, a single empty mbuf 1503 * is returned. 1504 */ 1505 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 1506 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); 1507 if (top == NULL) { 1508 error = EFAULT; /* only possible error */ 1509 goto out; 1510 } 1511 space -= resid - uio->uio_resid; 1512 resid = uio->uio_resid; 1513 } 1514 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1515 /* 1516 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1517 * than with. 1518 */ 1519 if (dontroute) { 1520 SOCK_LOCK(so); 1521 so->so_options |= SO_DONTROUTE; 1522 SOCK_UNLOCK(so); 1523 } 1524 /* 1525 * XXX all the SBS_CANTSENDMORE checks previously done could be out 1526 * of date. We could have received a reset packet in an interrupt or 1527 * maybe we slept while doing page faults in uiomove() etc. We could 1528 * probably recheck again inside the locking protection here, but 1529 * there are probably other places that this also happens. We must 1530 * rethink this. 1531 */ 1532 VNET_SO_ASSERT(so); 1533 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1534 (flags & MSG_OOB) ? PRUS_OOB : 1535 /* 1536 * If the user set MSG_EOF, the protocol understands this flag and 1537 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 1538 */ 1539 ((flags & MSG_EOF) && 1540 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1541 (resid <= 0)) ? 1542 PRUS_EOF : 1543 /* If there is more to send set PRUS_MORETOCOME */ 1544 (flags & MSG_MORETOCOME) || 1545 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 1546 top, addr, control, td); 1547 if (dontroute) { 1548 SOCK_LOCK(so); 1549 so->so_options &= ~SO_DONTROUTE; 1550 SOCK_UNLOCK(so); 1551 } 1552 clen = 0; 1553 control = NULL; 1554 top = NULL; 1555 out: 1556 if (top != NULL) 1557 m_freem(top); 1558 if (control != NULL) 1559 m_freem(control); 1560 return (error); 1561 } 1562 1563 /* 1564 * Send on a socket. If send must go all at once and message is larger than 1565 * send buffering, then hard error. Lock against other senders. If must go 1566 * all at once and not enough room now, then inform user that this would 1567 * block and do nothing. Otherwise, if nonblocking, send as much as 1568 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1569 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1570 * in mbuf chain must be small enough to send all at once. 1571 * 1572 * Returns nonzero on error, timeout or signal; callers must check for short 1573 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1574 * on return. 1575 */ 1576 int 1577 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 1578 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1579 { 1580 long space; 1581 ssize_t resid; 1582 int clen = 0, error, dontroute; 1583 int atomic = sosendallatonce(so) || top; 1584 int pru_flag; 1585 #ifdef KERN_TLS 1586 struct ktls_session *tls; 1587 int tls_enq_cnt, tls_pruflag; 1588 uint8_t tls_rtype; 1589 1590 tls = NULL; 1591 tls_rtype = TLS_RLTYPE_APP; 1592 #endif 1593 if (uio != NULL) 1594 resid = uio->uio_resid; 1595 else if ((top->m_flags & M_PKTHDR) != 0) 1596 resid = top->m_pkthdr.len; 1597 else 1598 resid = m_length(top, NULL); 1599 /* 1600 * In theory resid should be unsigned. However, space must be 1601 * signed, as it might be less than 0 if we over-committed, and we 1602 * must use a signed comparison of space and resid. On the other 1603 * hand, a negative resid causes us to loop sending 0-length 1604 * segments to the protocol. 1605 * 1606 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1607 * type sockets since that's an error. 
1608 */ 1609 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1610 error = EINVAL; 1611 goto out; 1612 } 1613 1614 dontroute = 1615 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 1616 (so->so_proto->pr_flags & PR_ATOMIC); 1617 if (td != NULL) 1618 td->td_ru.ru_msgsnd++; 1619 if (control != NULL) 1620 clen = control->m_len; 1621 1622 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 1623 if (error) 1624 goto out; 1625 1626 #ifdef KERN_TLS 1627 tls_pruflag = 0; 1628 tls = ktls_hold(so->so_snd.sb_tls_info); 1629 if (tls != NULL) { 1630 if (tls->mode == TCP_TLS_MODE_SW) 1631 tls_pruflag = PRUS_NOTREADY; 1632 1633 if (control != NULL) { 1634 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1635 1636 if (clen >= sizeof(*cm) && 1637 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 1638 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 1639 clen = 0; 1640 m_freem(control); 1641 control = NULL; 1642 atomic = 1; 1643 } 1644 } 1645 } 1646 #endif 1647 1648 restart: 1649 do { 1650 SOCKBUF_LOCK(&so->so_snd); 1651 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1652 SOCKBUF_UNLOCK(&so->so_snd); 1653 error = EPIPE; 1654 goto release; 1655 } 1656 if (so->so_error) { 1657 error = so->so_error; 1658 so->so_error = 0; 1659 SOCKBUF_UNLOCK(&so->so_snd); 1660 goto release; 1661 } 1662 if ((so->so_state & SS_ISCONNECTED) == 0) { 1663 /* 1664 * `sendto' and `sendmsg' is allowed on a connection- 1665 * based socket if it supports implied connect. 1666 * Return ENOTCONN if not connected and no address is 1667 * supplied. 1668 */ 1669 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1670 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1671 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1672 !(resid == 0 && clen != 0)) { 1673 SOCKBUF_UNLOCK(&so->so_snd); 1674 error = ENOTCONN; 1675 goto release; 1676 } 1677 } else if (addr == NULL) { 1678 SOCKBUF_UNLOCK(&so->so_snd); 1679 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1680 error = ENOTCONN; 1681 else 1682 error = EDESTADDRREQ; 1683 goto release; 1684 } 1685 } 1686 space = sbspace(&so->so_snd); 1687 if (flags & MSG_OOB) 1688 space += 1024; 1689 if ((atomic && resid > so->so_snd.sb_hiwat) || 1690 clen > so->so_snd.sb_hiwat) { 1691 SOCKBUF_UNLOCK(&so->so_snd); 1692 error = EMSGSIZE; 1693 goto release; 1694 } 1695 if (space < resid + clen && 1696 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 1697 if ((so->so_state & SS_NBIO) || 1698 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 1699 SOCKBUF_UNLOCK(&so->so_snd); 1700 error = EWOULDBLOCK; 1701 goto release; 1702 } 1703 error = sbwait(&so->so_snd); 1704 SOCKBUF_UNLOCK(&so->so_snd); 1705 if (error) 1706 goto release; 1707 goto restart; 1708 } 1709 SOCKBUF_UNLOCK(&so->so_snd); 1710 space -= clen; 1711 do { 1712 if (uio == NULL) { 1713 resid = 0; 1714 if (flags & MSG_EOR) 1715 top->m_flags |= M_EOR; 1716 #ifdef KERN_TLS 1717 if (tls != NULL) { 1718 ktls_frame(top, tls, &tls_enq_cnt, 1719 tls_rtype); 1720 tls_rtype = TLS_RLTYPE_APP; 1721 } 1722 #endif 1723 } else { 1724 /* 1725 * Copy the data from userland into a mbuf 1726 * chain. If resid is 0, which can happen 1727 * only if we have control to send, then 1728 * a single empty mbuf is returned. This 1729 * is a workaround to prevent protocol send 1730 * methods to panic. 1731 */ 1732 #ifdef KERN_TLS 1733 if (tls != NULL) { 1734 top = m_uiotombuf(uio, M_WAITOK, space, 1735 tls->params.max_frame_len, 1736 M_EXTPG | 1737 ((flags & MSG_EOR) ? 
M_EOR : 0)); 1738 if (top != NULL) { 1739 ktls_frame(top, tls, 1740 &tls_enq_cnt, tls_rtype); 1741 } 1742 tls_rtype = TLS_RLTYPE_APP; 1743 } else 1744 #endif 1745 top = m_uiotombuf(uio, M_WAITOK, space, 1746 (atomic ? max_hdr : 0), 1747 (atomic ? M_PKTHDR : 0) | 1748 ((flags & MSG_EOR) ? M_EOR : 0)); 1749 if (top == NULL) { 1750 error = EFAULT; /* only possible error */ 1751 goto release; 1752 } 1753 space -= resid - uio->uio_resid; 1754 resid = uio->uio_resid; 1755 } 1756 if (dontroute) { 1757 SOCK_LOCK(so); 1758 so->so_options |= SO_DONTROUTE; 1759 SOCK_UNLOCK(so); 1760 } 1761 /* 1762 * XXX all the SBS_CANTSENDMORE checks previously 1763 * done could be out of date. We could have received 1764 * a reset packet in an interrupt or maybe we slept 1765 * while doing page faults in uiomove() etc. We 1766 * could probably recheck again inside the locking 1767 * protection here, but there are probably other 1768 * places that this also happens. We must rethink 1769 * this. 1770 */ 1771 VNET_SO_ASSERT(so); 1772 1773 pru_flag = (flags & MSG_OOB) ? PRUS_OOB : 1774 /* 1775 * If the user set MSG_EOF, the protocol understands 1776 * this flag and nothing left to send then use 1777 * PRU_SEND_EOF instead of PRU_SEND. 1778 */ 1779 ((flags & MSG_EOF) && 1780 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1781 (resid <= 0)) ? 1782 PRUS_EOF : 1783 /* If there is more to send set PRUS_MORETOCOME. */ 1784 (flags & MSG_MORETOCOME) || 1785 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; 1786 1787 #ifdef KERN_TLS 1788 pru_flag |= tls_pruflag; 1789 #endif 1790 1791 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1792 pru_flag, top, addr, control, td); 1793 1794 if (dontroute) { 1795 SOCK_LOCK(so); 1796 so->so_options &= ~SO_DONTROUTE; 1797 SOCK_UNLOCK(so); 1798 } 1799 1800 #ifdef KERN_TLS 1801 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 1802 if (error != 0) { 1803 m_freem(top); 1804 top = NULL; 1805 } else { 1806 soref(so); 1807 ktls_enqueue(top, so, tls_enq_cnt); 1808 } 1809 } 1810 #endif 1811 clen = 0; 1812 control = NULL; 1813 top = NULL; 1814 if (error) 1815 goto release; 1816 } while (resid && space > 0); 1817 } while (resid); 1818 1819 release: 1820 SOCK_IO_SEND_UNLOCK(so); 1821 out: 1822 #ifdef KERN_TLS 1823 if (tls != NULL) 1824 ktls_free(tls); 1825 #endif 1826 if (top != NULL) 1827 m_freem(top); 1828 if (control != NULL) 1829 m_freem(control); 1830 return (error); 1831 } 1832 1833 int 1834 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1835 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1836 { 1837 int error; 1838 1839 CURVNET_SET(so->so_vnet); 1840 error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, 1841 top, control, flags, td); 1842 CURVNET_RESTORE(); 1843 return (error); 1844 } 1845 1846 /* 1847 * The part of soreceive() that implements reading non-inline out-of-band 1848 * data from a socket. For more complete comments, see soreceive(), from 1849 * which this code originated. 1850 * 1851 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1852 * unable to return an mbuf chain to the caller. 
1853 */ 1854 static int 1855 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1856 { 1857 struct protosw *pr = so->so_proto; 1858 struct mbuf *m; 1859 int error; 1860 1861 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1862 VNET_SO_ASSERT(so); 1863 1864 m = m_get(M_WAITOK, MT_DATA); 1865 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 1866 if (error) 1867 goto bad; 1868 do { 1869 error = uiomove(mtod(m, void *), 1870 (int) min(uio->uio_resid, m->m_len), uio); 1871 m = m_free(m); 1872 } while (uio->uio_resid && error == 0 && m); 1873 bad: 1874 if (m != NULL) 1875 m_freem(m); 1876 return (error); 1877 } 1878 1879 /* 1880 * Following replacement or removal of the first mbuf on the first mbuf chain 1881 * of a socket buffer, push necessary state changes back into the socket 1882 * buffer so that other consumers see the values consistently. 'nextrecord' 1883 * is the callers locally stored value of the original value of 1884 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1885 * NOTE: 'nextrecord' may be NULL. 1886 */ 1887 static __inline void 1888 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 1889 { 1890 1891 SOCKBUF_LOCK_ASSERT(sb); 1892 /* 1893 * First, update for the new value of nextrecord. If necessary, make 1894 * it the first record. 1895 */ 1896 if (sb->sb_mb != NULL) 1897 sb->sb_mb->m_nextpkt = nextrecord; 1898 else 1899 sb->sb_mb = nextrecord; 1900 1901 /* 1902 * Now update any dependent socket buffer fields to reflect the new 1903 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 1904 * addition of a second clause that takes care of the case where 1905 * sb_mb has been updated, but remains the last record. 1906 */ 1907 if (sb->sb_mb == NULL) { 1908 sb->sb_mbtail = NULL; 1909 sb->sb_lastrecord = NULL; 1910 } else if (sb->sb_mb->m_nextpkt == NULL) 1911 sb->sb_lastrecord = sb->sb_mb; 1912 } 1913 1914 /* 1915 * Implement receive operations on a socket. We depend on the way that 1916 * records are added to the sockbuf by sbappend. In particular, each record 1917 * (mbufs linked through m_next) must begin with an address if the protocol 1918 * so specifies, followed by an optional mbuf or mbufs containing ancillary 1919 * data, and then zero or more mbufs of data. In order to allow parallelism 1920 * between network receive and copying to user space, as well as avoid 1921 * sleeping with a mutex held, we release the socket buffer mutex during the 1922 * user space copy. Although the sockbuf is locked, new data may still be 1923 * appended, and thus we must maintain consistency of the sockbuf during that 1924 * time. 1925 * 1926 * The caller may receive the data as a single mbuf chain by supplying an 1927 * mbuf **mp0 for use in returning the chain. The uio is then used only for 1928 * the count in uio_resid. 
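 *
 * Illustrative sketch only (not part of the original source): a kernel
 * consumer that wants the data back as an mbuf chain rather than copied
 * through the uio might call, with 'auio' initialized so that uio_resid
 * holds the requested byte count:
 *
 *	struct mbuf *chain = NULL;
 *	error = soreceive(so, NULL, &auio, &chain, NULL, NULL);
 *
 * in which case only auio.uio_resid is consulted and the data is returned
 * in 'chain'.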
1929 */ 1930 int 1931 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 1932 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1933 { 1934 struct mbuf *m, **mp; 1935 int flags, error, offset; 1936 ssize_t len; 1937 struct protosw *pr = so->so_proto; 1938 struct mbuf *nextrecord; 1939 int moff, type = 0; 1940 ssize_t orig_resid = uio->uio_resid; 1941 1942 mp = mp0; 1943 if (psa != NULL) 1944 *psa = NULL; 1945 if (controlp != NULL) 1946 *controlp = NULL; 1947 if (flagsp != NULL) 1948 flags = *flagsp &~ MSG_EOR; 1949 else 1950 flags = 0; 1951 if (flags & MSG_OOB) 1952 return (soreceive_rcvoob(so, uio, flags)); 1953 if (mp != NULL) 1954 *mp = NULL; 1955 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1956 && uio->uio_resid) { 1957 VNET_SO_ASSERT(so); 1958 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 1959 } 1960 1961 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 1962 if (error) 1963 return (error); 1964 1965 restart: 1966 SOCKBUF_LOCK(&so->so_rcv); 1967 m = so->so_rcv.sb_mb; 1968 /* 1969 * If we have less data than requested, block awaiting more (subject 1970 * to any timeout) if: 1971 * 1. the current count is less than the low water mark, or 1972 * 2. MSG_DONTWAIT is not set 1973 */ 1974 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1975 sbavail(&so->so_rcv) < uio->uio_resid) && 1976 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 1977 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1978 KASSERT(m != NULL || !sbavail(&so->so_rcv), 1979 ("receive: m == %p sbavail == %u", 1980 m, sbavail(&so->so_rcv))); 1981 if (so->so_error || so->so_rerror) { 1982 if (m != NULL) 1983 goto dontblock; 1984 if (so->so_error) 1985 error = so->so_error; 1986 else 1987 error = so->so_rerror; 1988 if ((flags & MSG_PEEK) == 0) { 1989 if (so->so_error) 1990 so->so_error = 0; 1991 else 1992 so->so_rerror = 0; 1993 } 1994 SOCKBUF_UNLOCK(&so->so_rcv); 1995 goto release; 1996 } 1997 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1998 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1999 if (m != NULL) 2000 goto dontblock; 2001 #ifdef KERN_TLS 2002 else if (so->so_rcv.sb_tlsdcc == 0 && 2003 so->so_rcv.sb_tlscc == 0) { 2004 #else 2005 else { 2006 #endif 2007 SOCKBUF_UNLOCK(&so->so_rcv); 2008 goto release; 2009 } 2010 } 2011 for (; m != NULL; m = m->m_next) 2012 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 2013 m = so->so_rcv.sb_mb; 2014 goto dontblock; 2015 } 2016 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | 2017 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && 2018 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { 2019 SOCKBUF_UNLOCK(&so->so_rcv); 2020 error = ENOTCONN; 2021 goto release; 2022 } 2023 if (uio->uio_resid == 0) { 2024 SOCKBUF_UNLOCK(&so->so_rcv); 2025 goto release; 2026 } 2027 if ((so->so_state & SS_NBIO) || 2028 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2029 SOCKBUF_UNLOCK(&so->so_rcv); 2030 error = EWOULDBLOCK; 2031 goto release; 2032 } 2033 SBLASTRECORDCHK(&so->so_rcv); 2034 SBLASTMBUFCHK(&so->so_rcv); 2035 error = sbwait(&so->so_rcv); 2036 SOCKBUF_UNLOCK(&so->so_rcv); 2037 if (error) 2038 goto release; 2039 goto restart; 2040 } 2041 dontblock: 2042 /* 2043 * From this point onward, we maintain 'nextrecord' as a cache of the 2044 * pointer to the next record in the socket buffer. We must keep the 2045 * various socket buffer pointers and local stack versions of the 2046 * pointers in sync, pushing out modifications before dropping the 2047 * socket buffer mutex, and re-reading them when picking it up. 
2048 * 2049 * Otherwise, we will race with the network stack appending new data 2050 * or records onto the socket buffer by using inconsistent/stale 2051 * versions of the field, possibly resulting in socket buffer 2052 * corruption. 2053 * 2054 * By holding the high-level sblock(), we prevent simultaneous 2055 * readers from pulling off the front of the socket buffer. 2056 */ 2057 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2058 if (uio->uio_td) 2059 uio->uio_td->td_ru.ru_msgrcv++; 2060 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 2061 SBLASTRECORDCHK(&so->so_rcv); 2062 SBLASTMBUFCHK(&so->so_rcv); 2063 nextrecord = m->m_nextpkt; 2064 if (pr->pr_flags & PR_ADDR) { 2065 KASSERT(m->m_type == MT_SONAME, 2066 ("m->m_type == %d", m->m_type)); 2067 orig_resid = 0; 2068 if (psa != NULL) 2069 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2070 M_NOWAIT); 2071 if (flags & MSG_PEEK) { 2072 m = m->m_next; 2073 } else { 2074 sbfree(&so->so_rcv, m); 2075 so->so_rcv.sb_mb = m_free(m); 2076 m = so->so_rcv.sb_mb; 2077 sockbuf_pushsync(&so->so_rcv, nextrecord); 2078 } 2079 } 2080 2081 /* 2082 * Process one or more MT_CONTROL mbufs present before any data mbufs 2083 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 2084 * just copy the data; if !MSG_PEEK, we call into the protocol to 2085 * perform externalization (or freeing if controlp == NULL). 2086 */ 2087 if (m != NULL && m->m_type == MT_CONTROL) { 2088 struct mbuf *cm = NULL, *cmn; 2089 struct mbuf **cme = &cm; 2090 #ifdef KERN_TLS 2091 struct cmsghdr *cmsg; 2092 struct tls_get_record tgr; 2093 2094 /* 2095 * For MSG_TLSAPPDATA, check for a non-application data 2096 * record. If found, return ENXIO without removing 2097 * it from the receive queue. This allows a subsequent 2098 * call without MSG_TLSAPPDATA to receive it. 2099 * Note that, for TLS, there should only be a single 2100 * control mbuf with the TLS_GET_RECORD message in it. 2101 */ 2102 if (flags & MSG_TLSAPPDATA) { 2103 cmsg = mtod(m, struct cmsghdr *); 2104 if (cmsg->cmsg_type == TLS_GET_RECORD && 2105 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { 2106 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); 2107 /* This will need to change for TLS 1.3. 
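							 * In TLS 1.3 the real content
							 * type is hidden inside the
							 * encrypted payload and the
							 * outer record type is always
							 * application data, so this
							 * check on tgr.tls_type alone
							 * would presumably no longer
							 * identify non-application
							 * records.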
*/ 2108 if (tgr.tls_type != TLS_RLTYPE_APP) { 2109 SOCKBUF_UNLOCK(&so->so_rcv); 2110 error = ENXIO; 2111 goto release; 2112 } 2113 } 2114 } 2115 #endif 2116 2117 do { 2118 if (flags & MSG_PEEK) { 2119 if (controlp != NULL) { 2120 *controlp = m_copym(m, 0, m->m_len, 2121 M_NOWAIT); 2122 controlp = &(*controlp)->m_next; 2123 } 2124 m = m->m_next; 2125 } else { 2126 sbfree(&so->so_rcv, m); 2127 so->so_rcv.sb_mb = m->m_next; 2128 m->m_next = NULL; 2129 *cme = m; 2130 cme = &(*cme)->m_next; 2131 m = so->so_rcv.sb_mb; 2132 } 2133 } while (m != NULL && m->m_type == MT_CONTROL); 2134 if ((flags & MSG_PEEK) == 0) 2135 sockbuf_pushsync(&so->so_rcv, nextrecord); 2136 while (cm != NULL) { 2137 cmn = cm->m_next; 2138 cm->m_next = NULL; 2139 if (pr->pr_domain->dom_externalize != NULL) { 2140 SOCKBUF_UNLOCK(&so->so_rcv); 2141 VNET_SO_ASSERT(so); 2142 error = (*pr->pr_domain->dom_externalize) 2143 (cm, controlp, flags); 2144 SOCKBUF_LOCK(&so->so_rcv); 2145 } else if (controlp != NULL) 2146 *controlp = cm; 2147 else 2148 m_freem(cm); 2149 if (controlp != NULL) { 2150 orig_resid = 0; 2151 while (*controlp != NULL) 2152 controlp = &(*controlp)->m_next; 2153 } 2154 cm = cmn; 2155 } 2156 if (m != NULL) 2157 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 2158 else 2159 nextrecord = so->so_rcv.sb_mb; 2160 orig_resid = 0; 2161 } 2162 if (m != NULL) { 2163 if ((flags & MSG_PEEK) == 0) { 2164 KASSERT(m->m_nextpkt == nextrecord, 2165 ("soreceive: post-control, nextrecord !sync")); 2166 if (nextrecord == NULL) { 2167 KASSERT(so->so_rcv.sb_mb == m, 2168 ("soreceive: post-control, sb_mb!=m")); 2169 KASSERT(so->so_rcv.sb_lastrecord == m, 2170 ("soreceive: post-control, lastrecord!=m")); 2171 } 2172 } 2173 type = m->m_type; 2174 if (type == MT_OOBDATA) 2175 flags |= MSG_OOB; 2176 } else { 2177 if ((flags & MSG_PEEK) == 0) { 2178 KASSERT(so->so_rcv.sb_mb == nextrecord, 2179 ("soreceive: sb_mb != nextrecord")); 2180 if (so->so_rcv.sb_mb == NULL) { 2181 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2182 ("soreceive: sb_lastercord != NULL")); 2183 } 2184 } 2185 } 2186 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2187 SBLASTRECORDCHK(&so->so_rcv); 2188 SBLASTMBUFCHK(&so->so_rcv); 2189 2190 /* 2191 * Now continue to read any data mbufs off of the head of the socket 2192 * buffer until the read request is satisfied. Note that 'type' is 2193 * used to store the type of any mbuf reads that have happened so far 2194 * such that soreceive() can stop reading if the type changes, which 2195 * causes soreceive() to return only one of regular data and inline 2196 * out-of-band data in a single socket receive operation. 2197 */ 2198 moff = 0; 2199 offset = 0; 2200 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 2201 && error == 0) { 2202 /* 2203 * If the type of mbuf has changed since the last mbuf 2204 * examined ('type'), end the receive operation. 2205 */ 2206 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2207 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 2208 if (type != m->m_type) 2209 break; 2210 } else if (type == MT_OOBDATA) 2211 break; 2212 else 2213 KASSERT(m->m_type == MT_DATA, 2214 ("m->m_type == %d", m->m_type)); 2215 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 2216 len = uio->uio_resid; 2217 if (so->so_oobmark && len > so->so_oobmark - offset) 2218 len = so->so_oobmark - offset; 2219 if (len > m->m_len - moff) 2220 len = m->m_len - moff; 2221 /* 2222 * If mp is set, just pass back the mbufs. Otherwise copy 2223 * them out via the uio, then free. 
Sockbuf must be 2224 * consistent here (points to current mbuf, it points to next 2225 * record) when we drop priority; we must note any additions 2226 * to the sockbuf when we block interrupts again. 2227 */ 2228 if (mp == NULL) { 2229 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2230 SBLASTRECORDCHK(&so->so_rcv); 2231 SBLASTMBUFCHK(&so->so_rcv); 2232 SOCKBUF_UNLOCK(&so->so_rcv); 2233 if ((m->m_flags & M_EXTPG) != 0) 2234 error = m_unmapped_uiomove(m, moff, uio, 2235 (int)len); 2236 else 2237 error = uiomove(mtod(m, char *) + moff, 2238 (int)len, uio); 2239 SOCKBUF_LOCK(&so->so_rcv); 2240 if (error) { 2241 /* 2242 * The MT_SONAME mbuf has already been removed 2243 * from the record, so it is necessary to 2244 * remove the data mbufs, if any, to preserve 2245 * the invariant in the case of PR_ADDR that 2246 * requires MT_SONAME mbufs at the head of 2247 * each record. 2248 */ 2249 if (pr->pr_flags & PR_ATOMIC && 2250 ((flags & MSG_PEEK) == 0)) 2251 (void)sbdroprecord_locked(&so->so_rcv); 2252 SOCKBUF_UNLOCK(&so->so_rcv); 2253 goto release; 2254 } 2255 } else 2256 uio->uio_resid -= len; 2257 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2258 if (len == m->m_len - moff) { 2259 if (m->m_flags & M_EOR) 2260 flags |= MSG_EOR; 2261 if (flags & MSG_PEEK) { 2262 m = m->m_next; 2263 moff = 0; 2264 } else { 2265 nextrecord = m->m_nextpkt; 2266 sbfree(&so->so_rcv, m); 2267 if (mp != NULL) { 2268 m->m_nextpkt = NULL; 2269 *mp = m; 2270 mp = &m->m_next; 2271 so->so_rcv.sb_mb = m = m->m_next; 2272 *mp = NULL; 2273 } else { 2274 so->so_rcv.sb_mb = m_free(m); 2275 m = so->so_rcv.sb_mb; 2276 } 2277 sockbuf_pushsync(&so->so_rcv, nextrecord); 2278 SBLASTRECORDCHK(&so->so_rcv); 2279 SBLASTMBUFCHK(&so->so_rcv); 2280 } 2281 } else { 2282 if (flags & MSG_PEEK) 2283 moff += len; 2284 else { 2285 if (mp != NULL) { 2286 if (flags & MSG_DONTWAIT) { 2287 *mp = m_copym(m, 0, len, 2288 M_NOWAIT); 2289 if (*mp == NULL) { 2290 /* 2291 * m_copym() couldn't 2292 * allocate an mbuf. 2293 * Adjust uio_resid back 2294 * (it was adjusted 2295 * down by len bytes, 2296 * which we didn't end 2297 * up "copying" over). 2298 */ 2299 uio->uio_resid += len; 2300 break; 2301 } 2302 } else { 2303 SOCKBUF_UNLOCK(&so->so_rcv); 2304 *mp = m_copym(m, 0, len, 2305 M_WAITOK); 2306 SOCKBUF_LOCK(&so->so_rcv); 2307 } 2308 } 2309 sbcut_locked(&so->so_rcv, len); 2310 } 2311 } 2312 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2313 if (so->so_oobmark) { 2314 if ((flags & MSG_PEEK) == 0) { 2315 so->so_oobmark -= len; 2316 if (so->so_oobmark == 0) { 2317 so->so_rcv.sb_state |= SBS_RCVATMARK; 2318 break; 2319 } 2320 } else { 2321 offset += len; 2322 if (offset == so->so_oobmark) 2323 break; 2324 } 2325 } 2326 if (flags & MSG_EOR) 2327 break; 2328 /* 2329 * If the MSG_WAITALL flag is set (for non-atomic socket), we 2330 * must not quit until "uio->uio_resid == 0" or an error 2331 * termination. If a signal/timeout occurs, return with a 2332 * short count but without error. Keep sockbuf locked 2333 * against other readers. 2334 */ 2335 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 2336 !sosendallatonce(so) && nextrecord == NULL) { 2337 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2338 if (so->so_error || so->so_rerror || 2339 so->so_rcv.sb_state & SBS_CANTRCVMORE) 2340 break; 2341 /* 2342 * Notify the protocol that some data has been 2343 * drained before blocking. 
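			 * For PR_WANTRCVD protocols such as TCP, this
			 * pru_rcvd call is what allows the receive window to
			 * be updated before we sleep, rather than only after
			 * the whole request has been satisfied.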
2344 */ 2345 if (pr->pr_flags & PR_WANTRCVD) { 2346 SOCKBUF_UNLOCK(&so->so_rcv); 2347 VNET_SO_ASSERT(so); 2348 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 2349 SOCKBUF_LOCK(&so->so_rcv); 2350 } 2351 SBLASTRECORDCHK(&so->so_rcv); 2352 SBLASTMBUFCHK(&so->so_rcv); 2353 /* 2354 * We could receive some data while was notifying 2355 * the protocol. Skip blocking in this case. 2356 */ 2357 if (so->so_rcv.sb_mb == NULL) { 2358 error = sbwait(&so->so_rcv); 2359 if (error) { 2360 SOCKBUF_UNLOCK(&so->so_rcv); 2361 goto release; 2362 } 2363 } 2364 m = so->so_rcv.sb_mb; 2365 if (m != NULL) 2366 nextrecord = m->m_nextpkt; 2367 } 2368 } 2369 2370 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2371 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 2372 flags |= MSG_TRUNC; 2373 if ((flags & MSG_PEEK) == 0) 2374 (void) sbdroprecord_locked(&so->so_rcv); 2375 } 2376 if ((flags & MSG_PEEK) == 0) { 2377 if (m == NULL) { 2378 /* 2379 * First part is an inline SB_EMPTY_FIXUP(). Second 2380 * part makes sure sb_lastrecord is up-to-date if 2381 * there is still data in the socket buffer. 2382 */ 2383 so->so_rcv.sb_mb = nextrecord; 2384 if (so->so_rcv.sb_mb == NULL) { 2385 so->so_rcv.sb_mbtail = NULL; 2386 so->so_rcv.sb_lastrecord = NULL; 2387 } else if (nextrecord->m_nextpkt == NULL) 2388 so->so_rcv.sb_lastrecord = nextrecord; 2389 } 2390 SBLASTRECORDCHK(&so->so_rcv); 2391 SBLASTMBUFCHK(&so->so_rcv); 2392 /* 2393 * If soreceive() is being done from the socket callback, 2394 * then don't need to generate ACK to peer to update window, 2395 * since ACK will be generated on return to TCP. 2396 */ 2397 if (!(flags & MSG_SOCALLBCK) && 2398 (pr->pr_flags & PR_WANTRCVD)) { 2399 SOCKBUF_UNLOCK(&so->so_rcv); 2400 VNET_SO_ASSERT(so); 2401 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 2402 SOCKBUF_LOCK(&so->so_rcv); 2403 } 2404 } 2405 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2406 if (orig_resid == uio->uio_resid && orig_resid && 2407 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 2408 SOCKBUF_UNLOCK(&so->so_rcv); 2409 goto restart; 2410 } 2411 SOCKBUF_UNLOCK(&so->so_rcv); 2412 2413 if (flagsp != NULL) 2414 *flagsp |= flags; 2415 release: 2416 SOCK_IO_RECV_UNLOCK(so); 2417 return (error); 2418 } 2419 2420 /* 2421 * Optimized version of soreceive() for stream (TCP) sockets. 2422 */ 2423 int 2424 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 2425 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2426 { 2427 int len = 0, error = 0, flags, oresid; 2428 struct sockbuf *sb; 2429 struct mbuf *m, *n = NULL; 2430 2431 /* We only do stream sockets. */ 2432 if (so->so_type != SOCK_STREAM) 2433 return (EINVAL); 2434 if (psa != NULL) 2435 *psa = NULL; 2436 if (flagsp != NULL) 2437 flags = *flagsp &~ MSG_EOR; 2438 else 2439 flags = 0; 2440 if (controlp != NULL) 2441 *controlp = NULL; 2442 if (flags & MSG_OOB) 2443 return (soreceive_rcvoob(so, uio, flags)); 2444 if (mp0 != NULL) 2445 *mp0 = NULL; 2446 2447 sb = &so->so_rcv; 2448 2449 #ifdef KERN_TLS 2450 /* 2451 * KTLS store TLS records as records with a control message to 2452 * describe the framing. 2453 * 2454 * We check once here before acquiring locks to optimize the 2455 * common case. 2456 */ 2457 if (sb->sb_tls_info != NULL) 2458 return (soreceive_generic(so, psa, uio, mp0, controlp, 2459 flagsp)); 2460 #endif 2461 2462 /* Prevent other readers from entering the socket. 
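	 * SOCK_IO_RECV_LOCK() takes the sleepable per-socket receive I/O
	 * lock, so concurrent receive calls are serialized for the duration
	 * of this operation.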
*/ 2463 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 2464 if (error) 2465 return (error); 2466 SOCKBUF_LOCK(sb); 2467 2468 #ifdef KERN_TLS 2469 if (sb->sb_tls_info != NULL) { 2470 SOCKBUF_UNLOCK(sb); 2471 SOCK_IO_RECV_UNLOCK(so); 2472 return (soreceive_generic(so, psa, uio, mp0, controlp, 2473 flagsp)); 2474 } 2475 #endif 2476 2477 /* Easy one, no space to copyout anything. */ 2478 if (uio->uio_resid == 0) { 2479 error = EINVAL; 2480 goto out; 2481 } 2482 oresid = uio->uio_resid; 2483 2484 /* We will never ever get anything unless we are or were connected. */ 2485 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 2486 error = ENOTCONN; 2487 goto out; 2488 } 2489 2490 restart: 2491 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2492 2493 /* Abort if socket has reported problems. */ 2494 if (so->so_error) { 2495 if (sbavail(sb) > 0) 2496 goto deliver; 2497 if (oresid > uio->uio_resid) 2498 goto out; 2499 error = so->so_error; 2500 if (!(flags & MSG_PEEK)) 2501 so->so_error = 0; 2502 goto out; 2503 } 2504 2505 /* Door is closed. Deliver what is left, if any. */ 2506 if (sb->sb_state & SBS_CANTRCVMORE) { 2507 if (sbavail(sb) > 0) 2508 goto deliver; 2509 else 2510 goto out; 2511 } 2512 2513 /* Socket buffer is empty and we shall not block. */ 2514 if (sbavail(sb) == 0 && 2515 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 2516 error = EAGAIN; 2517 goto out; 2518 } 2519 2520 /* Socket buffer got some data that we shall deliver now. */ 2521 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 2522 ((so->so_state & SS_NBIO) || 2523 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 2524 sbavail(sb) >= sb->sb_lowat || 2525 sbavail(sb) >= uio->uio_resid || 2526 sbavail(sb) >= sb->sb_hiwat) ) { 2527 goto deliver; 2528 } 2529 2530 /* On MSG_WAITALL we must wait until all data or error arrives. */ 2531 if ((flags & MSG_WAITALL) && 2532 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 2533 goto deliver; 2534 2535 /* 2536 * Wait and block until (more) data comes in. 2537 * NB: Drops the sockbuf lock during wait. 2538 */ 2539 error = sbwait(sb); 2540 if (error) 2541 goto out; 2542 goto restart; 2543 2544 deliver: 2545 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2546 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 2547 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 2548 2549 /* Statistics. */ 2550 if (uio->uio_td) 2551 uio->uio_td->td_ru.ru_msgrcv++; 2552 2553 /* Fill uio until full or current end of socket buffer is reached. */ 2554 len = min(uio->uio_resid, sbavail(sb)); 2555 if (mp0 != NULL) { 2556 /* Dequeue as many mbufs as possible. */ 2557 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 2558 if (*mp0 == NULL) 2559 *mp0 = sb->sb_mb; 2560 else 2561 m_cat(*mp0, sb->sb_mb); 2562 for (m = sb->sb_mb; 2563 m != NULL && m->m_len <= len; 2564 m = m->m_next) { 2565 KASSERT(!(m->m_flags & M_NOTAVAIL), 2566 ("%s: m %p not available", __func__, m)); 2567 len -= m->m_len; 2568 uio->uio_resid -= m->m_len; 2569 sbfree(sb, m); 2570 n = m; 2571 } 2572 n->m_next = NULL; 2573 sb->sb_mb = m; 2574 sb->sb_lastrecord = sb->sb_mb; 2575 if (sb->sb_mb == NULL) 2576 SB_EMPTY_FIXUP(sb); 2577 } 2578 /* Copy the remainder. */ 2579 if (len > 0) { 2580 KASSERT(sb->sb_mb != NULL, 2581 ("%s: len > 0 && sb->sb_mb empty", __func__)); 2582 2583 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 2584 if (m == NULL) 2585 len = 0; /* Don't flush data from sockbuf. 
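						 * m_copym() failed, so leave
						 * the bytes in place; the
						 * ENOBUFS check below reports
						 * the failure if nothing at
						 * all could be returned.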
*/ 2586 else 2587 uio->uio_resid -= len; 2588 if (*mp0 != NULL) 2589 m_cat(*mp0, m); 2590 else 2591 *mp0 = m; 2592 if (*mp0 == NULL) { 2593 error = ENOBUFS; 2594 goto out; 2595 } 2596 } 2597 } else { 2598 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2599 SOCKBUF_UNLOCK(sb); 2600 error = m_mbuftouio(uio, sb->sb_mb, len); 2601 SOCKBUF_LOCK(sb); 2602 if (error) 2603 goto out; 2604 } 2605 SBLASTRECORDCHK(sb); 2606 SBLASTMBUFCHK(sb); 2607 2608 /* 2609 * Remove the delivered data from the socket buffer unless we 2610 * were only peeking. 2611 */ 2612 if (!(flags & MSG_PEEK)) { 2613 if (len > 0) 2614 sbdrop_locked(sb, len); 2615 2616 /* Notify protocol that we drained some data. */ 2617 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2618 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2619 !(flags & MSG_SOCALLBCK))) { 2620 SOCKBUF_UNLOCK(sb); 2621 VNET_SO_ASSERT(so); 2622 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 2623 SOCKBUF_LOCK(sb); 2624 } 2625 } 2626 2627 /* 2628 * For MSG_WAITALL we may have to loop again and wait for 2629 * more data to come in. 2630 */ 2631 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2632 goto restart; 2633 out: 2634 SBLASTRECORDCHK(sb); 2635 SBLASTMBUFCHK(sb); 2636 SOCKBUF_UNLOCK(sb); 2637 SOCK_IO_RECV_UNLOCK(so); 2638 return (error); 2639 } 2640 2641 /* 2642 * Optimized version of soreceive() for simple datagram cases from userspace. 2643 * Unlike in the stream case, we're able to drop a datagram if copyout() 2644 * fails, and because we handle datagrams atomically, we don't need to use a 2645 * sleep lock to prevent I/O interlacing. 2646 */ 2647 int 2648 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 2649 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2650 { 2651 struct mbuf *m, *m2; 2652 int flags, error; 2653 ssize_t len; 2654 struct protosw *pr = so->so_proto; 2655 struct mbuf *nextrecord; 2656 2657 if (psa != NULL) 2658 *psa = NULL; 2659 if (controlp != NULL) 2660 *controlp = NULL; 2661 if (flagsp != NULL) 2662 flags = *flagsp &~ MSG_EOR; 2663 else 2664 flags = 0; 2665 2666 /* 2667 * For any complicated cases, fall back to the full 2668 * soreceive_generic(). 2669 */ 2670 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) 2671 return (soreceive_generic(so, psa, uio, mp0, controlp, 2672 flagsp)); 2673 2674 /* 2675 * Enforce restrictions on use. 2676 */ 2677 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 2678 ("soreceive_dgram: wantrcvd")); 2679 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 2680 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 2681 ("soreceive_dgram: SBS_RCVATMARK")); 2682 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 2683 ("soreceive_dgram: P_CONNREQUIRED")); 2684 2685 /* 2686 * Loop blocking while waiting for a datagram. 
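	 * Unlike soreceive_generic(), no I/O sleep lock is taken here; each
	 * datagram is dequeued atomically under the receive buffer lock, as
	 * noted in the comment above this function.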
2687 */ 2688 SOCKBUF_LOCK(&so->so_rcv); 2689 while ((m = so->so_rcv.sb_mb) == NULL) { 2690 KASSERT(sbavail(&so->so_rcv) == 0, 2691 ("soreceive_dgram: sb_mb NULL but sbavail %u", 2692 sbavail(&so->so_rcv))); 2693 if (so->so_error) { 2694 error = so->so_error; 2695 so->so_error = 0; 2696 SOCKBUF_UNLOCK(&so->so_rcv); 2697 return (error); 2698 } 2699 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 2700 uio->uio_resid == 0) { 2701 SOCKBUF_UNLOCK(&so->so_rcv); 2702 return (0); 2703 } 2704 if ((so->so_state & SS_NBIO) || 2705 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2706 SOCKBUF_UNLOCK(&so->so_rcv); 2707 return (EWOULDBLOCK); 2708 } 2709 SBLASTRECORDCHK(&so->so_rcv); 2710 SBLASTMBUFCHK(&so->so_rcv); 2711 error = sbwait(&so->so_rcv); 2712 if (error) { 2713 SOCKBUF_UNLOCK(&so->so_rcv); 2714 return (error); 2715 } 2716 } 2717 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2718 2719 if (uio->uio_td) 2720 uio->uio_td->td_ru.ru_msgrcv++; 2721 SBLASTRECORDCHK(&so->so_rcv); 2722 SBLASTMBUFCHK(&so->so_rcv); 2723 nextrecord = m->m_nextpkt; 2724 if (nextrecord == NULL) { 2725 KASSERT(so->so_rcv.sb_lastrecord == m, 2726 ("soreceive_dgram: lastrecord != m")); 2727 } 2728 2729 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 2730 ("soreceive_dgram: m_nextpkt != nextrecord")); 2731 2732 /* 2733 * Pull 'm' and its chain off the front of the packet queue. 2734 */ 2735 so->so_rcv.sb_mb = NULL; 2736 sockbuf_pushsync(&so->so_rcv, nextrecord); 2737 2738 /* 2739 * Walk 'm's chain and free that many bytes from the socket buffer. 2740 */ 2741 for (m2 = m; m2 != NULL; m2 = m2->m_next) 2742 sbfree(&so->so_rcv, m2); 2743 2744 /* 2745 * Do a few last checks before we let go of the lock. 2746 */ 2747 SBLASTRECORDCHK(&so->so_rcv); 2748 SBLASTMBUFCHK(&so->so_rcv); 2749 SOCKBUF_UNLOCK(&so->so_rcv); 2750 2751 if (pr->pr_flags & PR_ADDR) { 2752 KASSERT(m->m_type == MT_SONAME, 2753 ("m->m_type == %d", m->m_type)); 2754 if (psa != NULL) 2755 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2756 M_NOWAIT); 2757 m = m_free(m); 2758 } 2759 if (m == NULL) { 2760 /* XXXRW: Can this happen? */ 2761 return (0); 2762 } 2763 2764 /* 2765 * Packet to copyout() is now in 'm' and it is disconnected from the 2766 * queue. 2767 * 2768 * Process one or more MT_CONTROL mbufs present before any data mbufs 2769 * in the first mbuf chain on the socket buffer. We call into the 2770 * protocol to perform externalization (or freeing if controlp == 2771 * NULL). In some cases there can be only MT_CONTROL mbufs without 2772 * MT_DATA mbufs. 
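	 * For example, a local-domain SCM_RIGHTS message can arrive as a
	 * record consisting solely of control data; dom_externalize() is
	 * what turns the in-kernel file references into descriptors for the
	 * receiving process.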
2773 */ 2774 if (m->m_type == MT_CONTROL) { 2775 struct mbuf *cm = NULL, *cmn; 2776 struct mbuf **cme = &cm; 2777 2778 do { 2779 m2 = m->m_next; 2780 m->m_next = NULL; 2781 *cme = m; 2782 cme = &(*cme)->m_next; 2783 m = m2; 2784 } while (m != NULL && m->m_type == MT_CONTROL); 2785 while (cm != NULL) { 2786 cmn = cm->m_next; 2787 cm->m_next = NULL; 2788 if (pr->pr_domain->dom_externalize != NULL) { 2789 error = (*pr->pr_domain->dom_externalize) 2790 (cm, controlp, flags); 2791 } else if (controlp != NULL) 2792 *controlp = cm; 2793 else 2794 m_freem(cm); 2795 if (controlp != NULL) { 2796 while (*controlp != NULL) 2797 controlp = &(*controlp)->m_next; 2798 } 2799 cm = cmn; 2800 } 2801 } 2802 KASSERT(m == NULL || m->m_type == MT_DATA, 2803 ("soreceive_dgram: !data")); 2804 while (m != NULL && uio->uio_resid > 0) { 2805 len = uio->uio_resid; 2806 if (len > m->m_len) 2807 len = m->m_len; 2808 error = uiomove(mtod(m, char *), (int)len, uio); 2809 if (error) { 2810 m_freem(m); 2811 return (error); 2812 } 2813 if (len == m->m_len) 2814 m = m_free(m); 2815 else { 2816 m->m_data += len; 2817 m->m_len -= len; 2818 } 2819 } 2820 if (m != NULL) { 2821 flags |= MSG_TRUNC; 2822 m_freem(m); 2823 } 2824 if (flagsp != NULL) 2825 *flagsp |= flags; 2826 return (0); 2827 } 2828 2829 int 2830 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2831 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2832 { 2833 int error; 2834 2835 CURVNET_SET(so->so_vnet); 2836 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, 2837 mp0, controlp, flagsp)); 2838 CURVNET_RESTORE(); 2839 return (error); 2840 } 2841 2842 int 2843 soshutdown(struct socket *so, int how) 2844 { 2845 struct protosw *pr = so->so_proto; 2846 int error, soerror_enotconn; 2847 2848 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 2849 return (EINVAL); 2850 2851 soerror_enotconn = 0; 2852 if ((so->so_state & 2853 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { 2854 /* 2855 * POSIX mandates us to return ENOTCONN when shutdown(2) is 2856 * invoked on a datagram sockets, however historically we would 2857 * actually tear socket down. This is known to be leveraged by 2858 * some applications to unblock process waiting in recvXXX(2) 2859 * by other process that it shares that socket with. Try to meet 2860 * both backward-compatibility and POSIX requirements by forcing 2861 * ENOTCONN but still asking protocol to perform pru_shutdown(). 2862 */ 2863 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) 2864 return (ENOTCONN); 2865 soerror_enotconn = 1; 2866 } 2867 2868 if (SOLISTENING(so)) { 2869 if (how != SHUT_WR) { 2870 SOLISTEN_LOCK(so); 2871 so->so_error = ECONNABORTED; 2872 solisten_wakeup(so); /* unlocks so */ 2873 } 2874 goto done; 2875 } 2876 2877 CURVNET_SET(so->so_vnet); 2878 if (pr->pr_usrreqs->pru_flush != NULL) 2879 (*pr->pr_usrreqs->pru_flush)(so, how); 2880 if (how != SHUT_WR) 2881 sorflush(so); 2882 if (how != SHUT_RD) { 2883 error = (*pr->pr_usrreqs->pru_shutdown)(so); 2884 wakeup(&so->so_timeo); 2885 CURVNET_RESTORE(); 2886 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); 2887 } 2888 wakeup(&so->so_timeo); 2889 CURVNET_RESTORE(); 2890 2891 done: 2892 return (soerror_enotconn ? 
ENOTCONN : 0); 2893 } 2894 2895 void 2896 sorflush(struct socket *so) 2897 { 2898 struct sockbuf *sb = &so->so_rcv; 2899 struct protosw *pr = so->so_proto; 2900 struct socket aso; 2901 int error; 2902 2903 VNET_SO_ASSERT(so); 2904 2905 /* 2906 * In order to avoid calling dom_dispose with the socket buffer mutex 2907 * held, and in order to generally avoid holding the lock for a long 2908 * time, we make a copy of the socket buffer and clear the original 2909 * (except locks, state). The new socket buffer copy won't have 2910 * initialized locks so we can only call routines that won't use or 2911 * assert those locks. 2912 * 2913 * Dislodge threads currently blocked in receive and wait to acquire 2914 * a lock against other simultaneous readers before clearing the 2915 * socket buffer. Don't let our acquire be interrupted by a signal 2916 * despite any existing socket disposition on interruptable waiting. 2917 */ 2918 socantrcvmore(so); 2919 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); 2920 KASSERT(error == 0, ("%s: cannot lock sock %p recv buffer", 2921 __func__, so)); 2922 2923 /* 2924 * Invalidate/clear most of the sockbuf structure, but leave selinfo 2925 * and mutex data unchanged. 2926 */ 2927 SOCKBUF_LOCK(sb); 2928 bzero(&aso, sizeof(aso)); 2929 aso.so_pcb = so->so_pcb; 2930 bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, 2931 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2932 bzero(&sb->sb_startzero, 2933 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2934 SOCKBUF_UNLOCK(sb); 2935 SOCK_IO_RECV_UNLOCK(so); 2936 2937 /* 2938 * Dispose of special rights and flush the copied socket. Don't call 2939 * any unsafe routines (that rely on locks being initialized) on aso. 2940 */ 2941 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 2942 (*pr->pr_domain->dom_dispose)(&aso); 2943 sbrelease_internal(&aso.so_rcv, so); 2944 } 2945 2946 /* 2947 * Wrapper for Socket established helper hook. 2948 * Parameters: socket, context of the hook point, hook id. 2949 */ 2950 static int inline 2951 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 2952 { 2953 struct socket_hhook_data hhook_data = { 2954 .so = so, 2955 .hctx = hctx, 2956 .m = NULL, 2957 .status = 0 2958 }; 2959 2960 CURVNET_SET(so->so_vnet); 2961 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 2962 CURVNET_RESTORE(); 2963 2964 /* Ugly but needed, since hhooks return void for now */ 2965 return (hhook_data.status); 2966 } 2967 2968 /* 2969 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 2970 * additional variant to handle the case where the option value needs to be 2971 * some kind of integer, but not a specific size. In addition to their use 2972 * here, these functions are also called by the protocol-level pr_ctloutput() 2973 * routines. 2974 */ 2975 int 2976 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 2977 { 2978 size_t valsize; 2979 2980 /* 2981 * If the user gives us more than we wanted, we ignore it, but if we 2982 * don't get the minimum length the caller wants, we return EINVAL. 2983 * On success, sopt->sopt_valsize is set to however much we actually 2984 * retrieved. 2985 */ 2986 if ((valsize = sopt->sopt_valsize) < minlen) 2987 return EINVAL; 2988 if (valsize > len) 2989 sopt->sopt_valsize = valsize = len; 2990 2991 if (sopt->sopt_td != NULL) 2992 return (copyin(sopt->sopt_val, buf, valsize)); 2993 2994 bcopy(sopt->sopt_val, buf, valsize); 2995 return (0); 2996 } 2997 2998 /* 2999 * Kernel version of setsockopt(2). 
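 *
 * Illustrative sketch only (not part of the original source), assuming
 * 'so' is a valid in-kernel socket reference:
 *
 *	int one = 1;
 *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one,
 *	    sizeof(one));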
3000 * 3001 * XXX: optlen is size_t, not socklen_t 3002 */ 3003 int 3004 so_setsockopt(struct socket *so, int level, int optname, void *optval, 3005 size_t optlen) 3006 { 3007 struct sockopt sopt; 3008 3009 sopt.sopt_level = level; 3010 sopt.sopt_name = optname; 3011 sopt.sopt_dir = SOPT_SET; 3012 sopt.sopt_val = optval; 3013 sopt.sopt_valsize = optlen; 3014 sopt.sopt_td = NULL; 3015 return (sosetopt(so, &sopt)); 3016 } 3017 3018 int 3019 sosetopt(struct socket *so, struct sockopt *sopt) 3020 { 3021 int error, optval; 3022 struct linger l; 3023 struct timeval tv; 3024 sbintime_t val; 3025 uint32_t val32; 3026 #ifdef MAC 3027 struct mac extmac; 3028 #endif 3029 3030 CURVNET_SET(so->so_vnet); 3031 error = 0; 3032 if (sopt->sopt_level != SOL_SOCKET) { 3033 if (so->so_proto->pr_ctloutput != NULL) 3034 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3035 else 3036 error = ENOPROTOOPT; 3037 } else { 3038 switch (sopt->sopt_name) { 3039 case SO_ACCEPTFILTER: 3040 error = accept_filt_setopt(so, sopt); 3041 if (error) 3042 goto bad; 3043 break; 3044 3045 case SO_LINGER: 3046 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 3047 if (error) 3048 goto bad; 3049 if (l.l_linger < 0 || 3050 l.l_linger > USHRT_MAX || 3051 l.l_linger > (INT_MAX / hz)) { 3052 error = EDOM; 3053 goto bad; 3054 } 3055 SOCK_LOCK(so); 3056 so->so_linger = l.l_linger; 3057 if (l.l_onoff) 3058 so->so_options |= SO_LINGER; 3059 else 3060 so->so_options &= ~SO_LINGER; 3061 SOCK_UNLOCK(so); 3062 break; 3063 3064 case SO_DEBUG: 3065 case SO_KEEPALIVE: 3066 case SO_DONTROUTE: 3067 case SO_USELOOPBACK: 3068 case SO_BROADCAST: 3069 case SO_REUSEADDR: 3070 case SO_REUSEPORT: 3071 case SO_REUSEPORT_LB: 3072 case SO_OOBINLINE: 3073 case SO_TIMESTAMP: 3074 case SO_BINTIME: 3075 case SO_NOSIGPIPE: 3076 case SO_NO_DDP: 3077 case SO_NO_OFFLOAD: 3078 case SO_RERROR: 3079 error = sooptcopyin(sopt, &optval, sizeof optval, 3080 sizeof optval); 3081 if (error) 3082 goto bad; 3083 SOCK_LOCK(so); 3084 if (optval) 3085 so->so_options |= sopt->sopt_name; 3086 else 3087 so->so_options &= ~sopt->sopt_name; 3088 SOCK_UNLOCK(so); 3089 break; 3090 3091 case SO_SETFIB: 3092 error = sooptcopyin(sopt, &optval, sizeof optval, 3093 sizeof optval); 3094 if (error) 3095 goto bad; 3096 3097 if (optval < 0 || optval >= rt_numfibs) { 3098 error = EINVAL; 3099 goto bad; 3100 } 3101 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 3102 (so->so_proto->pr_domain->dom_family == PF_INET6) || 3103 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 3104 so->so_fibnum = optval; 3105 else 3106 so->so_fibnum = 0; 3107 break; 3108 3109 case SO_USER_COOKIE: 3110 error = sooptcopyin(sopt, &val32, sizeof val32, 3111 sizeof val32); 3112 if (error) 3113 goto bad; 3114 so->so_user_cookie = val32; 3115 break; 3116 3117 case SO_SNDBUF: 3118 case SO_RCVBUF: 3119 case SO_SNDLOWAT: 3120 case SO_RCVLOWAT: 3121 error = sooptcopyin(sopt, &optval, sizeof optval, 3122 sizeof optval); 3123 if (error) 3124 goto bad; 3125 3126 /* 3127 * Values < 1 make no sense for any of these options, 3128 * so disallow them. 
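			 * sbsetopt() applies any remaining limits, such as
			 * the global socket buffer size cap, when it
			 * actually resizes the buffer.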
3129 */ 3130 if (optval < 1) { 3131 error = EINVAL; 3132 goto bad; 3133 } 3134 3135 error = sbsetopt(so, sopt->sopt_name, optval); 3136 break; 3137 3138 case SO_SNDTIMEO: 3139 case SO_RCVTIMEO: 3140 #ifdef COMPAT_FREEBSD32 3141 if (SV_CURPROC_FLAG(SV_ILP32)) { 3142 struct timeval32 tv32; 3143 3144 error = sooptcopyin(sopt, &tv32, sizeof tv32, 3145 sizeof tv32); 3146 CP(tv32, tv, tv_sec); 3147 CP(tv32, tv, tv_usec); 3148 } else 3149 #endif 3150 error = sooptcopyin(sopt, &tv, sizeof tv, 3151 sizeof tv); 3152 if (error) 3153 goto bad; 3154 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 3155 tv.tv_usec >= 1000000) { 3156 error = EDOM; 3157 goto bad; 3158 } 3159 if (tv.tv_sec > INT32_MAX) 3160 val = SBT_MAX; 3161 else 3162 val = tvtosbt(tv); 3163 switch (sopt->sopt_name) { 3164 case SO_SNDTIMEO: 3165 so->so_snd.sb_timeo = val; 3166 break; 3167 case SO_RCVTIMEO: 3168 so->so_rcv.sb_timeo = val; 3169 break; 3170 } 3171 break; 3172 3173 case SO_LABEL: 3174 #ifdef MAC 3175 error = sooptcopyin(sopt, &extmac, sizeof extmac, 3176 sizeof extmac); 3177 if (error) 3178 goto bad; 3179 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 3180 so, &extmac); 3181 #else 3182 error = EOPNOTSUPP; 3183 #endif 3184 break; 3185 3186 case SO_TS_CLOCK: 3187 error = sooptcopyin(sopt, &optval, sizeof optval, 3188 sizeof optval); 3189 if (error) 3190 goto bad; 3191 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 3192 error = EINVAL; 3193 goto bad; 3194 } 3195 so->so_ts_clock = optval; 3196 break; 3197 3198 case SO_MAX_PACING_RATE: 3199 error = sooptcopyin(sopt, &val32, sizeof(val32), 3200 sizeof(val32)); 3201 if (error) 3202 goto bad; 3203 so->so_max_pacing_rate = val32; 3204 break; 3205 3206 default: 3207 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3208 error = hhook_run_socket(so, sopt, 3209 HHOOK_SOCKET_OPT); 3210 else 3211 error = ENOPROTOOPT; 3212 break; 3213 } 3214 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 3215 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 3216 } 3217 bad: 3218 CURVNET_RESTORE(); 3219 return (error); 3220 } 3221 3222 /* 3223 * Helper routine for getsockopt. 3224 */ 3225 int 3226 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 3227 { 3228 int error; 3229 size_t valsize; 3230 3231 error = 0; 3232 3233 /* 3234 * Documented get behavior is that we always return a value, possibly 3235 * truncated to fit in the user's buffer. Traditional behavior is 3236 * that we always tell the user precisely how much we copied, rather 3237 * than something useful like the total amount we had available for 3238 * her. Note that this interface is not idempotent; the entire 3239 * answer must be generated ahead of time. 
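 *
 * Illustrative sketch only (not part of the original source) of the usual
 * calling pattern in a protocol's pr_ctloutput get path, with 'optval'
 * standing in for whatever value the protocol has already computed:
 *
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));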
3240 */ 3241 valsize = min(len, sopt->sopt_valsize); 3242 sopt->sopt_valsize = valsize; 3243 if (sopt->sopt_val != NULL) { 3244 if (sopt->sopt_td != NULL) 3245 error = copyout(buf, sopt->sopt_val, valsize); 3246 else 3247 bcopy(buf, sopt->sopt_val, valsize); 3248 } 3249 return (error); 3250 } 3251 3252 int 3253 sogetopt(struct socket *so, struct sockopt *sopt) 3254 { 3255 int error, optval; 3256 struct linger l; 3257 struct timeval tv; 3258 #ifdef MAC 3259 struct mac extmac; 3260 #endif 3261 3262 CURVNET_SET(so->so_vnet); 3263 error = 0; 3264 if (sopt->sopt_level != SOL_SOCKET) { 3265 if (so->so_proto->pr_ctloutput != NULL) 3266 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3267 else 3268 error = ENOPROTOOPT; 3269 CURVNET_RESTORE(); 3270 return (error); 3271 } else { 3272 switch (sopt->sopt_name) { 3273 case SO_ACCEPTFILTER: 3274 error = accept_filt_getopt(so, sopt); 3275 break; 3276 3277 case SO_LINGER: 3278 SOCK_LOCK(so); 3279 l.l_onoff = so->so_options & SO_LINGER; 3280 l.l_linger = so->so_linger; 3281 SOCK_UNLOCK(so); 3282 error = sooptcopyout(sopt, &l, sizeof l); 3283 break; 3284 3285 case SO_USELOOPBACK: 3286 case SO_DONTROUTE: 3287 case SO_DEBUG: 3288 case SO_KEEPALIVE: 3289 case SO_REUSEADDR: 3290 case SO_REUSEPORT: 3291 case SO_REUSEPORT_LB: 3292 case SO_BROADCAST: 3293 case SO_OOBINLINE: 3294 case SO_ACCEPTCONN: 3295 case SO_TIMESTAMP: 3296 case SO_BINTIME: 3297 case SO_NOSIGPIPE: 3298 case SO_NO_DDP: 3299 case SO_NO_OFFLOAD: 3300 case SO_RERROR: 3301 optval = so->so_options & sopt->sopt_name; 3302 integer: 3303 error = sooptcopyout(sopt, &optval, sizeof optval); 3304 break; 3305 3306 case SO_DOMAIN: 3307 optval = so->so_proto->pr_domain->dom_family; 3308 goto integer; 3309 3310 case SO_TYPE: 3311 optval = so->so_type; 3312 goto integer; 3313 3314 case SO_PROTOCOL: 3315 optval = so->so_proto->pr_protocol; 3316 goto integer; 3317 3318 case SO_ERROR: 3319 SOCK_LOCK(so); 3320 if (so->so_error) { 3321 optval = so->so_error; 3322 so->so_error = 0; 3323 } else { 3324 optval = so->so_rerror; 3325 so->so_rerror = 0; 3326 } 3327 SOCK_UNLOCK(so); 3328 goto integer; 3329 3330 case SO_SNDBUF: 3331 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 3332 so->so_snd.sb_hiwat; 3333 goto integer; 3334 3335 case SO_RCVBUF: 3336 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 3337 so->so_rcv.sb_hiwat; 3338 goto integer; 3339 3340 case SO_SNDLOWAT: 3341 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 3342 so->so_snd.sb_lowat; 3343 goto integer; 3344 3345 case SO_RCVLOWAT: 3346 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 3347 so->so_rcv.sb_lowat; 3348 goto integer; 3349 3350 case SO_SNDTIMEO: 3351 case SO_RCVTIMEO: 3352 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 
3353 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 3354 #ifdef COMPAT_FREEBSD32 3355 if (SV_CURPROC_FLAG(SV_ILP32)) { 3356 struct timeval32 tv32; 3357 3358 CP(tv, tv32, tv_sec); 3359 CP(tv, tv32, tv_usec); 3360 error = sooptcopyout(sopt, &tv32, sizeof tv32); 3361 } else 3362 #endif 3363 error = sooptcopyout(sopt, &tv, sizeof tv); 3364 break; 3365 3366 case SO_LABEL: 3367 #ifdef MAC 3368 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3369 sizeof(extmac)); 3370 if (error) 3371 goto bad; 3372 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 3373 so, &extmac); 3374 if (error) 3375 goto bad; 3376 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3377 #else 3378 error = EOPNOTSUPP; 3379 #endif 3380 break; 3381 3382 case SO_PEERLABEL: 3383 #ifdef MAC 3384 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3385 sizeof(extmac)); 3386 if (error) 3387 goto bad; 3388 error = mac_getsockopt_peerlabel( 3389 sopt->sopt_td->td_ucred, so, &extmac); 3390 if (error) 3391 goto bad; 3392 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3393 #else 3394 error = EOPNOTSUPP; 3395 #endif 3396 break; 3397 3398 case SO_LISTENQLIMIT: 3399 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 3400 goto integer; 3401 3402 case SO_LISTENQLEN: 3403 optval = SOLISTENING(so) ? so->sol_qlen : 0; 3404 goto integer; 3405 3406 case SO_LISTENINCQLEN: 3407 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 3408 goto integer; 3409 3410 case SO_TS_CLOCK: 3411 optval = so->so_ts_clock; 3412 goto integer; 3413 3414 case SO_MAX_PACING_RATE: 3415 optval = so->so_max_pacing_rate; 3416 goto integer; 3417 3418 default: 3419 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3420 error = hhook_run_socket(so, sopt, 3421 HHOOK_SOCKET_OPT); 3422 else 3423 error = ENOPROTOOPT; 3424 break; 3425 } 3426 } 3427 #ifdef MAC 3428 bad: 3429 #endif 3430 CURVNET_RESTORE(); 3431 return (error); 3432 } 3433 3434 int 3435 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 3436 { 3437 struct mbuf *m, *m_prev; 3438 int sopt_size = sopt->sopt_valsize; 3439 3440 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3441 if (m == NULL) 3442 return ENOBUFS; 3443 if (sopt_size > MLEN) { 3444 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 3445 if ((m->m_flags & M_EXT) == 0) { 3446 m_free(m); 3447 return ENOBUFS; 3448 } 3449 m->m_len = min(MCLBYTES, sopt_size); 3450 } else { 3451 m->m_len = min(MLEN, sopt_size); 3452 } 3453 sopt_size -= m->m_len; 3454 *mp = m; 3455 m_prev = m; 3456 3457 while (sopt_size) { 3458 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3459 if (m == NULL) { 3460 m_freem(*mp); 3461 return ENOBUFS; 3462 } 3463 if (sopt_size > MLEN) { 3464 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 3465 M_NOWAIT); 3466 if ((m->m_flags & M_EXT) == 0) { 3467 m_freem(m); 3468 m_freem(*mp); 3469 return ENOBUFS; 3470 } 3471 m->m_len = min(MCLBYTES, sopt_size); 3472 } else { 3473 m->m_len = min(MLEN, sopt_size); 3474 } 3475 sopt_size -= m->m_len; 3476 m_prev->m_next = m; 3477 m_prev = m; 3478 } 3479 return (0); 3480 } 3481 3482 int 3483 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 3484 { 3485 struct mbuf *m0 = m; 3486 3487 if (sopt->sopt_val == NULL) 3488 return (0); 3489 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3490 if (sopt->sopt_td != NULL) { 3491 int error; 3492 3493 error = copyin(sopt->sopt_val, mtod(m, char *), 3494 m->m_len); 3495 if (error != 0) { 3496 m_freem(m0); 3497 return(error); 3498 } 3499 } else 3500 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 3501 sopt->sopt_valsize -= m->m_len; 3502 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3503 m = m->m_next; 3504 } 3505 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 3506 panic("ip6_sooptmcopyin"); 3507 return (0); 3508 } 3509 3510 int 3511 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 3512 { 3513 struct mbuf *m0 = m; 3514 size_t valsize = 0; 3515 3516 if (sopt->sopt_val == NULL) 3517 return (0); 3518 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3519 if (sopt->sopt_td != NULL) { 3520 int error; 3521 3522 error = copyout(mtod(m, char *), sopt->sopt_val, 3523 m->m_len); 3524 if (error != 0) { 3525 m_freem(m0); 3526 return(error); 3527 } 3528 } else 3529 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 3530 sopt->sopt_valsize -= m->m_len; 3531 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3532 valsize += m->m_len; 3533 m = m->m_next; 3534 } 3535 if (m != NULL) { 3536 /* enough soopt buffer should be given from user-land */ 3537 m_freem(m0); 3538 return(EINVAL); 3539 } 3540 sopt->sopt_valsize = valsize; 3541 return (0); 3542 } 3543 3544 /* 3545 * sohasoutofband(): protocol notifies socket layer of the arrival of new 3546 * out-of-band data, which will then notify socket consumers. 3547 */ 3548 void 3549 sohasoutofband(struct socket *so) 3550 { 3551 3552 if (so->so_sigio != NULL) 3553 pgsigio(&so->so_sigio, SIGURG, 0); 3554 selwakeuppri(&so->so_rdsel, PSOCK); 3555 } 3556 3557 int 3558 sopoll(struct socket *so, int events, struct ucred *active_cred, 3559 struct thread *td) 3560 { 3561 3562 /* 3563 * We do not need to set or assert curvnet as long as everyone uses 3564 * sopoll_generic(). 
3565 */ 3566 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, 3567 td)); 3568 } 3569 3570 int 3571 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 3572 struct thread *td) 3573 { 3574 int revents; 3575 3576 SOCK_LOCK(so); 3577 if (SOLISTENING(so)) { 3578 if (!(events & (POLLIN | POLLRDNORM))) 3579 revents = 0; 3580 else if (!TAILQ_EMPTY(&so->sol_comp)) 3581 revents = events & (POLLIN | POLLRDNORM); 3582 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 3583 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 3584 else { 3585 selrecord(td, &so->so_rdsel); 3586 revents = 0; 3587 } 3588 } else { 3589 revents = 0; 3590 SOCKBUF_LOCK(&so->so_snd); 3591 SOCKBUF_LOCK(&so->so_rcv); 3592 if (events & (POLLIN | POLLRDNORM)) 3593 if (soreadabledata(so)) 3594 revents |= events & (POLLIN | POLLRDNORM); 3595 if (events & (POLLOUT | POLLWRNORM)) 3596 if (sowriteable(so)) 3597 revents |= events & (POLLOUT | POLLWRNORM); 3598 if (events & (POLLPRI | POLLRDBAND)) 3599 if (so->so_oobmark || 3600 (so->so_rcv.sb_state & SBS_RCVATMARK)) 3601 revents |= events & (POLLPRI | POLLRDBAND); 3602 if ((events & POLLINIGNEOF) == 0) { 3603 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3604 revents |= events & (POLLIN | POLLRDNORM); 3605 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 3606 revents |= POLLHUP; 3607 } 3608 } 3609 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 3610 revents |= events & POLLRDHUP; 3611 if (revents == 0) { 3612 if (events & 3613 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { 3614 selrecord(td, &so->so_rdsel); 3615 so->so_rcv.sb_flags |= SB_SEL; 3616 } 3617 if (events & (POLLOUT | POLLWRNORM)) { 3618 selrecord(td, &so->so_wrsel); 3619 so->so_snd.sb_flags |= SB_SEL; 3620 } 3621 } 3622 SOCKBUF_UNLOCK(&so->so_rcv); 3623 SOCKBUF_UNLOCK(&so->so_snd); 3624 } 3625 SOCK_UNLOCK(so); 3626 return (revents); 3627 } 3628 3629 int 3630 soo_kqfilter(struct file *fp, struct knote *kn) 3631 { 3632 struct socket *so = kn->kn_fp->f_data; 3633 struct sockbuf *sb; 3634 struct knlist *knl; 3635 3636 switch (kn->kn_filter) { 3637 case EVFILT_READ: 3638 kn->kn_fop = &soread_filtops; 3639 knl = &so->so_rdsel.si_note; 3640 sb = &so->so_rcv; 3641 break; 3642 case EVFILT_WRITE: 3643 kn->kn_fop = &sowrite_filtops; 3644 knl = &so->so_wrsel.si_note; 3645 sb = &so->so_snd; 3646 break; 3647 case EVFILT_EMPTY: 3648 kn->kn_fop = &soempty_filtops; 3649 knl = &so->so_wrsel.si_note; 3650 sb = &so->so_snd; 3651 break; 3652 default: 3653 return (EINVAL); 3654 } 3655 3656 SOCK_LOCK(so); 3657 if (SOLISTENING(so)) { 3658 knlist_add(knl, kn, 1); 3659 } else { 3660 SOCKBUF_LOCK(sb); 3661 knlist_add(knl, kn, 1); 3662 sb->sb_flags |= SB_KNOTE; 3663 SOCKBUF_UNLOCK(sb); 3664 } 3665 SOCK_UNLOCK(so); 3666 return (0); 3667 } 3668 3669 /* 3670 * Some routines that return EOPNOTSUPP for entry points that are not 3671 * supported by a protocol. Fill in as needed. 
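 * Protocols typically point unused members of their struct pr_usrreqs at
 * these stubs, e.g. ".pru_accept = pru_accept_notsupp", so that a call
 * through an unimplemented entry point fails cleanly rather than
 * dereferencing a NULL pointer.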
3672 */ 3673 int 3674 pru_accept_notsupp(struct socket *so, struct sockaddr **nam) 3675 { 3676 3677 return EOPNOTSUPP; 3678 } 3679 3680 int 3681 pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) 3682 { 3683 3684 return EOPNOTSUPP; 3685 } 3686 3687 int 3688 pru_attach_notsupp(struct socket *so, int proto, struct thread *td) 3689 { 3690 3691 return EOPNOTSUPP; 3692 } 3693 3694 int 3695 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3696 { 3697 3698 return EOPNOTSUPP; 3699 } 3700 3701 int 3702 pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3703 struct thread *td) 3704 { 3705 3706 return EOPNOTSUPP; 3707 } 3708 3709 int 3710 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3711 { 3712 3713 return EOPNOTSUPP; 3714 } 3715 3716 int 3717 pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3718 struct thread *td) 3719 { 3720 3721 return EOPNOTSUPP; 3722 } 3723 3724 int 3725 pru_connect2_notsupp(struct socket *so1, struct socket *so2) 3726 { 3727 3728 return EOPNOTSUPP; 3729 } 3730 3731 int 3732 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, 3733 struct ifnet *ifp, struct thread *td) 3734 { 3735 3736 return EOPNOTSUPP; 3737 } 3738 3739 int 3740 pru_disconnect_notsupp(struct socket *so) 3741 { 3742 3743 return EOPNOTSUPP; 3744 } 3745 3746 int 3747 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) 3748 { 3749 3750 return EOPNOTSUPP; 3751 } 3752 3753 int 3754 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) 3755 { 3756 3757 return EOPNOTSUPP; 3758 } 3759 3760 int 3761 pru_rcvd_notsupp(struct socket *so, int flags) 3762 { 3763 3764 return EOPNOTSUPP; 3765 } 3766 3767 int 3768 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) 3769 { 3770 3771 return EOPNOTSUPP; 3772 } 3773 3774 int 3775 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, 3776 struct sockaddr *addr, struct mbuf *control, struct thread *td) 3777 { 3778 3779 if (control != NULL) 3780 m_freem(control); 3781 if ((flags & PRUS_NOTREADY) == 0) 3782 m_freem(m); 3783 return (EOPNOTSUPP); 3784 } 3785 3786 int 3787 pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) 3788 { 3789 3790 return (EOPNOTSUPP); 3791 } 3792 3793 /* 3794 * This isn't really a ``null'' operation, but it's the default one and 3795 * doesn't do anything destructive. 
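 * It is used to service fstat(2) on a socket, reporting the send buffer
 * high-water mark as the preferred block size and leaving the rest of the
 * stat structure to the caller.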
3796 */ 3797 int 3798 pru_sense_null(struct socket *so, struct stat *sb) 3799 { 3800 3801 sb->st_blksize = so->so_snd.sb_hiwat; 3802 return 0; 3803 } 3804 3805 int 3806 pru_shutdown_notsupp(struct socket *so) 3807 { 3808 3809 return EOPNOTSUPP; 3810 } 3811 3812 int 3813 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) 3814 { 3815 3816 return EOPNOTSUPP; 3817 } 3818 3819 int 3820 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, 3821 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 3822 { 3823 3824 return EOPNOTSUPP; 3825 } 3826 3827 int 3828 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, 3829 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3830 { 3831 3832 return EOPNOTSUPP; 3833 } 3834 3835 int 3836 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, 3837 struct thread *td) 3838 { 3839 3840 return EOPNOTSUPP; 3841 } 3842 3843 static void 3844 filt_sordetach(struct knote *kn) 3845 { 3846 struct socket *so = kn->kn_fp->f_data; 3847 3848 so_rdknl_lock(so); 3849 knlist_remove(&so->so_rdsel.si_note, kn, 1); 3850 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 3851 so->so_rcv.sb_flags &= ~SB_KNOTE; 3852 so_rdknl_unlock(so); 3853 } 3854 3855 /*ARGSUSED*/ 3856 static int 3857 filt_soread(struct knote *kn, long hint) 3858 { 3859 struct socket *so; 3860 3861 so = kn->kn_fp->f_data; 3862 3863 if (SOLISTENING(so)) { 3864 SOCK_LOCK_ASSERT(so); 3865 kn->kn_data = so->sol_qlen; 3866 if (so->so_error) { 3867 kn->kn_flags |= EV_EOF; 3868 kn->kn_fflags = so->so_error; 3869 return (1); 3870 } 3871 return (!TAILQ_EMPTY(&so->sol_comp)); 3872 } 3873 3874 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3875 3876 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 3877 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3878 kn->kn_flags |= EV_EOF; 3879 kn->kn_fflags = so->so_error; 3880 return (1); 3881 } else if (so->so_error || so->so_rerror) 3882 return (1); 3883 3884 if (kn->kn_sfflags & NOTE_LOWAT) { 3885 if (kn->kn_data >= kn->kn_sdata) 3886 return (1); 3887 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 3888 return (1); 3889 3890 /* This hook returning non-zero indicates an event, not error */ 3891 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 3892 } 3893 3894 static void 3895 filt_sowdetach(struct knote *kn) 3896 { 3897 struct socket *so = kn->kn_fp->f_data; 3898 3899 so_wrknl_lock(so); 3900 knlist_remove(&so->so_wrsel.si_note, kn, 1); 3901 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 3902 so->so_snd.sb_flags &= ~SB_KNOTE; 3903 so_wrknl_unlock(so); 3904 } 3905 3906 /*ARGSUSED*/ 3907 static int 3908 filt_sowrite(struct knote *kn, long hint) 3909 { 3910 struct socket *so; 3911 3912 so = kn->kn_fp->f_data; 3913 3914 if (SOLISTENING(so)) 3915 return (0); 3916 3917 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3918 kn->kn_data = sbspace(&so->so_snd); 3919 3920 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 3921 3922 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3923 kn->kn_flags |= EV_EOF; 3924 kn->kn_fflags = so->so_error; 3925 return (1); 3926 } else if (so->so_error) /* temporary udp error */ 3927 return (1); 3928 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3929 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3930 return (0); 3931 else if (kn->kn_sfflags & NOTE_LOWAT) 3932 return (kn->kn_data >= kn->kn_sdata); 3933 else 3934 return (kn->kn_data >= so->so_snd.sb_lowat); 3935 } 3936 3937 static int 3938 filt_soempty(struct knote *kn, long hint) 3939 { 3940 struct 
socket *so; 3941 3942 so = kn->kn_fp->f_data; 3943 3944 if (SOLISTENING(so)) 3945 return (1); 3946 3947 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3948 kn->kn_data = sbused(&so->so_snd); 3949 3950 if (kn->kn_data == 0) 3951 return (1); 3952 else 3953 return (0); 3954 } 3955 3956 int 3957 socheckuid(struct socket *so, uid_t uid) 3958 { 3959 3960 if (so == NULL) 3961 return (EPERM); 3962 if (so->so_cred->cr_uid != uid) 3963 return (EPERM); 3964 return (0); 3965 } 3966 3967 /* 3968 * These functions are used by protocols to notify the socket layer (and its 3969 * consumers) of state changes in the sockets driven by protocol-side events. 3970 */ 3971 3972 /* 3973 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3974 * 3975 * Normal sequence from the active (originating) side is that 3976 * soisconnecting() is called during processing of connect() call, resulting 3977 * in an eventual call to soisconnected() if/when the connection is 3978 * established. When the connection is torn down soisdisconnecting() is 3979 * called during processing of disconnect() call, and soisdisconnected() is 3980 * called when the connection to the peer is totally severed. The semantics 3981 * of these routines are such that connectionless protocols can call 3982 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3983 * calls when setting up a ``connection'' takes no time. 3984 * 3985 * From the passive side, a socket is created with two queues of sockets: 3986 * so_incomp for connections in progress and so_comp for connections already 3987 * made and awaiting user acceptance. As a protocol is preparing incoming 3988 * connections, it creates a socket structure queued on so_incomp by calling 3989 * sonewconn(). When the connection is established, soisconnected() is 3990 * called, and transfers the socket structure to so_comp, making it available 3991 * to accept(). 3992 * 3993 * If a socket is closed with sockets on either so_incomp or so_comp, these 3994 * sockets are dropped. 3995 * 3996 * If higher-level protocols are implemented in the kernel, the wakeups done 3997 * here will sometimes cause software-interrupt process scheduling. 3998 */ 3999 void 4000 soisconnecting(struct socket *so) 4001 { 4002 4003 SOCK_LOCK(so); 4004 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 4005 so->so_state |= SS_ISCONNECTING; 4006 SOCK_UNLOCK(so); 4007 } 4008 4009 void 4010 soisconnected(struct socket *so) 4011 { 4012 4013 SOCK_LOCK(so); 4014 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 4015 so->so_state |= SS_ISCONNECTED; 4016 4017 if (so->so_qstate == SQ_INCOMP) { 4018 struct socket *head = so->so_listen; 4019 int ret; 4020 4021 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 4022 /* 4023 * Promoting a socket from incomplete queue to complete, we 4024 * need to go through reverse order of locking. We first do 4025 * trylock, and if that doesn't succeed, we go the hard way 4026 * leaving a reference and rechecking consistency after proper 4027 * locking. 4028 */ 4029 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 4030 soref(head); 4031 SOCK_UNLOCK(so); 4032 SOLISTEN_LOCK(head); 4033 SOCK_LOCK(so); 4034 if (__predict_false(head != so->so_listen)) { 4035 /* 4036 * The socket went off the listen queue, 4037 * should be lost race to close(2) of sol. 4038 * The socket is about to soabort(). 4039 */ 4040 SOCK_UNLOCK(so); 4041 sorele(head); 4042 return; 4043 } 4044 /* Not the last one, as so holds a ref. 
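			 * A bare refcount_release() suffices here: 'so' is
			 * still linked to 'head' via so_listen, so this
			 * cannot drop the final reference.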

void
soisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
	SOCK_UNLOCK(so);
}

void
soisconnected(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	if (so->so_qstate == SQ_INCOMP) {
		struct socket *head = so->so_listen;
		int ret;

		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
		/*
		 * Promoting a socket from the incomplete queue to the
		 * complete queue requires taking the locks in the reverse
		 * of the usual order.  First try a trylock; if that fails,
		 * take the hard path: leave a reference on the listening
		 * socket, drop the socket lock, acquire the locks in the
		 * proper order, and recheck consistency afterwards.
		 */
		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
			soref(head);
			SOCK_UNLOCK(so);
			SOLISTEN_LOCK(head);
			SOCK_LOCK(so);
			if (__predict_false(head != so->so_listen)) {
				/*
				 * The socket left the listen queue; most
				 * likely we lost a race with close(2) on
				 * the listening socket, and the socket is
				 * about to be aborted by soabort().
				 */
				SOCK_UNLOCK(so);
				sorele(head);
				return;
			}
			/* Not the last reference, as so still holds one. */
			refcount_release(&head->so_count);
		}
again:
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
			head->sol_incqlen--;
			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
			head->sol_qlen++;
			so->so_qstate = SQ_COMP;
			SOCK_UNLOCK(so);
			solisten_wakeup(head);	/* unlocks */
		} else {
			SOCKBUF_LOCK(&so->so_rcv);
			soupcall_set(so, SO_RCV,
			    head->sol_accept_filter->accf_callback,
			    head->sol_accept_filter_arg);
			so->so_options &= ~SO_ACCEPTFILTER;
			ret = head->sol_accept_filter->accf_callback(so,
			    head->sol_accept_filter_arg, M_NOWAIT);
			if (ret == SU_ISCONNECTED) {
				soupcall_clear(so, SO_RCV);
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto again;
			}
			SOCKBUF_UNLOCK(&so->so_rcv);
			SOCK_UNLOCK(so);
			SOLISTEN_UNLOCK(head);
		}
		return;
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}

void
soisdisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	if (!SOLISTENING(so)) {
		SOCKBUF_LOCK(&so->so_rcv);
		socantrcvmore_locked(so);
		SOCKBUF_LOCK(&so->so_snd);
		socantsendmore_locked(so);
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

void
soisdisconnected(struct socket *so)
{

	SOCK_LOCK(so);

	/*
	 * There is at least one reader of so_state that does not
	 * acquire the socket lock, namely soreceive_generic().  Ensure
	 * that it never sees all of the flags tracking connection
	 * status cleared at once, by ordering the updates with the
	 * release semantics of the thread fence.
	 */
	so->so_state |= SS_ISDISCONNECTED;
	atomic_thread_fence_rel();
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);

	if (!SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		SOCKBUF_LOCK(&so->so_rcv);
		socantrcvmore_locked(so);
		SOCKBUF_LOCK(&so->so_snd);
		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
		socantsendmore_locked(so);
	} else
		SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}

int
soiolock(struct socket *so, struct sx *sx, int flags)
{
	int error;

	KASSERT((flags & SBL_VALID) == flags,
	    ("soiolock: invalid flags %#x", flags));

	if ((flags & SBL_WAIT) != 0) {
		if ((flags & SBL_NOINTR) != 0) {
			sx_xlock(sx);
		} else {
			error = sx_xlock_sig(sx);
			if (error != 0)
				return (error);
		}
	} else if (!sx_try_xlock(sx)) {
		return (EWOULDBLOCK);
	}

	if (__predict_false(SOLISTENING(so))) {
		sx_xunlock(sx);
		return (ENOTCONN);
	}
	return (0);
}

void
soiounlock(struct sx *sx)
{
	sx_xunlock(sx);
}

/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
sodupsockaddr(const struct sockaddr *sa, int mflags)
{
	struct sockaddr *sa2;

	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}

/*
 * Register per-socket destructor.
 */
void
sodtor_set(struct socket *so, so_dtor_t *func)
{

	SOCK_LOCK_ASSERT(so);
	so->so_dtor = func;
}
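
/*
 * A minimal usage sketch for sodtor_set() (hypothetical consumer code, not
 * from this file): a kernel consumer that attaches private state to a socket
 * can register a destructor so that state is released when the socket layer
 * tears the socket down.  The callback name below is illustrative only:
 *
 *	static void
 *	my_so_dtor(struct socket *so)
 *	{
 *		release per-socket consumer state here
 *	}
 *
 *	SOCK_LOCK(so);
 *	sodtor_set(so, my_so_dtor);	registration requires the socket
 *	SOCK_UNLOCK(so);		lock, per the assertion above
 */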

/*
 * Register per-socket buffer upcalls.
 */
void
soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
{
	struct sockbuf *sb;

	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

	switch (which) {
	case SO_RCV:
		sb = &so->so_rcv;
		break;
	case SO_SND:
		sb = &so->so_snd;
		break;
	default:
		panic("soupcall_set: bad which");
	}
	SOCKBUF_LOCK_ASSERT(sb);
	sb->sb_upcall = func;
	sb->sb_upcallarg = arg;
	sb->sb_flags |= SB_UPCALL;
}

void
soupcall_clear(struct socket *so, int which)
{
	struct sockbuf *sb;

	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

	switch (which) {
	case SO_RCV:
		sb = &so->so_rcv;
		break;
	case SO_SND:
		sb = &so->so_snd;
		break;
	default:
		panic("soupcall_clear: bad which");
	}
	SOCKBUF_LOCK_ASSERT(sb);
	KASSERT(sb->sb_upcall != NULL,
	    ("%s: so %p no upcall to clear", __func__, so));
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~SB_UPCALL;
}

void
solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
{

	SOLISTEN_LOCK_ASSERT(so);
	so->sol_upcall = func;
	so->sol_upcallarg = arg;
}

static void
so_rdknl_lock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_LOCK(so);
	else
		SOCKBUF_LOCK(&so->so_rcv);
}

static void
so_rdknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_UNLOCK(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

static void
so_rdknl_assert_lock(void *arg, int what)
{
	struct socket *so = arg;

	if (what == LA_LOCKED) {
		if (SOLISTENING(so))
			SOCK_LOCK_ASSERT(so);
		else
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	} else {
		if (SOLISTENING(so))
			SOCK_UNLOCK_ASSERT(so);
		else
			SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
	}
}

static void
so_wrknl_lock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_LOCK(so);
	else
		SOCKBUF_LOCK(&so->so_snd);
}

static void
so_wrknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_UNLOCK(so);
	else
		SOCKBUF_UNLOCK(&so->so_snd);
}

static void
so_wrknl_assert_lock(void *arg, int what)
{
	struct socket *so = arg;

	if (what == LA_LOCKED) {
		if (SOLISTENING(so))
			SOCK_LOCK_ASSERT(so);
		else
			SOCKBUF_LOCK_ASSERT(&so->so_snd);
	} else {
		if (SOLISTENING(so))
			SOCK_UNLOCK_ASSERT(so);
		else
			SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
	}
}
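
/*
 * The so_rdknl_*() and so_wrknl_*() helpers above select between the socket
 * lock (for listening sockets) and the corresponding socket-buffer lock, so
 * that the kqueue filters always run under whichever lock currently protects
 * the knote list.  As a sketch of how they are expected to be wired up,
 * socket allocation elsewhere in this file would pass them as knlist lock
 * callbacks along the lines of:
 *
 *	knlist_init(&so->so_rdsel.si_note, so,
 *	    so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock);
 *	knlist_init(&so->so_wrsel.si_note, so,
 *	    so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock);
 */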

/*
 * Create an external-format (``xsocket'') structure using the information in
 * the kernel-format socket structure pointed to by so.  This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{

	bzero(xso, sizeof(*xso));
	xso->xso_len = sizeof *xso;
	xso->xso_so = (uintptr_t)so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (uintptr_t)so->so_pcb;
	xso->xso_protocol = so->so_proto->pr_protocol;
	xso->xso_family = so->so_proto->pr_domain->dom_family;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_uid = so->so_cred->cr_uid;
	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
	if (SOLISTENING(so)) {
		xso->so_qlen = so->sol_qlen;
		xso->so_incqlen = so->sol_incqlen;
		xso->so_qlimit = so->sol_qlimit;
		xso->so_oobmark = 0;
	} else {
		xso->so_state |= so->so_qstate;
		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
		xso->so_oobmark = so->so_oobmark;
		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	}
}

struct sockbuf *
so_sockbuf_rcv(struct socket *so)
{

	return (&so->so_rcv);
}

struct sockbuf *
so_sockbuf_snd(struct socket *so)
{

	return (&so->so_snd);
}

int
so_state_get(const struct socket *so)
{

	return (so->so_state);
}

void
so_state_set(struct socket *so, int val)
{

	so->so_state = val;
}

int
so_options_get(const struct socket *so)
{

	return (so->so_options);
}

void
so_options_set(struct socket *so, int val)
{

	so->so_options = val;
}

int
so_error_get(const struct socket *so)
{

	return (so->so_error);
}

void
so_error_set(struct socket *so, int val)
{

	so->so_error = val;
}

int
so_linger_get(const struct socket *so)
{

	return (so->so_linger);
}

void
so_linger_set(struct socket *so, int val)
{

	KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
	    ("%s: val %d out of range", __func__, val));

	so->so_linger = val;
}

struct protosw *
so_protosw_get(const struct socket *so)
{

	return (so->so_proto);
}

void
so_protosw_set(struct socket *so, struct protosw *val)
{

	so->so_proto = val;
}

void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}

void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}

void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}

void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}

void
so_lock(struct socket *so)
{

	SOCK_LOCK(so);
}

void
so_unlock(struct socket *so)
{

	SOCK_UNLOCK(so);
}
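
/*
 * The so_*_get()/so_*_set() accessors and the wakeup/lock wrappers above are
 * thin function wrappers around the corresponding fields and macros; they
 * appear intended for code that treats struct socket as opaque rather than
 * dereferencing it directly.  As a purely illustrative sketch, such a caller
 * might do:
 *
 *	if (so_state_get(so) & SS_ISCONNECTED)
 *		so_sorwakeup(so);
 */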