/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn(). Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn(). Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation. This is called
 * from socreate() and sonewconn(). Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called. If pru_attach() returned an error,
 * pru_detach() will not be called. Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection. Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state. This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state. This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected). This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required. Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation. This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the public interface to attempt to
 * free a socket when a reference is removed. This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment. For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs, the general rule is that callers do not set
 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
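 *
 * As an illustration of the curvnet rule (not a prescription), the public
 * entry points in this file that may be reached without a VNET context
 * bracket their protocol call themselves, e.g. sobind():
 *
 *	CURVNET_SET(so->so_vnet);
 *	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 *	CURVNET_RESTORE();
 *
 * while the exceptions listed above are expected to be entered with
 * curvnet already established by the caller.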
103 */ 104 105 #include <sys/cdefs.h> 106 __FBSDID("$FreeBSD$"); 107 108 #include "opt_inet.h" 109 #include "opt_inet6.h" 110 #include "opt_kern_tls.h" 111 #include "opt_sctp.h" 112 113 #include <sys/param.h> 114 #include <sys/systm.h> 115 #include <sys/fcntl.h> 116 #include <sys/limits.h> 117 #include <sys/lock.h> 118 #include <sys/mac.h> 119 #include <sys/malloc.h> 120 #include <sys/mbuf.h> 121 #include <sys/mutex.h> 122 #include <sys/domain.h> 123 #include <sys/file.h> /* for struct knote */ 124 #include <sys/hhook.h> 125 #include <sys/kernel.h> 126 #include <sys/khelp.h> 127 #include <sys/ktls.h> 128 #include <sys/event.h> 129 #include <sys/eventhandler.h> 130 #include <sys/poll.h> 131 #include <sys/proc.h> 132 #include <sys/protosw.h> 133 #include <sys/socket.h> 134 #include <sys/socketvar.h> 135 #include <sys/resourcevar.h> 136 #include <net/route.h> 137 #include <sys/signalvar.h> 138 #include <sys/stat.h> 139 #include <sys/sx.h> 140 #include <sys/sysctl.h> 141 #include <sys/taskqueue.h> 142 #include <sys/uio.h> 143 #include <sys/jail.h> 144 #include <sys/syslog.h> 145 #include <netinet/in.h> 146 #include <netinet/tcp.h> 147 148 #include <net/vnet.h> 149 150 #include <security/mac/mac_framework.h> 151 152 #include <vm/uma.h> 153 154 #ifdef COMPAT_FREEBSD32 155 #include <sys/mount.h> 156 #include <sys/sysent.h> 157 #include <compat/freebsd32/freebsd32.h> 158 #endif 159 160 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 161 int flags); 162 static void so_rdknl_lock(void *); 163 static void so_rdknl_unlock(void *); 164 static void so_rdknl_assert_locked(void *); 165 static void so_rdknl_assert_unlocked(void *); 166 static void so_wrknl_lock(void *); 167 static void so_wrknl_unlock(void *); 168 static void so_wrknl_assert_locked(void *); 169 static void so_wrknl_assert_unlocked(void *); 170 171 static void filt_sordetach(struct knote *kn); 172 static int filt_soread(struct knote *kn, long hint); 173 static void filt_sowdetach(struct knote *kn); 174 static int filt_sowrite(struct knote *kn, long hint); 175 static int filt_soempty(struct knote *kn, long hint); 176 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); 177 fo_kqfilter_t soo_kqfilter; 178 179 static struct filterops soread_filtops = { 180 .f_isfd = 1, 181 .f_detach = filt_sordetach, 182 .f_event = filt_soread, 183 }; 184 static struct filterops sowrite_filtops = { 185 .f_isfd = 1, 186 .f_detach = filt_sowdetach, 187 .f_event = filt_sowrite, 188 }; 189 static struct filterops soempty_filtops = { 190 .f_isfd = 1, 191 .f_detach = filt_sowdetach, 192 .f_event = filt_soempty, 193 }; 194 195 so_gen_t so_gencnt; /* generation count for sockets */ 196 197 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 198 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 199 200 #define VNET_SO_ASSERT(so) \ 201 VNET_ASSERT(curvnet != NULL, \ 202 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 203 204 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 205 #define V_socket_hhh VNET(socket_hhh) 206 207 /* 208 * Limit on the number of connections in the listen queue waiting 209 * for accept(2). 210 * NB: The original sysctl somaxconn is still available but hidden 211 * to prevent confusion about the actual purpose of this number. 
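 *
 * The sysctl handler below also bounds the value at UINT_MAX / 3 so that
 * the listen queue overflow test in sonewconn(), 3 * sol_qlimit / 2,
 * cannot overflow an unsigned int. For example, with a 32-bit u_int,
 * UINT_MAX / 3 is 1431655765; even at that maximum, 3 * 1431655765 is
 * 4294967295, which still fits in u_int and halves to 2147483647.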
212 */ 213 static u_int somaxconn = SOMAXCONN; 214 215 static int 216 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 217 { 218 int error; 219 int val; 220 221 val = somaxconn; 222 error = sysctl_handle_int(oidp, &val, 0, req); 223 if (error || !req->newptr ) 224 return (error); 225 226 /* 227 * The purpose of the UINT_MAX / 3 limit, is so that the formula 228 * 3 * so_qlimit / 2 229 * below, will not overflow. 230 */ 231 232 if (val < 1 || val > UINT_MAX / 3) 233 return (EINVAL); 234 235 somaxconn = val; 236 return (0); 237 } 238 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW, 239 0, sizeof(int), sysctl_somaxconn, "I", 240 "Maximum listen socket pending connection accept queue size"); 241 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 242 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP, 243 0, sizeof(int), sysctl_somaxconn, "I", 244 "Maximum listen socket pending connection accept queue size (compat)"); 245 246 static int numopensockets; 247 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 248 &numopensockets, 0, "Number of open sockets"); 249 250 /* 251 * accept_mtx locks down per-socket fields relating to accept queues. See 252 * socketvar.h for an annotation of the protected fields of struct socket. 253 */ 254 struct mtx accept_mtx; 255 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 256 257 /* 258 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 259 * so_gencnt field. 260 */ 261 static struct mtx so_global_mtx; 262 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 263 264 /* 265 * General IPC sysctl name space, used by sockets and a variety of other IPC 266 * types. 267 */ 268 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); 269 270 /* 271 * Initialize the socket subsystem and set up the socket 272 * memory allocator. 
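 *
 * The kern.ipc.maxsockets limit is enforced on the UMA zone backing all
 * struct socket allocations; socket_zone_change() below simply reapplies
 * it whenever the maxsockets_change event fires:
 *
 *	maxsockets = uma_zone_set_max(socket_zone, maxsockets);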
273 */ 274 static uma_zone_t socket_zone; 275 int maxsockets; 276 277 static void 278 socket_zone_change(void *tag) 279 { 280 281 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 282 } 283 284 static void 285 socket_hhook_register(int subtype) 286 { 287 288 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 289 &V_socket_hhh[subtype], 290 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 291 printf("%s: WARNING: unable to register hook\n", __func__); 292 } 293 294 static void 295 socket_hhook_deregister(int subtype) 296 { 297 298 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 299 printf("%s: WARNING: unable to deregister hook\n", __func__); 300 } 301 302 static void 303 socket_init(void *tag) 304 { 305 306 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 307 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 308 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 309 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 310 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 311 EVENTHANDLER_PRI_FIRST); 312 } 313 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 314 315 static void 316 socket_vnet_init(const void *unused __unused) 317 { 318 int i; 319 320 /* We expect a contiguous range */ 321 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 322 socket_hhook_register(i); 323 } 324 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 325 socket_vnet_init, NULL); 326 327 static void 328 socket_vnet_uninit(const void *unused __unused) 329 { 330 int i; 331 332 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 333 socket_hhook_deregister(i); 334 } 335 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 336 socket_vnet_uninit, NULL); 337 338 /* 339 * Initialise maxsockets. This SYSINIT must be run after 340 * tunable_mbinit(). 341 */ 342 static void 343 init_maxsockets(void *ignored) 344 { 345 346 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 347 maxsockets = imax(maxsockets, maxfiles); 348 } 349 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 350 351 /* 352 * Sysctl to get and set the maximum global sockets limit. Notify protocols 353 * of the change so that they can update their dependent limits as required. 354 */ 355 static int 356 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 357 { 358 int error, newmaxsockets; 359 360 newmaxsockets = maxsockets; 361 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 362 if (error == 0 && req->newptr) { 363 if (newmaxsockets > maxsockets && 364 newmaxsockets <= maxfiles) { 365 maxsockets = newmaxsockets; 366 EVENTHANDLER_INVOKE(maxsockets_change); 367 } else 368 error = EINVAL; 369 } 370 return (error); 371 } 372 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, 373 &maxsockets, 0, sysctl_maxsockets, "IU", 374 "Maximum number of sockets available"); 375 376 /* 377 * Socket operation routines. These routines are called by the routines in 378 * sys_socket.c or from a system process, and implement the semantics of 379 * socket operations by switching out to the protocol specific routines. 380 */ 381 382 /* 383 * Get a socket structure from our zone, and initialize it. Note that it 384 * would probably be better to allocate socket and PCB at the same time, but 385 * I'm not convinced that all the protocols can be easily modified to do 386 * this. 387 * 388 * soalloc() returns a socket with a ref count of 0. 
389 */ 390 static struct socket * 391 soalloc(struct vnet *vnet) 392 { 393 struct socket *so; 394 395 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 396 if (so == NULL) 397 return (NULL); 398 #ifdef MAC 399 if (mac_socket_init(so, M_NOWAIT) != 0) { 400 uma_zfree(socket_zone, so); 401 return (NULL); 402 } 403 #endif 404 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { 405 uma_zfree(socket_zone, so); 406 return (NULL); 407 } 408 409 /* 410 * The socket locking protocol allows to lock 2 sockets at a time, 411 * however, the first one must be a listening socket. WITNESS lacks 412 * a feature to change class of an existing lock, so we use DUPOK. 413 */ 414 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); 415 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); 416 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); 417 so->so_rcv.sb_sel = &so->so_rdsel; 418 so->so_snd.sb_sel = &so->so_wrsel; 419 sx_init(&so->so_snd.sb_sx, "so_snd_sx"); 420 sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); 421 TAILQ_INIT(&so->so_snd.sb_aiojobq); 422 TAILQ_INIT(&so->so_rcv.sb_aiojobq); 423 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); 424 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); 425 #ifdef VIMAGE 426 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 427 __func__, __LINE__, so)); 428 so->so_vnet = vnet; 429 #endif 430 /* We shouldn't need the so_global_mtx */ 431 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { 432 /* Do we need more comprehensive error returns? */ 433 uma_zfree(socket_zone, so); 434 return (NULL); 435 } 436 mtx_lock(&so_global_mtx); 437 so->so_gencnt = ++so_gencnt; 438 ++numopensockets; 439 #ifdef VIMAGE 440 vnet->vnet_sockcnt++; 441 #endif 442 mtx_unlock(&so_global_mtx); 443 444 return (so); 445 } 446 447 /* 448 * Free the storage associated with a socket at the socket layer, tear down 449 * locks, labels, etc. All protocol state is assumed already to have been 450 * torn down (and possibly never set up) by the caller. 451 */ 452 static void 453 sodealloc(struct socket *so) 454 { 455 456 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 457 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 458 459 mtx_lock(&so_global_mtx); 460 so->so_gencnt = ++so_gencnt; 461 --numopensockets; /* Could be below, but faster here. */ 462 #ifdef VIMAGE 463 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 464 __func__, __LINE__, so)); 465 so->so_vnet->vnet_sockcnt--; 466 #endif 467 mtx_unlock(&so_global_mtx); 468 #ifdef MAC 469 mac_socket_destroy(so); 470 #endif 471 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); 472 473 crfree(so->so_cred); 474 khelp_destroy_osd(&so->osd); 475 if (SOLISTENING(so)) { 476 if (so->sol_accept_filter != NULL) 477 accept_filt_setopt(so, NULL); 478 } else { 479 if (so->so_rcv.sb_hiwat) 480 (void)chgsbsize(so->so_cred->cr_uidinfo, 481 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 482 if (so->so_snd.sb_hiwat) 483 (void)chgsbsize(so->so_cred->cr_uidinfo, 484 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 485 sx_destroy(&so->so_snd.sb_sx); 486 sx_destroy(&so->so_rcv.sb_sx); 487 SOCKBUF_LOCK_DESTROY(&so->so_snd); 488 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 489 } 490 mtx_destroy(&so->so_lock); 491 uma_zfree(socket_zone, so); 492 } 493 494 /* 495 * socreate returns a socket with a ref count of 1. The socket should be 496 * closed with soclose(). 
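 *
 * A minimal sketch of in-kernel consumer usage (illustrative only; the
 * intermediate calls and their error handling are elided):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... sobind(), soconnect(), sosend(), soreceive() ...
 *	soclose(so);		-- releases the reference socreate() granted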
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL) {
		/* No support for domain. */
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		/* No support for socket type. */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		sodealloc(so);
		return (error);
	}
	soref(so);
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called. If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
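 *
 * A rough sketch of the protocol-side pattern (illustrative only; when and
 * how the connection is completed is protocol specific):
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		goto drop;		-- queue overflow or out of memory
 *	... set up protocol state for so ...
 *	soisconnected(so);		-- later moves so to the complete queue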
579 */ 580 struct socket * 581 sonewconn(struct socket *head, int connstatus) 582 { 583 static struct timeval lastover; 584 static struct timeval overinterval = { 60, 0 }; 585 static int overcount; 586 587 struct socket *so; 588 u_int over; 589 590 SOLISTEN_LOCK(head); 591 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 592 SOLISTEN_UNLOCK(head); 593 #ifdef REGRESSION 594 if (regression_sonewconn_earlytest && over) { 595 #else 596 if (over) { 597 #endif 598 overcount++; 599 600 if (ratecheck(&lastover, &overinterval)) { 601 log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " 602 "%i already in queue awaiting acceptance " 603 "(%d occurrences)\n", 604 __func__, head->so_pcb, head->sol_qlen, overcount); 605 606 overcount = 0; 607 } 608 609 return (NULL); 610 } 611 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 612 __func__, head)); 613 so = soalloc(head->so_vnet); 614 if (so == NULL) { 615 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 616 "limit reached or out of memory\n", 617 __func__, head->so_pcb); 618 return (NULL); 619 } 620 so->so_listen = head; 621 so->so_type = head->so_type; 622 so->so_linger = head->so_linger; 623 so->so_state = head->so_state | SS_NOFDREF; 624 so->so_fibnum = head->so_fibnum; 625 so->so_proto = head->so_proto; 626 so->so_cred = crhold(head->so_cred); 627 #ifdef MAC 628 mac_socket_newconn(head, so); 629 #endif 630 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 631 so_rdknl_assert_locked, so_rdknl_assert_unlocked); 632 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 633 so_wrknl_assert_locked, so_wrknl_assert_unlocked); 634 VNET_SO_ASSERT(head); 635 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { 636 sodealloc(so); 637 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 638 __func__, head->so_pcb); 639 return (NULL); 640 } 641 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 642 sodealloc(so); 643 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 644 __func__, head->so_pcb); 645 return (NULL); 646 } 647 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; 648 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 649 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 650 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 651 so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; 652 so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; 653 654 SOLISTEN_LOCK(head); 655 if (head->sol_accept_filter != NULL) 656 connstatus = 0; 657 so->so_state |= connstatus; 658 so->so_options = head->so_options & ~SO_ACCEPTCONN; 659 soref(head); /* A socket on (in)complete queue refs head. */ 660 if (connstatus) { 661 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 662 so->so_qstate = SQ_COMP; 663 head->sol_qlen++; 664 solisten_wakeup(head); /* unlocks */ 665 } else { 666 /* 667 * Keep removing sockets from the head until there's room for 668 * us to insert on the tail. In pre-locking revisions, this 669 * was a simple if(), but as we could be racing with other 670 * threads and soabort() requires dropping locks, we must 671 * loop waiting for the condition to be true. 
672 */ 673 while (head->sol_incqlen > head->sol_qlimit) { 674 struct socket *sp; 675 676 sp = TAILQ_FIRST(&head->sol_incomp); 677 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 678 head->sol_incqlen--; 679 SOCK_LOCK(sp); 680 sp->so_qstate = SQ_NONE; 681 sp->so_listen = NULL; 682 SOCK_UNLOCK(sp); 683 sorele(head); /* does SOLISTEN_UNLOCK, head stays */ 684 soabort(sp); 685 SOLISTEN_LOCK(head); 686 } 687 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 688 so->so_qstate = SQ_INCOMP; 689 head->sol_incqlen++; 690 SOLISTEN_UNLOCK(head); 691 } 692 return (so); 693 } 694 695 #ifdef SCTP 696 /* 697 * Socket part of sctp_peeloff(). Detach a new socket from an 698 * association. The new socket is returned with a reference. 699 */ 700 struct socket * 701 sopeeloff(struct socket *head) 702 { 703 struct socket *so; 704 705 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 706 __func__, __LINE__, head)); 707 so = soalloc(head->so_vnet); 708 if (so == NULL) { 709 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 710 "limit reached or out of memory\n", 711 __func__, head->so_pcb); 712 return (NULL); 713 } 714 so->so_type = head->so_type; 715 so->so_options = head->so_options; 716 so->so_linger = head->so_linger; 717 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 718 so->so_fibnum = head->so_fibnum; 719 so->so_proto = head->so_proto; 720 so->so_cred = crhold(head->so_cred); 721 #ifdef MAC 722 mac_socket_newconn(head, so); 723 #endif 724 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 725 so_rdknl_assert_locked, so_rdknl_assert_unlocked); 726 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 727 so_wrknl_assert_locked, so_wrknl_assert_unlocked); 728 VNET_SO_ASSERT(head); 729 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 730 sodealloc(so); 731 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 732 __func__, head->so_pcb); 733 return (NULL); 734 } 735 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 736 sodealloc(so); 737 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 738 __func__, head->so_pcb); 739 return (NULL); 740 } 741 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 742 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 743 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 744 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 745 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 746 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 747 748 soref(so); 749 750 return (so); 751 } 752 #endif /* SCTP */ 753 754 int 755 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 756 { 757 int error; 758 759 CURVNET_SET(so->so_vnet); 760 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); 761 CURVNET_RESTORE(); 762 return (error); 763 } 764 765 int 766 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 767 { 768 int error; 769 770 CURVNET_SET(so->so_vnet); 771 error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); 772 CURVNET_RESTORE(); 773 return (error); 774 } 775 776 /* 777 * solisten() transitions a socket from a non-listening state to a listening 778 * state, but can also be used to update the listen queue depth on an 779 * existing listen socket. The protocol will call back into the sockets 780 * layer using solisten_proto_check() and solisten_proto() to check and set 781 * socket-layer listen state. 
Call backs are used so that the protocol can 782 * acquire both protocol and socket layer locks in whatever order is required 783 * by the protocol. 784 * 785 * Protocol implementors are advised to hold the socket lock across the 786 * socket-layer test and set to avoid races at the socket layer. 787 */ 788 int 789 solisten(struct socket *so, int backlog, struct thread *td) 790 { 791 int error; 792 793 CURVNET_SET(so->so_vnet); 794 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); 795 CURVNET_RESTORE(); 796 return (error); 797 } 798 799 int 800 solisten_proto_check(struct socket *so) 801 { 802 803 SOCK_LOCK_ASSERT(so); 804 805 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 806 SS_ISDISCONNECTING)) 807 return (EINVAL); 808 return (0); 809 } 810 811 void 812 solisten_proto(struct socket *so, int backlog) 813 { 814 int sbrcv_lowat, sbsnd_lowat; 815 u_int sbrcv_hiwat, sbsnd_hiwat; 816 short sbrcv_flags, sbsnd_flags; 817 sbintime_t sbrcv_timeo, sbsnd_timeo; 818 819 SOCK_LOCK_ASSERT(so); 820 821 if (SOLISTENING(so)) 822 goto listening; 823 824 /* 825 * Change this socket to listening state. 826 */ 827 sbrcv_lowat = so->so_rcv.sb_lowat; 828 sbsnd_lowat = so->so_snd.sb_lowat; 829 sbrcv_hiwat = so->so_rcv.sb_hiwat; 830 sbsnd_hiwat = so->so_snd.sb_hiwat; 831 sbrcv_flags = so->so_rcv.sb_flags; 832 sbsnd_flags = so->so_snd.sb_flags; 833 sbrcv_timeo = so->so_rcv.sb_timeo; 834 sbsnd_timeo = so->so_snd.sb_timeo; 835 836 sbdestroy(&so->so_snd, so); 837 sbdestroy(&so->so_rcv, so); 838 sx_destroy(&so->so_snd.sb_sx); 839 sx_destroy(&so->so_rcv.sb_sx); 840 SOCKBUF_LOCK_DESTROY(&so->so_snd); 841 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 842 843 #ifdef INVARIANTS 844 bzero(&so->so_rcv, 845 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 846 #endif 847 848 so->sol_sbrcv_lowat = sbrcv_lowat; 849 so->sol_sbsnd_lowat = sbsnd_lowat; 850 so->sol_sbrcv_hiwat = sbrcv_hiwat; 851 so->sol_sbsnd_hiwat = sbsnd_hiwat; 852 so->sol_sbrcv_flags = sbrcv_flags; 853 so->sol_sbsnd_flags = sbsnd_flags; 854 so->sol_sbrcv_timeo = sbrcv_timeo; 855 so->sol_sbsnd_timeo = sbsnd_timeo; 856 857 so->sol_qlen = so->sol_incqlen = 0; 858 TAILQ_INIT(&so->sol_incomp); 859 TAILQ_INIT(&so->sol_comp); 860 861 so->sol_accept_filter = NULL; 862 so->sol_accept_filter_arg = NULL; 863 so->sol_accept_filter_str = NULL; 864 865 so->sol_upcall = NULL; 866 so->sol_upcallarg = NULL; 867 868 so->so_options |= SO_ACCEPTCONN; 869 870 listening: 871 if (backlog < 0 || backlog > somaxconn) 872 backlog = somaxconn; 873 so->sol_qlimit = backlog; 874 } 875 876 /* 877 * Wakeup listeners/subsystems once we have a complete connection. 878 * Enters with lock, returns unlocked. 879 */ 880 void 881 solisten_wakeup(struct socket *sol) 882 { 883 884 if (sol->sol_upcall != NULL) 885 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 886 else { 887 selwakeuppri(&sol->so_rdsel, PSOCK); 888 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 889 } 890 SOLISTEN_UNLOCK(sol); 891 wakeup_one(&sol->sol_comp); 892 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) 893 pgsigio(&sol->so_sigio, SIGIO, 0); 894 } 895 896 /* 897 * Return single connection off a listening socket queue. Main consumer of 898 * the function is kern_accept4(). Some modules, that do their own accept 899 * management also use the function. 900 * 901 * Listening socket must be locked on entry and is returned unlocked on 902 * return. 903 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 
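 *
 * A condensed sketch of the pattern used by accept(2)-style consumers
 * (illustrative; see kern_accept4() for the real sequence):
 *
 *	SOLISTEN_LOCK(head);
 *	error = solisten_dequeue(head, &so, flags);	-- returns unlocked
 *	if (error != 0)
 *		return (error);
 *	error = soaccept(so, &sa);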
 */
int
solisten_dequeue(struct socket *head, struct socket **ret, int flags)
{
	struct socket *so;
	int error;

	SOLISTEN_LOCK_ASSERT(head);

	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
	    head->so_error == 0) {
		error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			SOLISTEN_UNLOCK(head);
			return (error);
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
		error = EWOULDBLOCK;
	else
		error = 0;
	if (error) {
		SOLISTEN_UNLOCK(head);
		return (error);
	}
	so = TAILQ_FIRST(&head->sol_comp);
	SOCK_LOCK(so);
	KASSERT(so->so_qstate == SQ_COMP,
	    ("%s: so %p not SQ_COMP", __func__, so));
	soref(so);
	head->sol_qlen--;
	so->so_qstate = SQ_NONE;
	so->so_listen = NULL;
	TAILQ_REMOVE(&head->sol_comp, so, so_list);
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	SOCK_UNLOCK(so);
	sorele(head);

	*ret = so;
	return (0);
}

/*
 * Evaluate the reference count and named references on a socket; if no
 * references remain, free it. This should be called whenever a reference is
 * released, such as in sorele(), but also when named reference flags are
 * cleared in socket or protocol code.
 *
 * sofree() will free the socket if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, where a process may
 *   already have been notified that it is present. If it were removed, the
 *   user process could block in accept() despite select() saying the socket
 *   was ready.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;

	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
		SOCK_UNLOCK(so);
		return;
	}

	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
		struct socket *sol;

		sol = so->so_listen;
		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));

		/*
		 * To solve the race between close of a listening socket and
		 * a socket on its incomplete queue, we need to lock both.
		 * The order is first listening socket, then regular.
		 * Since we have neither SS_NOFDREF nor SS_PROTOREF, this
		 * function and the listening socket are the only pointers
		 * to so. To preserve so and sol, we reference both and then
		 * relock.
		 * After relock the socket may not move to so_comp since it
		 * doesn't have a PCB already, but it may be removed from
		 * so_incomp. If that happens, we share responsibility for
		 * freeing the socket, but soclose() has already removed
		 * it from the queue.
		 */
		soref(sol);
		soref(so);
		SOCK_UNLOCK(so);
		SOLISTEN_LOCK(sol);
		SOCK_LOCK(so);
		if (so->so_qstate == SQ_INCOMP) {
			KASSERT(so->so_listen == sol,
			    ("%s: so %p migrated out of sol %p",
			    __func__, so, sol));
			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
			sol->sol_incqlen--;
			/* This is guaranteed not to be the last. */
			refcount_release(&sol->so_count);
			so->so_qstate = SQ_NONE;
			so->so_listen = NULL;
		} else
			KASSERT(so->so_listen == NULL,
			    ("%s: so %p not on (in)comp with so_listen",
			    __func__, so));
		sorele(sol);
		KASSERT(so->so_count == 1,
		    ("%s: so %p count %u", __func__, so, so->so_count));
		so->so_count = 0;
	}
	if (SOLISTENING(so))
		so->so_error = ECONNABORTED;
	SOCK_UNLOCK(so);

	if (so->so_dtor != NULL)
		so->so_dtor(so);

	VNET_SO_ASSERT(so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack. Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups. The direct calls to
	 * dom_dispose() and sbdestroy() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach. This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	if (!SOLISTENING(so)) {
		sbdestroy(&so->so_snd, so);
		sbdestroy(&so->so_rcv, so);
	}
	seldrain(&so->so_rdsel);
	seldrain(&so->so_wrsel);
	knlist_destroy(&so->so_rdsel.si_note);
	knlist_destroy(&so->so_wrsel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal. Initiate disconnect
 * if connected. Free socket when disconnect complete.
 *
 * This function will sorele() the socket. Note that soclose() may be called
 * prior to the ref count reaching zero. The actual socket structure will
 * not be freed until the ref count reaches zero.
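 *
 * A small illustration of the reference rules (not exhaustive): the file
 * descriptor installed by socket(2) holds the reference granted by
 * socreate(), and soclose() gives it back. A subsystem that needs the
 * socket to survive beyond a dropped lock may take its own reference:
 *
 *	soref(so);		-- socket now cannot be freed under us
 *	... work that may race with soclose() ...
 *	SOCK_LOCK(so);
 *	sorele(so);		-- drops the lock, may end up in sofree()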
1075 */ 1076 int 1077 soclose(struct socket *so) 1078 { 1079 struct accept_queue lqueue; 1080 bool listening; 1081 int error = 0; 1082 1083 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); 1084 1085 CURVNET_SET(so->so_vnet); 1086 funsetown(&so->so_sigio); 1087 if (so->so_state & SS_ISCONNECTED) { 1088 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1089 error = sodisconnect(so); 1090 if (error) { 1091 if (error == ENOTCONN) 1092 error = 0; 1093 goto drop; 1094 } 1095 } 1096 if (so->so_options & SO_LINGER) { 1097 if ((so->so_state & SS_ISDISCONNECTING) && 1098 (so->so_state & SS_NBIO)) 1099 goto drop; 1100 while (so->so_state & SS_ISCONNECTED) { 1101 error = tsleep(&so->so_timeo, 1102 PSOCK | PCATCH, "soclos", 1103 so->so_linger * hz); 1104 if (error) 1105 break; 1106 } 1107 } 1108 } 1109 1110 drop: 1111 if (so->so_proto->pr_usrreqs->pru_close != NULL) 1112 (*so->so_proto->pr_usrreqs->pru_close)(so); 1113 1114 SOCK_LOCK(so); 1115 if ((listening = (so->so_options & SO_ACCEPTCONN))) { 1116 struct socket *sp; 1117 1118 TAILQ_INIT(&lqueue); 1119 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 1120 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 1121 1122 so->sol_qlen = so->sol_incqlen = 0; 1123 1124 TAILQ_FOREACH(sp, &lqueue, so_list) { 1125 SOCK_LOCK(sp); 1126 sp->so_qstate = SQ_NONE; 1127 sp->so_listen = NULL; 1128 SOCK_UNLOCK(sp); 1129 /* Guaranteed not to be the last. */ 1130 refcount_release(&so->so_count); 1131 } 1132 } 1133 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 1134 so->so_state |= SS_NOFDREF; 1135 sorele(so); 1136 if (listening) { 1137 struct socket *sp, *tsp; 1138 1139 TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { 1140 SOCK_LOCK(sp); 1141 if (sp->so_count == 0) { 1142 SOCK_UNLOCK(sp); 1143 soabort(sp); 1144 } else 1145 /* sp is now in sofree() */ 1146 SOCK_UNLOCK(sp); 1147 } 1148 } 1149 CURVNET_RESTORE(); 1150 return (error); 1151 } 1152 1153 /* 1154 * soabort() is used to abruptly tear down a connection, such as when a 1155 * resource limit is reached (listen queue depth exceeded), or if a listen 1156 * socket is closed while there are sockets waiting to be accepted. 1157 * 1158 * This interface is tricky, because it is called on an unreferenced socket, 1159 * and must be called only by a thread that has actually removed the socket 1160 * from the listen queue it was on, or races with other threads are risked. 1161 * 1162 * This interface will call into the protocol code, so must not be called 1163 * with any socket locks held. Protocols do call it while holding their own 1164 * recursible protocol mutexes, but this is something that should be subject 1165 * to review in the future. 1166 */ 1167 void 1168 soabort(struct socket *so) 1169 { 1170 1171 /* 1172 * In as much as is possible, assert that no references to this 1173 * socket are held. This is not quite the same as asserting that the 1174 * current thread is responsible for arranging for no references, but 1175 * is as close as we can get for now. 
1176 */ 1177 KASSERT(so->so_count == 0, ("soabort: so_count")); 1178 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 1179 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 1180 VNET_SO_ASSERT(so); 1181 1182 if (so->so_proto->pr_usrreqs->pru_abort != NULL) 1183 (*so->so_proto->pr_usrreqs->pru_abort)(so); 1184 SOCK_LOCK(so); 1185 sofree(so); 1186 } 1187 1188 int 1189 soaccept(struct socket *so, struct sockaddr **nam) 1190 { 1191 int error; 1192 1193 SOCK_LOCK(so); 1194 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 1195 so->so_state &= ~SS_NOFDREF; 1196 SOCK_UNLOCK(so); 1197 1198 CURVNET_SET(so->so_vnet); 1199 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 1200 CURVNET_RESTORE(); 1201 return (error); 1202 } 1203 1204 int 1205 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 1206 { 1207 1208 return (soconnectat(AT_FDCWD, so, nam, td)); 1209 } 1210 1211 int 1212 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 1213 { 1214 int error; 1215 1216 if (so->so_options & SO_ACCEPTCONN) 1217 return (EOPNOTSUPP); 1218 1219 CURVNET_SET(so->so_vnet); 1220 /* 1221 * If protocol is connection-based, can only connect once. 1222 * Otherwise, if connected, try to disconnect first. This allows 1223 * user to disconnect by connecting to, e.g., a null address. 1224 */ 1225 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 1226 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 1227 (error = sodisconnect(so)))) { 1228 error = EISCONN; 1229 } else { 1230 /* 1231 * Prevent accumulated error from previous connection from 1232 * biting us. 1233 */ 1234 so->so_error = 0; 1235 if (fd == AT_FDCWD) { 1236 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, 1237 nam, td); 1238 } else { 1239 error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, 1240 so, nam, td); 1241 } 1242 } 1243 CURVNET_RESTORE(); 1244 1245 return (error); 1246 } 1247 1248 int 1249 soconnect2(struct socket *so1, struct socket *so2) 1250 { 1251 int error; 1252 1253 CURVNET_SET(so1->so_vnet); 1254 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); 1255 CURVNET_RESTORE(); 1256 return (error); 1257 } 1258 1259 int 1260 sodisconnect(struct socket *so) 1261 { 1262 int error; 1263 1264 if ((so->so_state & SS_ISCONNECTED) == 0) 1265 return (ENOTCONN); 1266 if (so->so_state & SS_ISDISCONNECTING) 1267 return (EALREADY); 1268 VNET_SO_ASSERT(so); 1269 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 1270 return (error); 1271 } 1272 1273 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 1274 1275 int 1276 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 1277 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1278 { 1279 long space; 1280 ssize_t resid; 1281 int clen = 0, error, dontroute; 1282 1283 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 1284 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 1285 ("sosend_dgram: !PR_ATOMIC")); 1286 1287 if (uio != NULL) 1288 resid = uio->uio_resid; 1289 else 1290 resid = top->m_pkthdr.len; 1291 /* 1292 * In theory resid should be unsigned. However, space must be 1293 * signed, as it might be less than 0 if we over-committed, and we 1294 * must use a signed comparison of space and resid. On the other 1295 * hand, a negative resid causes us to loop sending 0-length 1296 * segments to the protocol. 
1297 */ 1298 if (resid < 0) { 1299 error = EINVAL; 1300 goto out; 1301 } 1302 1303 dontroute = 1304 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 1305 if (td != NULL) 1306 td->td_ru.ru_msgsnd++; 1307 if (control != NULL) 1308 clen = control->m_len; 1309 1310 SOCKBUF_LOCK(&so->so_snd); 1311 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1312 SOCKBUF_UNLOCK(&so->so_snd); 1313 error = EPIPE; 1314 goto out; 1315 } 1316 if (so->so_error) { 1317 error = so->so_error; 1318 so->so_error = 0; 1319 SOCKBUF_UNLOCK(&so->so_snd); 1320 goto out; 1321 } 1322 if ((so->so_state & SS_ISCONNECTED) == 0) { 1323 /* 1324 * `sendto' and `sendmsg' is allowed on a connection-based 1325 * socket if it supports implied connect. Return ENOTCONN if 1326 * not connected and no address is supplied. 1327 */ 1328 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1329 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1330 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1331 !(resid == 0 && clen != 0)) { 1332 SOCKBUF_UNLOCK(&so->so_snd); 1333 error = ENOTCONN; 1334 goto out; 1335 } 1336 } else if (addr == NULL) { 1337 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1338 error = ENOTCONN; 1339 else 1340 error = EDESTADDRREQ; 1341 SOCKBUF_UNLOCK(&so->so_snd); 1342 goto out; 1343 } 1344 } 1345 1346 /* 1347 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1348 * problem and need fixing. 1349 */ 1350 space = sbspace(&so->so_snd); 1351 if (flags & MSG_OOB) 1352 space += 1024; 1353 space -= clen; 1354 SOCKBUF_UNLOCK(&so->so_snd); 1355 if (resid > space) { 1356 error = EMSGSIZE; 1357 goto out; 1358 } 1359 if (uio == NULL) { 1360 resid = 0; 1361 if (flags & MSG_EOR) 1362 top->m_flags |= M_EOR; 1363 } else { 1364 /* 1365 * Copy the data from userland into a mbuf chain. 1366 * If no data is to be copied in, a single empty mbuf 1367 * is returned. 1368 */ 1369 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 1370 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); 1371 if (top == NULL) { 1372 error = EFAULT; /* only possible error */ 1373 goto out; 1374 } 1375 space -= resid - uio->uio_resid; 1376 resid = uio->uio_resid; 1377 } 1378 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1379 /* 1380 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1381 * than with. 1382 */ 1383 if (dontroute) { 1384 SOCK_LOCK(so); 1385 so->so_options |= SO_DONTROUTE; 1386 SOCK_UNLOCK(so); 1387 } 1388 /* 1389 * XXX all the SBS_CANTSENDMORE checks previously done could be out 1390 * of date. We could have received a reset packet in an interrupt or 1391 * maybe we slept while doing page faults in uiomove() etc. We could 1392 * probably recheck again inside the locking protection here, but 1393 * there are probably other places that this also happens. We must 1394 * rethink this. 1395 */ 1396 VNET_SO_ASSERT(so); 1397 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1398 (flags & MSG_OOB) ? PRUS_OOB : 1399 /* 1400 * If the user set MSG_EOF, the protocol understands this flag and 1401 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 1402 */ 1403 ((flags & MSG_EOF) && 1404 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1405 (resid <= 0)) ? 1406 PRUS_EOF : 1407 /* If there is more to send set PRUS_MORETOCOME */ 1408 (flags & MSG_MORETOCOME) || 1409 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 1410 top, addr, control, td); 1411 if (dontroute) { 1412 SOCK_LOCK(so); 1413 so->so_options &= ~SO_DONTROUTE; 1414 SOCK_UNLOCK(so); 1415 } 1416 clen = 0; 1417 control = NULL; 1418 top = NULL; 1419 out: 1420 if (top != NULL) 1421 m_freem(top); 1422 if (control != NULL) 1423 m_freem(control); 1424 return (error); 1425 } 1426 1427 /* 1428 * Send on a socket. If send must go all at once and message is larger than 1429 * send buffering, then hard error. Lock against other senders. If must go 1430 * all at once and not enough room now, then inform user that this would 1431 * block and do nothing. Otherwise, if nonblocking, send as much as 1432 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1433 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1434 * in mbuf chain must be small enough to send all at once. 1435 * 1436 * Returns nonzero on error, timeout or signal; callers must check for short 1437 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1438 * on return. 1439 */ 1440 int 1441 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 1442 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1443 { 1444 long space; 1445 ssize_t resid; 1446 int clen = 0, error, dontroute; 1447 int atomic = sosendallatonce(so) || top; 1448 int pru_flag; 1449 #ifdef KERN_TLS 1450 struct ktls_session *tls; 1451 int tls_enq_cnt, tls_pruflag; 1452 uint8_t tls_rtype; 1453 1454 tls = NULL; 1455 tls_rtype = TLS_RLTYPE_APP; 1456 #endif 1457 if (uio != NULL) 1458 resid = uio->uio_resid; 1459 else 1460 resid = top->m_pkthdr.len; 1461 /* 1462 * In theory resid should be unsigned. However, space must be 1463 * signed, as it might be less than 0 if we over-committed, and we 1464 * must use a signed comparison of space and resid. On the other 1465 * hand, a negative resid causes us to loop sending 0-length 1466 * segments to the protocol. 1467 * 1468 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1469 * type sockets since that's an error. 
1470 */ 1471 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1472 error = EINVAL; 1473 goto out; 1474 } 1475 1476 dontroute = 1477 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 1478 (so->so_proto->pr_flags & PR_ATOMIC); 1479 if (td != NULL) 1480 td->td_ru.ru_msgsnd++; 1481 if (control != NULL) 1482 clen = control->m_len; 1483 1484 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 1485 if (error) 1486 goto out; 1487 1488 #ifdef KERN_TLS 1489 tls_pruflag = 0; 1490 tls = ktls_hold(so->so_snd.sb_tls_info); 1491 if (tls != NULL) { 1492 if (tls->mode == TCP_TLS_MODE_SW) 1493 tls_pruflag = PRUS_NOTREADY; 1494 1495 if (control != NULL) { 1496 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1497 1498 if (clen >= sizeof(*cm) && 1499 cm->cmsg_type == TLS_SET_RECORD_TYPE) { 1500 tls_rtype = *((uint8_t *)CMSG_DATA(cm)); 1501 clen = 0; 1502 m_freem(control); 1503 control = NULL; 1504 atomic = 1; 1505 } 1506 } 1507 } 1508 #endif 1509 1510 restart: 1511 do { 1512 SOCKBUF_LOCK(&so->so_snd); 1513 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1514 SOCKBUF_UNLOCK(&so->so_snd); 1515 error = EPIPE; 1516 goto release; 1517 } 1518 if (so->so_error) { 1519 error = so->so_error; 1520 so->so_error = 0; 1521 SOCKBUF_UNLOCK(&so->so_snd); 1522 goto release; 1523 } 1524 if ((so->so_state & SS_ISCONNECTED) == 0) { 1525 /* 1526 * `sendto' and `sendmsg' is allowed on a connection- 1527 * based socket if it supports implied connect. 1528 * Return ENOTCONN if not connected and no address is 1529 * supplied. 1530 */ 1531 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1532 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1533 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1534 !(resid == 0 && clen != 0)) { 1535 SOCKBUF_UNLOCK(&so->so_snd); 1536 error = ENOTCONN; 1537 goto release; 1538 } 1539 } else if (addr == NULL) { 1540 SOCKBUF_UNLOCK(&so->so_snd); 1541 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1542 error = ENOTCONN; 1543 else 1544 error = EDESTADDRREQ; 1545 goto release; 1546 } 1547 } 1548 space = sbspace(&so->so_snd); 1549 if (flags & MSG_OOB) 1550 space += 1024; 1551 if ((atomic && resid > so->so_snd.sb_hiwat) || 1552 clen > so->so_snd.sb_hiwat) { 1553 SOCKBUF_UNLOCK(&so->so_snd); 1554 error = EMSGSIZE; 1555 goto release; 1556 } 1557 if (space < resid + clen && 1558 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 1559 if ((so->so_state & SS_NBIO) || 1560 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 1561 SOCKBUF_UNLOCK(&so->so_snd); 1562 error = EWOULDBLOCK; 1563 goto release; 1564 } 1565 error = sbwait(&so->so_snd); 1566 SOCKBUF_UNLOCK(&so->so_snd); 1567 if (error) 1568 goto release; 1569 goto restart; 1570 } 1571 SOCKBUF_UNLOCK(&so->so_snd); 1572 space -= clen; 1573 do { 1574 if (uio == NULL) { 1575 resid = 0; 1576 if (flags & MSG_EOR) 1577 top->m_flags |= M_EOR; 1578 } else { 1579 /* 1580 * Copy the data from userland into a mbuf 1581 * chain. If resid is 0, which can happen 1582 * only if we have control to send, then 1583 * a single empty mbuf is returned. This 1584 * is a workaround to prevent protocol send 1585 * methods to panic. 1586 */ 1587 #ifdef KERN_TLS 1588 if (tls != NULL) { 1589 top = m_uiotombuf(uio, M_WAITOK, space, 1590 tls->params.max_frame_len, 1591 M_NOMAP | 1592 ((flags & MSG_EOR) ? 
M_EOR : 0)); 1593 if (top != NULL) { 1594 error = ktls_frame(top, tls, 1595 &tls_enq_cnt, tls_rtype); 1596 if (error) { 1597 m_freem(top); 1598 goto release; 1599 } 1600 } 1601 tls_rtype = TLS_RLTYPE_APP; 1602 } else 1603 #endif 1604 top = m_uiotombuf(uio, M_WAITOK, space, 1605 (atomic ? max_hdr : 0), 1606 (atomic ? M_PKTHDR : 0) | 1607 ((flags & MSG_EOR) ? M_EOR : 0)); 1608 if (top == NULL) { 1609 error = EFAULT; /* only possible error */ 1610 goto release; 1611 } 1612 space -= resid - uio->uio_resid; 1613 resid = uio->uio_resid; 1614 } 1615 if (dontroute) { 1616 SOCK_LOCK(so); 1617 so->so_options |= SO_DONTROUTE; 1618 SOCK_UNLOCK(so); 1619 } 1620 /* 1621 * XXX all the SBS_CANTSENDMORE checks previously 1622 * done could be out of date. We could have received 1623 * a reset packet in an interrupt or maybe we slept 1624 * while doing page faults in uiomove() etc. We 1625 * could probably recheck again inside the locking 1626 * protection here, but there are probably other 1627 * places that this also happens. We must rethink 1628 * this. 1629 */ 1630 VNET_SO_ASSERT(so); 1631 1632 pru_flag = (flags & MSG_OOB) ? PRUS_OOB : 1633 /* 1634 * If the user set MSG_EOF, the protocol understands 1635 * this flag and nothing left to send then use 1636 * PRU_SEND_EOF instead of PRU_SEND. 1637 */ 1638 ((flags & MSG_EOF) && 1639 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1640 (resid <= 0)) ? 1641 PRUS_EOF : 1642 /* If there is more to send set PRUS_MORETOCOME. */ 1643 (flags & MSG_MORETOCOME) || 1644 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; 1645 1646 #ifdef KERN_TLS 1647 pru_flag |= tls_pruflag; 1648 #endif 1649 1650 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1651 pru_flag, top, addr, control, td); 1652 1653 if (dontroute) { 1654 SOCK_LOCK(so); 1655 so->so_options &= ~SO_DONTROUTE; 1656 SOCK_UNLOCK(so); 1657 } 1658 1659 #ifdef KERN_TLS 1660 if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { 1661 /* 1662 * Note that error is intentionally 1663 * ignored. 1664 * 1665 * Like sendfile(), we rely on the 1666 * completion routine (pru_ready()) 1667 * to free the mbufs in the event that 1668 * pru_send() encountered an error and 1669 * did not append them to the sockbuf. 1670 */ 1671 soref(so); 1672 ktls_enqueue(top, so, tls_enq_cnt); 1673 } 1674 #endif 1675 clen = 0; 1676 control = NULL; 1677 top = NULL; 1678 if (error) 1679 goto release; 1680 } while (resid && space > 0); 1681 } while (resid); 1682 1683 release: 1684 sbunlock(&so->so_snd); 1685 out: 1686 #ifdef KERN_TLS 1687 if (tls != NULL) 1688 ktls_free(tls); 1689 #endif 1690 if (top != NULL) 1691 m_freem(top); 1692 if (control != NULL) 1693 m_freem(control); 1694 return (error); 1695 } 1696 1697 int 1698 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1699 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1700 { 1701 int error; 1702 1703 CURVNET_SET(so->so_vnet); 1704 if (!SOLISTENING(so)) 1705 error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, 1706 top, control, flags, td); 1707 else { 1708 m_freem(top); 1709 m_freem(control); 1710 error = ENOTCONN; 1711 } 1712 CURVNET_RESTORE(); 1713 return (error); 1714 } 1715 1716 /* 1717 * The part of soreceive() that implements reading non-inline out-of-band 1718 * data from a socket. For more complete comments, see soreceive(), from 1719 * which this code originated. 1720 * 1721 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1722 * unable to return an mbuf chain to the caller. 
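 *
 * For reference, soreceive_generic() hands off to this function as soon as
 * it sees the flag, before taking the socket buffer lock:
 *
 *	if (flags & MSG_OOB)
 *		return (soreceive_rcvoob(so, uio, flags));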
1723 */ 1724 static int 1725 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1726 { 1727 struct protosw *pr = so->so_proto; 1728 struct mbuf *m; 1729 int error; 1730 1731 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1732 VNET_SO_ASSERT(so); 1733 1734 m = m_get(M_WAITOK, MT_DATA); 1735 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 1736 if (error) 1737 goto bad; 1738 do { 1739 error = uiomove(mtod(m, void *), 1740 (int) min(uio->uio_resid, m->m_len), uio); 1741 m = m_free(m); 1742 } while (uio->uio_resid && error == 0 && m); 1743 bad: 1744 if (m != NULL) 1745 m_freem(m); 1746 return (error); 1747 } 1748 1749 /* 1750 * Following replacement or removal of the first mbuf on the first mbuf chain 1751 * of a socket buffer, push necessary state changes back into the socket 1752 * buffer so that other consumers see the values consistently. 'nextrecord' 1753 * is the callers locally stored value of the original value of 1754 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1755 * NOTE: 'nextrecord' may be NULL. 1756 */ 1757 static __inline void 1758 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 1759 { 1760 1761 SOCKBUF_LOCK_ASSERT(sb); 1762 /* 1763 * First, update for the new value of nextrecord. If necessary, make 1764 * it the first record. 1765 */ 1766 if (sb->sb_mb != NULL) 1767 sb->sb_mb->m_nextpkt = nextrecord; 1768 else 1769 sb->sb_mb = nextrecord; 1770 1771 /* 1772 * Now update any dependent socket buffer fields to reflect the new 1773 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 1774 * addition of a second clause that takes care of the case where 1775 * sb_mb has been updated, but remains the last record. 1776 */ 1777 if (sb->sb_mb == NULL) { 1778 sb->sb_mbtail = NULL; 1779 sb->sb_lastrecord = NULL; 1780 } else if (sb->sb_mb->m_nextpkt == NULL) 1781 sb->sb_lastrecord = sb->sb_mb; 1782 } 1783 1784 /* 1785 * Implement receive operations on a socket. We depend on the way that 1786 * records are added to the sockbuf by sbappend. In particular, each record 1787 * (mbufs linked through m_next) must begin with an address if the protocol 1788 * so specifies, followed by an optional mbuf or mbufs containing ancillary 1789 * data, and then zero or more mbufs of data. In order to allow parallelism 1790 * between network receive and copying to user space, as well as avoid 1791 * sleeping with a mutex held, we release the socket buffer mutex during the 1792 * user space copy. Although the sockbuf is locked, new data may still be 1793 * appended, and thus we must maintain consistency of the sockbuf during that 1794 * time. 1795 * 1796 * The caller may receive the data as a single mbuf chain by supplying an 1797 * mbuf **mp0 for use in returning the chain. The uio is then used only for 1798 * the count in uio_resid. 
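 *
 * For orientation, a single record appended by the protocol looks roughly
 * like this (fields elided), with m_next linking mbufs within the record
 * and m_nextpkt linking records:
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL] -> [data] -> [data] -> NULL
 *	             |
 *	             m_nextpkt -> first mbuf of the next record (or NULL)
 *
 * soreceive_generic() consumes one record at a time in exactly that order:
 * the address mbuf (if the protocol sets PR_ADDR), then any control mbufs,
 * then the data mbufs.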
1799 */ 1800 int 1801 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 1802 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1803 { 1804 struct mbuf *m, **mp; 1805 int flags, error, offset; 1806 ssize_t len; 1807 struct protosw *pr = so->so_proto; 1808 struct mbuf *nextrecord; 1809 int moff, type = 0; 1810 ssize_t orig_resid = uio->uio_resid; 1811 1812 mp = mp0; 1813 if (psa != NULL) 1814 *psa = NULL; 1815 if (controlp != NULL) 1816 *controlp = NULL; 1817 if (flagsp != NULL) 1818 flags = *flagsp &~ MSG_EOR; 1819 else 1820 flags = 0; 1821 if (flags & MSG_OOB) 1822 return (soreceive_rcvoob(so, uio, flags)); 1823 if (mp != NULL) 1824 *mp = NULL; 1825 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1826 && uio->uio_resid) { 1827 VNET_SO_ASSERT(so); 1828 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 1829 } 1830 1831 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 1832 if (error) 1833 return (error); 1834 1835 restart: 1836 SOCKBUF_LOCK(&so->so_rcv); 1837 m = so->so_rcv.sb_mb; 1838 /* 1839 * If we have less data than requested, block awaiting more (subject 1840 * to any timeout) if: 1841 * 1. the current count is less than the low water mark, or 1842 * 2. MSG_DONTWAIT is not set 1843 */ 1844 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1845 sbavail(&so->so_rcv) < uio->uio_resid) && 1846 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 1847 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1848 KASSERT(m != NULL || !sbavail(&so->so_rcv), 1849 ("receive: m == %p sbavail == %u", 1850 m, sbavail(&so->so_rcv))); 1851 if (so->so_error) { 1852 if (m != NULL) 1853 goto dontblock; 1854 error = so->so_error; 1855 if ((flags & MSG_PEEK) == 0) 1856 so->so_error = 0; 1857 SOCKBUF_UNLOCK(&so->so_rcv); 1858 goto release; 1859 } 1860 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1861 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1862 if (m == NULL) { 1863 SOCKBUF_UNLOCK(&so->so_rcv); 1864 goto release; 1865 } else 1866 goto dontblock; 1867 } 1868 for (; m != NULL; m = m->m_next) 1869 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1870 m = so->so_rcv.sb_mb; 1871 goto dontblock; 1872 } 1873 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1874 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1875 SOCKBUF_UNLOCK(&so->so_rcv); 1876 error = ENOTCONN; 1877 goto release; 1878 } 1879 if (uio->uio_resid == 0) { 1880 SOCKBUF_UNLOCK(&so->so_rcv); 1881 goto release; 1882 } 1883 if ((so->so_state & SS_NBIO) || 1884 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1885 SOCKBUF_UNLOCK(&so->so_rcv); 1886 error = EWOULDBLOCK; 1887 goto release; 1888 } 1889 SBLASTRECORDCHK(&so->so_rcv); 1890 SBLASTMBUFCHK(&so->so_rcv); 1891 error = sbwait(&so->so_rcv); 1892 SOCKBUF_UNLOCK(&so->so_rcv); 1893 if (error) 1894 goto release; 1895 goto restart; 1896 } 1897 dontblock: 1898 /* 1899 * From this point onward, we maintain 'nextrecord' as a cache of the 1900 * pointer to the next record in the socket buffer. We must keep the 1901 * various socket buffer pointers and local stack versions of the 1902 * pointers in sync, pushing out modifications before dropping the 1903 * socket buffer mutex, and re-reading them when picking it up. 1904 * 1905 * Otherwise, we will race with the network stack appending new data 1906 * or records onto the socket buffer by using inconsistent/stale 1907 * versions of the field, possibly resulting in socket buffer 1908 * corruption. 
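 *
 * Concretely, when the leading mbuf of a record is consumed below (the
 * MT_SONAME case is the first example), the buffer is brought back into a
 * consistent state with sockbuf_pushsync() while the lock is still held:
 *
 *     sbfree(&so->so_rcv, m);
 *     so->so_rcv.sb_mb = m_free(m);
 *     m = so->so_rcv.sb_mb;
 *     sockbuf_pushsync(&so->so_rcv, nextrecord);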
1909 * 1910 * By holding the high-level sblock(), we prevent simultaneous 1911 * readers from pulling off the front of the socket buffer. 1912 */ 1913 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1914 if (uio->uio_td) 1915 uio->uio_td->td_ru.ru_msgrcv++; 1916 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1917 SBLASTRECORDCHK(&so->so_rcv); 1918 SBLASTMBUFCHK(&so->so_rcv); 1919 nextrecord = m->m_nextpkt; 1920 if (pr->pr_flags & PR_ADDR) { 1921 KASSERT(m->m_type == MT_SONAME, 1922 ("m->m_type == %d", m->m_type)); 1923 orig_resid = 0; 1924 if (psa != NULL) 1925 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1926 M_NOWAIT); 1927 if (flags & MSG_PEEK) { 1928 m = m->m_next; 1929 } else { 1930 sbfree(&so->so_rcv, m); 1931 so->so_rcv.sb_mb = m_free(m); 1932 m = so->so_rcv.sb_mb; 1933 sockbuf_pushsync(&so->so_rcv, nextrecord); 1934 } 1935 } 1936 1937 /* 1938 * Process one or more MT_CONTROL mbufs present before any data mbufs 1939 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1940 * just copy the data; if !MSG_PEEK, we call into the protocol to 1941 * perform externalization (or freeing if controlp == NULL). 1942 */ 1943 if (m != NULL && m->m_type == MT_CONTROL) { 1944 struct mbuf *cm = NULL, *cmn; 1945 struct mbuf **cme = &cm; 1946 1947 do { 1948 if (flags & MSG_PEEK) { 1949 if (controlp != NULL) { 1950 *controlp = m_copym(m, 0, m->m_len, 1951 M_NOWAIT); 1952 controlp = &(*controlp)->m_next; 1953 } 1954 m = m->m_next; 1955 } else { 1956 sbfree(&so->so_rcv, m); 1957 so->so_rcv.sb_mb = m->m_next; 1958 m->m_next = NULL; 1959 *cme = m; 1960 cme = &(*cme)->m_next; 1961 m = so->so_rcv.sb_mb; 1962 } 1963 } while (m != NULL && m->m_type == MT_CONTROL); 1964 if ((flags & MSG_PEEK) == 0) 1965 sockbuf_pushsync(&so->so_rcv, nextrecord); 1966 while (cm != NULL) { 1967 cmn = cm->m_next; 1968 cm->m_next = NULL; 1969 if (pr->pr_domain->dom_externalize != NULL) { 1970 SOCKBUF_UNLOCK(&so->so_rcv); 1971 VNET_SO_ASSERT(so); 1972 error = (*pr->pr_domain->dom_externalize) 1973 (cm, controlp, flags); 1974 SOCKBUF_LOCK(&so->so_rcv); 1975 } else if (controlp != NULL) 1976 *controlp = cm; 1977 else 1978 m_freem(cm); 1979 if (controlp != NULL) { 1980 orig_resid = 0; 1981 while (*controlp != NULL) 1982 controlp = &(*controlp)->m_next; 1983 } 1984 cm = cmn; 1985 } 1986 if (m != NULL) 1987 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1988 else 1989 nextrecord = so->so_rcv.sb_mb; 1990 orig_resid = 0; 1991 } 1992 if (m != NULL) { 1993 if ((flags & MSG_PEEK) == 0) { 1994 KASSERT(m->m_nextpkt == nextrecord, 1995 ("soreceive: post-control, nextrecord !sync")); 1996 if (nextrecord == NULL) { 1997 KASSERT(so->so_rcv.sb_mb == m, 1998 ("soreceive: post-control, sb_mb!=m")); 1999 KASSERT(so->so_rcv.sb_lastrecord == m, 2000 ("soreceive: post-control, lastrecord!=m")); 2001 } 2002 } 2003 type = m->m_type; 2004 if (type == MT_OOBDATA) 2005 flags |= MSG_OOB; 2006 } else { 2007 if ((flags & MSG_PEEK) == 0) { 2008 KASSERT(so->so_rcv.sb_mb == nextrecord, 2009 ("soreceive: sb_mb != nextrecord")); 2010 if (so->so_rcv.sb_mb == NULL) { 2011 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2012 ("soreceive: sb_lastercord != NULL")); 2013 } 2014 } 2015 } 2016 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2017 SBLASTRECORDCHK(&so->so_rcv); 2018 SBLASTMBUFCHK(&so->so_rcv); 2019 2020 /* 2021 * Now continue to read any data mbufs off of the head of the socket 2022 * buffer until the read request is satisfied. 
Note that 'type' is 2023 * used to store the type of any mbuf reads that have happened so far 2024 * such that soreceive() can stop reading if the type changes, which 2025 * causes soreceive() to return only one of regular data and inline 2026 * out-of-band data in a single socket receive operation. 2027 */ 2028 moff = 0; 2029 offset = 0; 2030 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 2031 && error == 0) { 2032 /* 2033 * If the type of mbuf has changed since the last mbuf 2034 * examined ('type'), end the receive operation. 2035 */ 2036 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2037 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 2038 if (type != m->m_type) 2039 break; 2040 } else if (type == MT_OOBDATA) 2041 break; 2042 else 2043 KASSERT(m->m_type == MT_DATA, 2044 ("m->m_type == %d", m->m_type)); 2045 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 2046 len = uio->uio_resid; 2047 if (so->so_oobmark && len > so->so_oobmark - offset) 2048 len = so->so_oobmark - offset; 2049 if (len > m->m_len - moff) 2050 len = m->m_len - moff; 2051 /* 2052 * If mp is set, just pass back the mbufs. Otherwise copy 2053 * them out via the uio, then free. Sockbuf must be 2054 * consistent here (points to current mbuf, it points to next 2055 * record) when we drop priority; we must note any additions 2056 * to the sockbuf when we block interrupts again. 2057 */ 2058 if (mp == NULL) { 2059 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2060 SBLASTRECORDCHK(&so->so_rcv); 2061 SBLASTMBUFCHK(&so->so_rcv); 2062 SOCKBUF_UNLOCK(&so->so_rcv); 2063 if ((m->m_flags & M_NOMAP) != 0) 2064 error = m_unmappedtouio(m, moff, uio, (int)len); 2065 else 2066 error = uiomove(mtod(m, char *) + moff, 2067 (int)len, uio); 2068 SOCKBUF_LOCK(&so->so_rcv); 2069 if (error) { 2070 /* 2071 * The MT_SONAME mbuf has already been removed 2072 * from the record, so it is necessary to 2073 * remove the data mbufs, if any, to preserve 2074 * the invariant in the case of PR_ADDR that 2075 * requires MT_SONAME mbufs at the head of 2076 * each record. 2077 */ 2078 if (pr->pr_flags & PR_ATOMIC && 2079 ((flags & MSG_PEEK) == 0)) 2080 (void)sbdroprecord_locked(&so->so_rcv); 2081 SOCKBUF_UNLOCK(&so->so_rcv); 2082 goto release; 2083 } 2084 } else 2085 uio->uio_resid -= len; 2086 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2087 if (len == m->m_len - moff) { 2088 if (m->m_flags & M_EOR) 2089 flags |= MSG_EOR; 2090 if (flags & MSG_PEEK) { 2091 m = m->m_next; 2092 moff = 0; 2093 } else { 2094 nextrecord = m->m_nextpkt; 2095 sbfree(&so->so_rcv, m); 2096 if (mp != NULL) { 2097 m->m_nextpkt = NULL; 2098 *mp = m; 2099 mp = &m->m_next; 2100 so->so_rcv.sb_mb = m = m->m_next; 2101 *mp = NULL; 2102 } else { 2103 so->so_rcv.sb_mb = m_free(m); 2104 m = so->so_rcv.sb_mb; 2105 } 2106 sockbuf_pushsync(&so->so_rcv, nextrecord); 2107 SBLASTRECORDCHK(&so->so_rcv); 2108 SBLASTMBUFCHK(&so->so_rcv); 2109 } 2110 } else { 2111 if (flags & MSG_PEEK) 2112 moff += len; 2113 else { 2114 if (mp != NULL) { 2115 if (flags & MSG_DONTWAIT) { 2116 *mp = m_copym(m, 0, len, 2117 M_NOWAIT); 2118 if (*mp == NULL) { 2119 /* 2120 * m_copym() couldn't 2121 * allocate an mbuf. 2122 * Adjust uio_resid back 2123 * (it was adjusted 2124 * down by len bytes, 2125 * which we didn't end 2126 * up "copying" over). 
2127 */ 2128 uio->uio_resid += len; 2129 break; 2130 } 2131 } else { 2132 SOCKBUF_UNLOCK(&so->so_rcv); 2133 *mp = m_copym(m, 0, len, 2134 M_WAITOK); 2135 SOCKBUF_LOCK(&so->so_rcv); 2136 } 2137 } 2138 sbcut_locked(&so->so_rcv, len); 2139 } 2140 } 2141 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2142 if (so->so_oobmark) { 2143 if ((flags & MSG_PEEK) == 0) { 2144 so->so_oobmark -= len; 2145 if (so->so_oobmark == 0) { 2146 so->so_rcv.sb_state |= SBS_RCVATMARK; 2147 break; 2148 } 2149 } else { 2150 offset += len; 2151 if (offset == so->so_oobmark) 2152 break; 2153 } 2154 } 2155 if (flags & MSG_EOR) 2156 break; 2157 /* 2158 * If the MSG_WAITALL flag is set (for non-atomic socket), we 2159 * must not quit until "uio->uio_resid == 0" or an error 2160 * termination. If a signal/timeout occurs, return with a 2161 * short count but without error. Keep sockbuf locked 2162 * against other readers. 2163 */ 2164 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 2165 !sosendallatonce(so) && nextrecord == NULL) { 2166 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2167 if (so->so_error || 2168 so->so_rcv.sb_state & SBS_CANTRCVMORE) 2169 break; 2170 /* 2171 * Notify the protocol that some data has been 2172 * drained before blocking. 2173 */ 2174 if (pr->pr_flags & PR_WANTRCVD) { 2175 SOCKBUF_UNLOCK(&so->so_rcv); 2176 VNET_SO_ASSERT(so); 2177 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 2178 SOCKBUF_LOCK(&so->so_rcv); 2179 } 2180 SBLASTRECORDCHK(&so->so_rcv); 2181 SBLASTMBUFCHK(&so->so_rcv); 2182 /* 2183 * We could receive some data while was notifying 2184 * the protocol. Skip blocking in this case. 2185 */ 2186 if (so->so_rcv.sb_mb == NULL) { 2187 error = sbwait(&so->so_rcv); 2188 if (error) { 2189 SOCKBUF_UNLOCK(&so->so_rcv); 2190 goto release; 2191 } 2192 } 2193 m = so->so_rcv.sb_mb; 2194 if (m != NULL) 2195 nextrecord = m->m_nextpkt; 2196 } 2197 } 2198 2199 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2200 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 2201 flags |= MSG_TRUNC; 2202 if ((flags & MSG_PEEK) == 0) 2203 (void) sbdroprecord_locked(&so->so_rcv); 2204 } 2205 if ((flags & MSG_PEEK) == 0) { 2206 if (m == NULL) { 2207 /* 2208 * First part is an inline SB_EMPTY_FIXUP(). Second 2209 * part makes sure sb_lastrecord is up-to-date if 2210 * there is still data in the socket buffer. 2211 */ 2212 so->so_rcv.sb_mb = nextrecord; 2213 if (so->so_rcv.sb_mb == NULL) { 2214 so->so_rcv.sb_mbtail = NULL; 2215 so->so_rcv.sb_lastrecord = NULL; 2216 } else if (nextrecord->m_nextpkt == NULL) 2217 so->so_rcv.sb_lastrecord = nextrecord; 2218 } 2219 SBLASTRECORDCHK(&so->so_rcv); 2220 SBLASTMBUFCHK(&so->so_rcv); 2221 /* 2222 * If soreceive() is being done from the socket callback, 2223 * then don't need to generate ACK to peer to update window, 2224 * since ACK will be generated on return to TCP. 2225 */ 2226 if (!(flags & MSG_SOCALLBCK) && 2227 (pr->pr_flags & PR_WANTRCVD)) { 2228 SOCKBUF_UNLOCK(&so->so_rcv); 2229 VNET_SO_ASSERT(so); 2230 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 2231 SOCKBUF_LOCK(&so->so_rcv); 2232 } 2233 } 2234 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2235 if (orig_resid == uio->uio_resid && orig_resid && 2236 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 2237 SOCKBUF_UNLOCK(&so->so_rcv); 2238 goto restart; 2239 } 2240 SOCKBUF_UNLOCK(&so->so_rcv); 2241 2242 if (flagsp != NULL) 2243 *flagsp |= flags; 2244 release: 2245 sbunlock(&so->so_rcv); 2246 return (error); 2247 } 2248 2249 /* 2250 * Optimized version of soreceive() for stream (TCP) sockets. 
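 *
 * Consumers do not normally call this routine directly; soreceive() below
 * always dispatches through the protocol switch, roughly:
 *
 *     error = so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
 *         mp0, controlp, flagsp);
 *
 * so a stream protocol opts in by pointing its pru_soreceive hook at this
 * function (a sketch of the wiring only, not a statement about how any
 * particular protocol is configured).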
2251 */ 2252 int 2253 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 2254 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2255 { 2256 int len = 0, error = 0, flags, oresid; 2257 struct sockbuf *sb; 2258 struct mbuf *m, *n = NULL; 2259 2260 /* We only do stream sockets. */ 2261 if (so->so_type != SOCK_STREAM) 2262 return (EINVAL); 2263 if (psa != NULL) 2264 *psa = NULL; 2265 if (flagsp != NULL) 2266 flags = *flagsp &~ MSG_EOR; 2267 else 2268 flags = 0; 2269 if (controlp != NULL) 2270 *controlp = NULL; 2271 if (flags & MSG_OOB) 2272 return (soreceive_rcvoob(so, uio, flags)); 2273 if (mp0 != NULL) 2274 *mp0 = NULL; 2275 2276 sb = &so->so_rcv; 2277 2278 /* Prevent other readers from entering the socket. */ 2279 error = sblock(sb, SBLOCKWAIT(flags)); 2280 if (error) 2281 return (error); 2282 SOCKBUF_LOCK(sb); 2283 2284 /* Easy one, no space to copyout anything. */ 2285 if (uio->uio_resid == 0) { 2286 error = EINVAL; 2287 goto out; 2288 } 2289 oresid = uio->uio_resid; 2290 2291 /* We will never ever get anything unless we are or were connected. */ 2292 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 2293 error = ENOTCONN; 2294 goto out; 2295 } 2296 2297 restart: 2298 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2299 2300 /* Abort if socket has reported problems. */ 2301 if (so->so_error) { 2302 if (sbavail(sb) > 0) 2303 goto deliver; 2304 if (oresid > uio->uio_resid) 2305 goto out; 2306 error = so->so_error; 2307 if (!(flags & MSG_PEEK)) 2308 so->so_error = 0; 2309 goto out; 2310 } 2311 2312 /* Door is closed. Deliver what is left, if any. */ 2313 if (sb->sb_state & SBS_CANTRCVMORE) { 2314 if (sbavail(sb) > 0) 2315 goto deliver; 2316 else 2317 goto out; 2318 } 2319 2320 /* Socket buffer is empty and we shall not block. */ 2321 if (sbavail(sb) == 0 && 2322 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 2323 error = EAGAIN; 2324 goto out; 2325 } 2326 2327 /* Socket buffer got some data that we shall deliver now. */ 2328 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 2329 ((so->so_state & SS_NBIO) || 2330 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 2331 sbavail(sb) >= sb->sb_lowat || 2332 sbavail(sb) >= uio->uio_resid || 2333 sbavail(sb) >= sb->sb_hiwat) ) { 2334 goto deliver; 2335 } 2336 2337 /* On MSG_WAITALL we must wait until all data or error arrives. */ 2338 if ((flags & MSG_WAITALL) && 2339 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 2340 goto deliver; 2341 2342 /* 2343 * Wait and block until (more) data comes in. 2344 * NB: Drops the sockbuf lock during wait. 2345 */ 2346 error = sbwait(sb); 2347 if (error) 2348 goto out; 2349 goto restart; 2350 2351 deliver: 2352 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2353 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 2354 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 2355 2356 /* Statistics. */ 2357 if (uio->uio_td) 2358 uio->uio_td->td_ru.ru_msgrcv++; 2359 2360 /* Fill uio until full or current end of socket buffer is reached. */ 2361 len = min(uio->uio_resid, sbavail(sb)); 2362 if (mp0 != NULL) { 2363 /* Dequeue as many mbufs as possible. 
*/ 2364 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 2365 if (*mp0 == NULL) 2366 *mp0 = sb->sb_mb; 2367 else 2368 m_cat(*mp0, sb->sb_mb); 2369 for (m = sb->sb_mb; 2370 m != NULL && m->m_len <= len; 2371 m = m->m_next) { 2372 KASSERT(!(m->m_flags & M_NOTAVAIL), 2373 ("%s: m %p not available", __func__, m)); 2374 len -= m->m_len; 2375 uio->uio_resid -= m->m_len; 2376 sbfree(sb, m); 2377 n = m; 2378 } 2379 n->m_next = NULL; 2380 sb->sb_mb = m; 2381 sb->sb_lastrecord = sb->sb_mb; 2382 if (sb->sb_mb == NULL) 2383 SB_EMPTY_FIXUP(sb); 2384 } 2385 /* Copy the remainder. */ 2386 if (len > 0) { 2387 KASSERT(sb->sb_mb != NULL, 2388 ("%s: len > 0 && sb->sb_mb empty", __func__)); 2389 2390 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 2391 if (m == NULL) 2392 len = 0; /* Don't flush data from sockbuf. */ 2393 else 2394 uio->uio_resid -= len; 2395 if (*mp0 != NULL) 2396 m_cat(*mp0, m); 2397 else 2398 *mp0 = m; 2399 if (*mp0 == NULL) { 2400 error = ENOBUFS; 2401 goto out; 2402 } 2403 } 2404 } else { 2405 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2406 SOCKBUF_UNLOCK(sb); 2407 error = m_mbuftouio(uio, sb->sb_mb, len); 2408 SOCKBUF_LOCK(sb); 2409 if (error) 2410 goto out; 2411 } 2412 SBLASTRECORDCHK(sb); 2413 SBLASTMBUFCHK(sb); 2414 2415 /* 2416 * Remove the delivered data from the socket buffer unless we 2417 * were only peeking. 2418 */ 2419 if (!(flags & MSG_PEEK)) { 2420 if (len > 0) 2421 sbdrop_locked(sb, len); 2422 2423 /* Notify protocol that we drained some data. */ 2424 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2425 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2426 !(flags & MSG_SOCALLBCK))) { 2427 SOCKBUF_UNLOCK(sb); 2428 VNET_SO_ASSERT(so); 2429 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 2430 SOCKBUF_LOCK(sb); 2431 } 2432 } 2433 2434 /* 2435 * For MSG_WAITALL we may have to loop again and wait for 2436 * more data to come in. 2437 */ 2438 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2439 goto restart; 2440 out: 2441 SOCKBUF_LOCK_ASSERT(sb); 2442 SBLASTRECORDCHK(sb); 2443 SBLASTMBUFCHK(sb); 2444 SOCKBUF_UNLOCK(sb); 2445 sbunlock(sb); 2446 return (error); 2447 } 2448 2449 /* 2450 * Optimized version of soreceive() for simple datagram cases from userspace. 2451 * Unlike in the stream case, we're able to drop a datagram if copyout() 2452 * fails, and because we handle datagrams atomically, we don't need to use a 2453 * sleep lock to prevent I/O interlacing. 2454 */ 2455 int 2456 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 2457 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2458 { 2459 struct mbuf *m, *m2; 2460 int flags, error; 2461 ssize_t len; 2462 struct protosw *pr = so->so_proto; 2463 struct mbuf *nextrecord; 2464 2465 if (psa != NULL) 2466 *psa = NULL; 2467 if (controlp != NULL) 2468 *controlp = NULL; 2469 if (flagsp != NULL) 2470 flags = *flagsp &~ MSG_EOR; 2471 else 2472 flags = 0; 2473 2474 /* 2475 * For any complicated cases, fall back to the full 2476 * soreceive_generic(). 2477 */ 2478 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) 2479 return (soreceive_generic(so, psa, uio, mp0, controlp, 2480 flagsp)); 2481 2482 /* 2483 * Enforce restrictions on use. 
2484 */ 2485 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 2486 ("soreceive_dgram: wantrcvd")); 2487 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 2488 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 2489 ("soreceive_dgram: SBS_RCVATMARK")); 2490 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 2491 ("soreceive_dgram: P_CONNREQUIRED")); 2492 2493 /* 2494 * Loop blocking while waiting for a datagram. 2495 */ 2496 SOCKBUF_LOCK(&so->so_rcv); 2497 while ((m = so->so_rcv.sb_mb) == NULL) { 2498 KASSERT(sbavail(&so->so_rcv) == 0, 2499 ("soreceive_dgram: sb_mb NULL but sbavail %u", 2500 sbavail(&so->so_rcv))); 2501 if (so->so_error) { 2502 error = so->so_error; 2503 so->so_error = 0; 2504 SOCKBUF_UNLOCK(&so->so_rcv); 2505 return (error); 2506 } 2507 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 2508 uio->uio_resid == 0) { 2509 SOCKBUF_UNLOCK(&so->so_rcv); 2510 return (0); 2511 } 2512 if ((so->so_state & SS_NBIO) || 2513 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2514 SOCKBUF_UNLOCK(&so->so_rcv); 2515 return (EWOULDBLOCK); 2516 } 2517 SBLASTRECORDCHK(&so->so_rcv); 2518 SBLASTMBUFCHK(&so->so_rcv); 2519 error = sbwait(&so->so_rcv); 2520 if (error) { 2521 SOCKBUF_UNLOCK(&so->so_rcv); 2522 return (error); 2523 } 2524 } 2525 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2526 2527 if (uio->uio_td) 2528 uio->uio_td->td_ru.ru_msgrcv++; 2529 SBLASTRECORDCHK(&so->so_rcv); 2530 SBLASTMBUFCHK(&so->so_rcv); 2531 nextrecord = m->m_nextpkt; 2532 if (nextrecord == NULL) { 2533 KASSERT(so->so_rcv.sb_lastrecord == m, 2534 ("soreceive_dgram: lastrecord != m")); 2535 } 2536 2537 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 2538 ("soreceive_dgram: m_nextpkt != nextrecord")); 2539 2540 /* 2541 * Pull 'm' and its chain off the front of the packet queue. 2542 */ 2543 so->so_rcv.sb_mb = NULL; 2544 sockbuf_pushsync(&so->so_rcv, nextrecord); 2545 2546 /* 2547 * Walk 'm's chain and free that many bytes from the socket buffer. 2548 */ 2549 for (m2 = m; m2 != NULL; m2 = m2->m_next) 2550 sbfree(&so->so_rcv, m2); 2551 2552 /* 2553 * Do a few last checks before we let go of the lock. 2554 */ 2555 SBLASTRECORDCHK(&so->so_rcv); 2556 SBLASTMBUFCHK(&so->so_rcv); 2557 SOCKBUF_UNLOCK(&so->so_rcv); 2558 2559 if (pr->pr_flags & PR_ADDR) { 2560 KASSERT(m->m_type == MT_SONAME, 2561 ("m->m_type == %d", m->m_type)); 2562 if (psa != NULL) 2563 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2564 M_NOWAIT); 2565 m = m_free(m); 2566 } 2567 if (m == NULL) { 2568 /* XXXRW: Can this happen? */ 2569 return (0); 2570 } 2571 2572 /* 2573 * Packet to copyout() is now in 'm' and it is disconnected from the 2574 * queue. 2575 * 2576 * Process one or more MT_CONTROL mbufs present before any data mbufs 2577 * in the first mbuf chain on the socket buffer. We call into the 2578 * protocol to perform externalization (or freeing if controlp == 2579 * NULL). In some cases there can be only MT_CONTROL mbufs without 2580 * MT_DATA mbufs. 
2581 */ 2582 if (m->m_type == MT_CONTROL) { 2583 struct mbuf *cm = NULL, *cmn; 2584 struct mbuf **cme = &cm; 2585 2586 do { 2587 m2 = m->m_next; 2588 m->m_next = NULL; 2589 *cme = m; 2590 cme = &(*cme)->m_next; 2591 m = m2; 2592 } while (m != NULL && m->m_type == MT_CONTROL); 2593 while (cm != NULL) { 2594 cmn = cm->m_next; 2595 cm->m_next = NULL; 2596 if (pr->pr_domain->dom_externalize != NULL) { 2597 error = (*pr->pr_domain->dom_externalize) 2598 (cm, controlp, flags); 2599 } else if (controlp != NULL) 2600 *controlp = cm; 2601 else 2602 m_freem(cm); 2603 if (controlp != NULL) { 2604 while (*controlp != NULL) 2605 controlp = &(*controlp)->m_next; 2606 } 2607 cm = cmn; 2608 } 2609 } 2610 KASSERT(m == NULL || m->m_type == MT_DATA, 2611 ("soreceive_dgram: !data")); 2612 while (m != NULL && uio->uio_resid > 0) { 2613 len = uio->uio_resid; 2614 if (len > m->m_len) 2615 len = m->m_len; 2616 error = uiomove(mtod(m, char *), (int)len, uio); 2617 if (error) { 2618 m_freem(m); 2619 return (error); 2620 } 2621 if (len == m->m_len) 2622 m = m_free(m); 2623 else { 2624 m->m_data += len; 2625 m->m_len -= len; 2626 } 2627 } 2628 if (m != NULL) { 2629 flags |= MSG_TRUNC; 2630 m_freem(m); 2631 } 2632 if (flagsp != NULL) 2633 *flagsp |= flags; 2634 return (0); 2635 } 2636 2637 int 2638 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2639 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2640 { 2641 int error; 2642 2643 CURVNET_SET(so->so_vnet); 2644 if (!SOLISTENING(so)) 2645 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, 2646 mp0, controlp, flagsp)); 2647 else 2648 error = ENOTCONN; 2649 CURVNET_RESTORE(); 2650 return (error); 2651 } 2652 2653 int 2654 soshutdown(struct socket *so, int how) 2655 { 2656 struct protosw *pr = so->so_proto; 2657 int error, soerror_enotconn; 2658 2659 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 2660 return (EINVAL); 2661 2662 soerror_enotconn = 0; 2663 if ((so->so_state & 2664 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { 2665 /* 2666 * POSIX mandates us to return ENOTCONN when shutdown(2) is 2667 * invoked on a datagram sockets, however historically we would 2668 * actually tear socket down. This is known to be leveraged by 2669 * some applications to unblock process waiting in recvXXX(2) 2670 * by other process that it shares that socket with. Try to meet 2671 * both backward-compatibility and POSIX requirements by forcing 2672 * ENOTCONN but still asking protocol to perform pru_shutdown(). 2673 */ 2674 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) 2675 return (ENOTCONN); 2676 soerror_enotconn = 1; 2677 } 2678 2679 if (SOLISTENING(so)) { 2680 if (how != SHUT_WR) { 2681 SOLISTEN_LOCK(so); 2682 so->so_error = ECONNABORTED; 2683 solisten_wakeup(so); /* unlocks so */ 2684 } 2685 goto done; 2686 } 2687 2688 CURVNET_SET(so->so_vnet); 2689 if (pr->pr_usrreqs->pru_flush != NULL) 2690 (*pr->pr_usrreqs->pru_flush)(so, how); 2691 if (how != SHUT_WR) 2692 sorflush(so); 2693 if (how != SHUT_RD) { 2694 error = (*pr->pr_usrreqs->pru_shutdown)(so); 2695 wakeup(&so->so_timeo); 2696 CURVNET_RESTORE(); 2697 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); 2698 } 2699 wakeup(&so->so_timeo); 2700 CURVNET_RESTORE(); 2701 2702 done: 2703 return (soerror_enotconn ? 
ENOTCONN : 0); 2704 } 2705 2706 void 2707 sorflush(struct socket *so) 2708 { 2709 struct sockbuf *sb = &so->so_rcv; 2710 struct protosw *pr = so->so_proto; 2711 struct socket aso; 2712 2713 VNET_SO_ASSERT(so); 2714 2715 /* 2716 * In order to avoid calling dom_dispose with the socket buffer mutex 2717 * held, and in order to generally avoid holding the lock for a long 2718 * time, we make a copy of the socket buffer and clear the original 2719 * (except locks, state). The new socket buffer copy won't have 2720 * initialized locks so we can only call routines that won't use or 2721 * assert those locks. 2722 * 2723 * Dislodge threads currently blocked in receive and wait to acquire 2724 * a lock against other simultaneous readers before clearing the 2725 * socket buffer. Don't let our acquire be interrupted by a signal 2726 * despite any existing socket disposition on interruptable waiting. 2727 */ 2728 socantrcvmore(so); 2729 (void) sblock(sb, SBL_WAIT | SBL_NOINTR); 2730 2731 /* 2732 * Invalidate/clear most of the sockbuf structure, but leave selinfo 2733 * and mutex data unchanged. 2734 */ 2735 SOCKBUF_LOCK(sb); 2736 bzero(&aso, sizeof(aso)); 2737 aso.so_pcb = so->so_pcb; 2738 bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, 2739 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2740 bzero(&sb->sb_startzero, 2741 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2742 SOCKBUF_UNLOCK(sb); 2743 sbunlock(sb); 2744 2745 /* 2746 * Dispose of special rights and flush the copied socket. Don't call 2747 * any unsafe routines (that rely on locks being initialized) on aso. 2748 */ 2749 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 2750 (*pr->pr_domain->dom_dispose)(&aso); 2751 sbrelease_internal(&aso.so_rcv, so); 2752 } 2753 2754 /* 2755 * Wrapper for Socket established helper hook. 2756 * Parameters: socket, context of the hook point, hook id. 2757 */ 2758 static int inline 2759 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 2760 { 2761 struct socket_hhook_data hhook_data = { 2762 .so = so, 2763 .hctx = hctx, 2764 .m = NULL, 2765 .status = 0 2766 }; 2767 2768 CURVNET_SET(so->so_vnet); 2769 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 2770 CURVNET_RESTORE(); 2771 2772 /* Ugly but needed, since hhooks return void for now */ 2773 return (hhook_data.status); 2774 } 2775 2776 /* 2777 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 2778 * additional variant to handle the case where the option value needs to be 2779 * some kind of integer, but not a specific size. In addition to their use 2780 * here, these functions are also called by the protocol-level pr_ctloutput() 2781 * routines. 2782 */ 2783 int 2784 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 2785 { 2786 size_t valsize; 2787 2788 /* 2789 * If the user gives us more than we wanted, we ignore it, but if we 2790 * don't get the minimum length the caller wants, we return EINVAL. 2791 * On success, sopt->sopt_valsize is set to however much we actually 2792 * retrieved. 2793 */ 2794 if ((valsize = sopt->sopt_valsize) < minlen) 2795 return EINVAL; 2796 if (valsize > len) 2797 sopt->sopt_valsize = valsize = len; 2798 2799 if (sopt->sopt_td != NULL) 2800 return (copyin(sopt->sopt_val, buf, valsize)); 2801 2802 bcopy(sopt->sopt_val, buf, valsize); 2803 return (0); 2804 } 2805 2806 /* 2807 * Kernel version of setsockopt(2). 
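 *
 * Illustrative use from kernel code (sketch only; assumes 'so' was obtained
 * from an earlier socreate()):
 *
 *     int one = 1;
 *
 *     error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 * Because no thread is associated with the sockopt in this case, the value
 * is copied with bcopy() rather than copyin() by sooptcopyin().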
2808 * 2809 * XXX: optlen is size_t, not socklen_t 2810 */ 2811 int 2812 so_setsockopt(struct socket *so, int level, int optname, void *optval, 2813 size_t optlen) 2814 { 2815 struct sockopt sopt; 2816 2817 sopt.sopt_level = level; 2818 sopt.sopt_name = optname; 2819 sopt.sopt_dir = SOPT_SET; 2820 sopt.sopt_val = optval; 2821 sopt.sopt_valsize = optlen; 2822 sopt.sopt_td = NULL; 2823 return (sosetopt(so, &sopt)); 2824 } 2825 2826 int 2827 sosetopt(struct socket *so, struct sockopt *sopt) 2828 { 2829 int error, optval; 2830 struct linger l; 2831 struct timeval tv; 2832 sbintime_t val; 2833 uint32_t val32; 2834 #ifdef MAC 2835 struct mac extmac; 2836 #endif 2837 2838 CURVNET_SET(so->so_vnet); 2839 error = 0; 2840 if (sopt->sopt_level != SOL_SOCKET) { 2841 if (so->so_proto->pr_ctloutput != NULL) 2842 error = (*so->so_proto->pr_ctloutput)(so, sopt); 2843 else 2844 error = ENOPROTOOPT; 2845 } else { 2846 switch (sopt->sopt_name) { 2847 case SO_ACCEPTFILTER: 2848 error = accept_filt_setopt(so, sopt); 2849 if (error) 2850 goto bad; 2851 break; 2852 2853 case SO_LINGER: 2854 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 2855 if (error) 2856 goto bad; 2857 if (l.l_linger < 0 || 2858 l.l_linger > USHRT_MAX || 2859 l.l_linger > (INT_MAX / hz)) { 2860 error = EDOM; 2861 goto bad; 2862 } 2863 SOCK_LOCK(so); 2864 so->so_linger = l.l_linger; 2865 if (l.l_onoff) 2866 so->so_options |= SO_LINGER; 2867 else 2868 so->so_options &= ~SO_LINGER; 2869 SOCK_UNLOCK(so); 2870 break; 2871 2872 case SO_DEBUG: 2873 case SO_KEEPALIVE: 2874 case SO_DONTROUTE: 2875 case SO_USELOOPBACK: 2876 case SO_BROADCAST: 2877 case SO_REUSEADDR: 2878 case SO_REUSEPORT: 2879 case SO_REUSEPORT_LB: 2880 case SO_OOBINLINE: 2881 case SO_TIMESTAMP: 2882 case SO_BINTIME: 2883 case SO_NOSIGPIPE: 2884 case SO_NO_DDP: 2885 case SO_NO_OFFLOAD: 2886 error = sooptcopyin(sopt, &optval, sizeof optval, 2887 sizeof optval); 2888 if (error) 2889 goto bad; 2890 SOCK_LOCK(so); 2891 if (optval) 2892 so->so_options |= sopt->sopt_name; 2893 else 2894 so->so_options &= ~sopt->sopt_name; 2895 SOCK_UNLOCK(so); 2896 break; 2897 2898 case SO_SETFIB: 2899 error = sooptcopyin(sopt, &optval, sizeof optval, 2900 sizeof optval); 2901 if (error) 2902 goto bad; 2903 2904 if (optval < 0 || optval >= rt_numfibs) { 2905 error = EINVAL; 2906 goto bad; 2907 } 2908 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 2909 (so->so_proto->pr_domain->dom_family == PF_INET6) || 2910 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 2911 so->so_fibnum = optval; 2912 else 2913 so->so_fibnum = 0; 2914 break; 2915 2916 case SO_USER_COOKIE: 2917 error = sooptcopyin(sopt, &val32, sizeof val32, 2918 sizeof val32); 2919 if (error) 2920 goto bad; 2921 so->so_user_cookie = val32; 2922 break; 2923 2924 case SO_SNDBUF: 2925 case SO_RCVBUF: 2926 case SO_SNDLOWAT: 2927 case SO_RCVLOWAT: 2928 error = sooptcopyin(sopt, &optval, sizeof optval, 2929 sizeof optval); 2930 if (error) 2931 goto bad; 2932 2933 /* 2934 * Values < 1 make no sense for any of these options, 2935 * so disallow them. 
2936 */ 2937 if (optval < 1) { 2938 error = EINVAL; 2939 goto bad; 2940 } 2941 2942 error = sbsetopt(so, sopt->sopt_name, optval); 2943 break; 2944 2945 case SO_SNDTIMEO: 2946 case SO_RCVTIMEO: 2947 #ifdef COMPAT_FREEBSD32 2948 if (SV_CURPROC_FLAG(SV_ILP32)) { 2949 struct timeval32 tv32; 2950 2951 error = sooptcopyin(sopt, &tv32, sizeof tv32, 2952 sizeof tv32); 2953 CP(tv32, tv, tv_sec); 2954 CP(tv32, tv, tv_usec); 2955 } else 2956 #endif 2957 error = sooptcopyin(sopt, &tv, sizeof tv, 2958 sizeof tv); 2959 if (error) 2960 goto bad; 2961 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 2962 tv.tv_usec >= 1000000) { 2963 error = EDOM; 2964 goto bad; 2965 } 2966 if (tv.tv_sec > INT32_MAX) 2967 val = SBT_MAX; 2968 else 2969 val = tvtosbt(tv); 2970 switch (sopt->sopt_name) { 2971 case SO_SNDTIMEO: 2972 so->so_snd.sb_timeo = val; 2973 break; 2974 case SO_RCVTIMEO: 2975 so->so_rcv.sb_timeo = val; 2976 break; 2977 } 2978 break; 2979 2980 case SO_LABEL: 2981 #ifdef MAC 2982 error = sooptcopyin(sopt, &extmac, sizeof extmac, 2983 sizeof extmac); 2984 if (error) 2985 goto bad; 2986 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 2987 so, &extmac); 2988 #else 2989 error = EOPNOTSUPP; 2990 #endif 2991 break; 2992 2993 case SO_TS_CLOCK: 2994 error = sooptcopyin(sopt, &optval, sizeof optval, 2995 sizeof optval); 2996 if (error) 2997 goto bad; 2998 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 2999 error = EINVAL; 3000 goto bad; 3001 } 3002 so->so_ts_clock = optval; 3003 break; 3004 3005 case SO_MAX_PACING_RATE: 3006 error = sooptcopyin(sopt, &val32, sizeof(val32), 3007 sizeof(val32)); 3008 if (error) 3009 goto bad; 3010 so->so_max_pacing_rate = val32; 3011 break; 3012 3013 default: 3014 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3015 error = hhook_run_socket(so, sopt, 3016 HHOOK_SOCKET_OPT); 3017 else 3018 error = ENOPROTOOPT; 3019 break; 3020 } 3021 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 3022 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 3023 } 3024 bad: 3025 CURVNET_RESTORE(); 3026 return (error); 3027 } 3028 3029 /* 3030 * Helper routine for getsockopt. 3031 */ 3032 int 3033 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 3034 { 3035 int error; 3036 size_t valsize; 3037 3038 error = 0; 3039 3040 /* 3041 * Documented get behavior is that we always return a value, possibly 3042 * truncated to fit in the user's buffer. Traditional behavior is 3043 * that we always tell the user precisely how much we copied, rather 3044 * than something useful like the total amount we had available for 3045 * her. Note that this interface is not idempotent; the entire 3046 * answer must be generated ahead of time. 
3047 */ 3048 valsize = min(len, sopt->sopt_valsize); 3049 sopt->sopt_valsize = valsize; 3050 if (sopt->sopt_val != NULL) { 3051 if (sopt->sopt_td != NULL) 3052 error = copyout(buf, sopt->sopt_val, valsize); 3053 else 3054 bcopy(buf, sopt->sopt_val, valsize); 3055 } 3056 return (error); 3057 } 3058 3059 int 3060 sogetopt(struct socket *so, struct sockopt *sopt) 3061 { 3062 int error, optval; 3063 struct linger l; 3064 struct timeval tv; 3065 #ifdef MAC 3066 struct mac extmac; 3067 #endif 3068 3069 CURVNET_SET(so->so_vnet); 3070 error = 0; 3071 if (sopt->sopt_level != SOL_SOCKET) { 3072 if (so->so_proto->pr_ctloutput != NULL) 3073 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3074 else 3075 error = ENOPROTOOPT; 3076 CURVNET_RESTORE(); 3077 return (error); 3078 } else { 3079 switch (sopt->sopt_name) { 3080 case SO_ACCEPTFILTER: 3081 error = accept_filt_getopt(so, sopt); 3082 break; 3083 3084 case SO_LINGER: 3085 SOCK_LOCK(so); 3086 l.l_onoff = so->so_options & SO_LINGER; 3087 l.l_linger = so->so_linger; 3088 SOCK_UNLOCK(so); 3089 error = sooptcopyout(sopt, &l, sizeof l); 3090 break; 3091 3092 case SO_USELOOPBACK: 3093 case SO_DONTROUTE: 3094 case SO_DEBUG: 3095 case SO_KEEPALIVE: 3096 case SO_REUSEADDR: 3097 case SO_REUSEPORT: 3098 case SO_REUSEPORT_LB: 3099 case SO_BROADCAST: 3100 case SO_OOBINLINE: 3101 case SO_ACCEPTCONN: 3102 case SO_TIMESTAMP: 3103 case SO_BINTIME: 3104 case SO_NOSIGPIPE: 3105 optval = so->so_options & sopt->sopt_name; 3106 integer: 3107 error = sooptcopyout(sopt, &optval, sizeof optval); 3108 break; 3109 3110 case SO_DOMAIN: 3111 optval = so->so_proto->pr_domain->dom_family; 3112 goto integer; 3113 3114 case SO_TYPE: 3115 optval = so->so_type; 3116 goto integer; 3117 3118 case SO_PROTOCOL: 3119 optval = so->so_proto->pr_protocol; 3120 goto integer; 3121 3122 case SO_ERROR: 3123 SOCK_LOCK(so); 3124 optval = so->so_error; 3125 so->so_error = 0; 3126 SOCK_UNLOCK(so); 3127 goto integer; 3128 3129 case SO_SNDBUF: 3130 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 3131 so->so_snd.sb_hiwat; 3132 goto integer; 3133 3134 case SO_RCVBUF: 3135 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 3136 so->so_rcv.sb_hiwat; 3137 goto integer; 3138 3139 case SO_SNDLOWAT: 3140 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 3141 so->so_snd.sb_lowat; 3142 goto integer; 3143 3144 case SO_RCVLOWAT: 3145 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 3146 so->so_rcv.sb_lowat; 3147 goto integer; 3148 3149 case SO_SNDTIMEO: 3150 case SO_RCVTIMEO: 3151 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 
3152 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 3153 #ifdef COMPAT_FREEBSD32 3154 if (SV_CURPROC_FLAG(SV_ILP32)) { 3155 struct timeval32 tv32; 3156 3157 CP(tv, tv32, tv_sec); 3158 CP(tv, tv32, tv_usec); 3159 error = sooptcopyout(sopt, &tv32, sizeof tv32); 3160 } else 3161 #endif 3162 error = sooptcopyout(sopt, &tv, sizeof tv); 3163 break; 3164 3165 case SO_LABEL: 3166 #ifdef MAC 3167 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3168 sizeof(extmac)); 3169 if (error) 3170 goto bad; 3171 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 3172 so, &extmac); 3173 if (error) 3174 goto bad; 3175 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3176 #else 3177 error = EOPNOTSUPP; 3178 #endif 3179 break; 3180 3181 case SO_PEERLABEL: 3182 #ifdef MAC 3183 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3184 sizeof(extmac)); 3185 if (error) 3186 goto bad; 3187 error = mac_getsockopt_peerlabel( 3188 sopt->sopt_td->td_ucred, so, &extmac); 3189 if (error) 3190 goto bad; 3191 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3192 #else 3193 error = EOPNOTSUPP; 3194 #endif 3195 break; 3196 3197 case SO_LISTENQLIMIT: 3198 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 3199 goto integer; 3200 3201 case SO_LISTENQLEN: 3202 optval = SOLISTENING(so) ? so->sol_qlen : 0; 3203 goto integer; 3204 3205 case SO_LISTENINCQLEN: 3206 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 3207 goto integer; 3208 3209 case SO_TS_CLOCK: 3210 optval = so->so_ts_clock; 3211 goto integer; 3212 3213 case SO_MAX_PACING_RATE: 3214 optval = so->so_max_pacing_rate; 3215 goto integer; 3216 3217 default: 3218 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3219 error = hhook_run_socket(so, sopt, 3220 HHOOK_SOCKET_OPT); 3221 else 3222 error = ENOPROTOOPT; 3223 break; 3224 } 3225 } 3226 #ifdef MAC 3227 bad: 3228 #endif 3229 CURVNET_RESTORE(); 3230 return (error); 3231 } 3232 3233 int 3234 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 3235 { 3236 struct mbuf *m, *m_prev; 3237 int sopt_size = sopt->sopt_valsize; 3238 3239 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3240 if (m == NULL) 3241 return ENOBUFS; 3242 if (sopt_size > MLEN) { 3243 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 3244 if ((m->m_flags & M_EXT) == 0) { 3245 m_free(m); 3246 return ENOBUFS; 3247 } 3248 m->m_len = min(MCLBYTES, sopt_size); 3249 } else { 3250 m->m_len = min(MLEN, sopt_size); 3251 } 3252 sopt_size -= m->m_len; 3253 *mp = m; 3254 m_prev = m; 3255 3256 while (sopt_size) { 3257 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3258 if (m == NULL) { 3259 m_freem(*mp); 3260 return ENOBUFS; 3261 } 3262 if (sopt_size > MLEN) { 3263 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 3264 M_NOWAIT); 3265 if ((m->m_flags & M_EXT) == 0) { 3266 m_freem(m); 3267 m_freem(*mp); 3268 return ENOBUFS; 3269 } 3270 m->m_len = min(MCLBYTES, sopt_size); 3271 } else { 3272 m->m_len = min(MLEN, sopt_size); 3273 } 3274 sopt_size -= m->m_len; 3275 m_prev->m_next = m; 3276 m_prev = m; 3277 } 3278 return (0); 3279 } 3280 3281 int 3282 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 3283 { 3284 struct mbuf *m0 = m; 3285 3286 if (sopt->sopt_val == NULL) 3287 return (0); 3288 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3289 if (sopt->sopt_td != NULL) { 3290 int error; 3291 3292 error = copyin(sopt->sopt_val, mtod(m, char *), 3293 m->m_len); 3294 if (error != 0) { 3295 m_freem(m0); 3296 return(error); 3297 } 3298 } else 3299 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 3300 sopt->sopt_valsize -= m->m_len; 3301 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3302 m = m->m_next; 3303 } 3304 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 3305 panic("ip6_sooptmcopyin"); 3306 return (0); 3307 } 3308 3309 int 3310 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 3311 { 3312 struct mbuf *m0 = m; 3313 size_t valsize = 0; 3314 3315 if (sopt->sopt_val == NULL) 3316 return (0); 3317 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3318 if (sopt->sopt_td != NULL) { 3319 int error; 3320 3321 error = copyout(mtod(m, char *), sopt->sopt_val, 3322 m->m_len); 3323 if (error != 0) { 3324 m_freem(m0); 3325 return(error); 3326 } 3327 } else 3328 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 3329 sopt->sopt_valsize -= m->m_len; 3330 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3331 valsize += m->m_len; 3332 m = m->m_next; 3333 } 3334 if (m != NULL) { 3335 /* enough soopt buffer should be given from user-land */ 3336 m_freem(m0); 3337 return(EINVAL); 3338 } 3339 sopt->sopt_valsize = valsize; 3340 return (0); 3341 } 3342 3343 /* 3344 * sohasoutofband(): protocol notifies socket layer of the arrival of new 3345 * out-of-band data, which will then notify socket consumers. 3346 */ 3347 void 3348 sohasoutofband(struct socket *so) 3349 { 3350 3351 if (so->so_sigio != NULL) 3352 pgsigio(&so->so_sigio, SIGURG, 0); 3353 selwakeuppri(&so->so_rdsel, PSOCK); 3354 } 3355 3356 int 3357 sopoll(struct socket *so, int events, struct ucred *active_cred, 3358 struct thread *td) 3359 { 3360 3361 /* 3362 * We do not need to set or assert curvnet as long as everyone uses 3363 * sopoll_generic(). 
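 *
 * An illustrative call (sketch): check for readability and writability
 * without blocking, from a thread context 'td':
 *
 *     revents = sopoll(so, POLLIN | POLLOUT, td->td_ucred, td);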
3364 */ 3365 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, 3366 td)); 3367 } 3368 3369 int 3370 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 3371 struct thread *td) 3372 { 3373 int revents; 3374 3375 SOCK_LOCK(so); 3376 if (SOLISTENING(so)) { 3377 if (!(events & (POLLIN | POLLRDNORM))) 3378 revents = 0; 3379 else if (!TAILQ_EMPTY(&so->sol_comp)) 3380 revents = events & (POLLIN | POLLRDNORM); 3381 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 3382 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 3383 else { 3384 selrecord(td, &so->so_rdsel); 3385 revents = 0; 3386 } 3387 } else { 3388 revents = 0; 3389 SOCKBUF_LOCK(&so->so_snd); 3390 SOCKBUF_LOCK(&so->so_rcv); 3391 if (events & (POLLIN | POLLRDNORM)) 3392 if (soreadabledata(so)) 3393 revents |= events & (POLLIN | POLLRDNORM); 3394 if (events & (POLLOUT | POLLWRNORM)) 3395 if (sowriteable(so)) 3396 revents |= events & (POLLOUT | POLLWRNORM); 3397 if (events & (POLLPRI | POLLRDBAND)) 3398 if (so->so_oobmark || 3399 (so->so_rcv.sb_state & SBS_RCVATMARK)) 3400 revents |= events & (POLLPRI | POLLRDBAND); 3401 if ((events & POLLINIGNEOF) == 0) { 3402 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3403 revents |= events & (POLLIN | POLLRDNORM); 3404 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 3405 revents |= POLLHUP; 3406 } 3407 } 3408 if (revents == 0) { 3409 if (events & 3410 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 3411 selrecord(td, &so->so_rdsel); 3412 so->so_rcv.sb_flags |= SB_SEL; 3413 } 3414 if (events & (POLLOUT | POLLWRNORM)) { 3415 selrecord(td, &so->so_wrsel); 3416 so->so_snd.sb_flags |= SB_SEL; 3417 } 3418 } 3419 SOCKBUF_UNLOCK(&so->so_rcv); 3420 SOCKBUF_UNLOCK(&so->so_snd); 3421 } 3422 SOCK_UNLOCK(so); 3423 return (revents); 3424 } 3425 3426 int 3427 soo_kqfilter(struct file *fp, struct knote *kn) 3428 { 3429 struct socket *so = kn->kn_fp->f_data; 3430 struct sockbuf *sb; 3431 struct knlist *knl; 3432 3433 switch (kn->kn_filter) { 3434 case EVFILT_READ: 3435 kn->kn_fop = &soread_filtops; 3436 knl = &so->so_rdsel.si_note; 3437 sb = &so->so_rcv; 3438 break; 3439 case EVFILT_WRITE: 3440 kn->kn_fop = &sowrite_filtops; 3441 knl = &so->so_wrsel.si_note; 3442 sb = &so->so_snd; 3443 break; 3444 case EVFILT_EMPTY: 3445 kn->kn_fop = &soempty_filtops; 3446 knl = &so->so_wrsel.si_note; 3447 sb = &so->so_snd; 3448 break; 3449 default: 3450 return (EINVAL); 3451 } 3452 3453 SOCK_LOCK(so); 3454 if (SOLISTENING(so)) { 3455 knlist_add(knl, kn, 1); 3456 } else { 3457 SOCKBUF_LOCK(sb); 3458 knlist_add(knl, kn, 1); 3459 sb->sb_flags |= SB_KNOTE; 3460 SOCKBUF_UNLOCK(sb); 3461 } 3462 SOCK_UNLOCK(so); 3463 return (0); 3464 } 3465 3466 /* 3467 * Some routines that return EOPNOTSUPP for entry points that are not 3468 * supported by a protocol. Fill in as needed. 
3469 */ 3470 int 3471 pru_accept_notsupp(struct socket *so, struct sockaddr **nam) 3472 { 3473 3474 return EOPNOTSUPP; 3475 } 3476 3477 int 3478 pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) 3479 { 3480 3481 return EOPNOTSUPP; 3482 } 3483 3484 int 3485 pru_attach_notsupp(struct socket *so, int proto, struct thread *td) 3486 { 3487 3488 return EOPNOTSUPP; 3489 } 3490 3491 int 3492 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3493 { 3494 3495 return EOPNOTSUPP; 3496 } 3497 3498 int 3499 pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3500 struct thread *td) 3501 { 3502 3503 return EOPNOTSUPP; 3504 } 3505 3506 int 3507 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3508 { 3509 3510 return EOPNOTSUPP; 3511 } 3512 3513 int 3514 pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3515 struct thread *td) 3516 { 3517 3518 return EOPNOTSUPP; 3519 } 3520 3521 int 3522 pru_connect2_notsupp(struct socket *so1, struct socket *so2) 3523 { 3524 3525 return EOPNOTSUPP; 3526 } 3527 3528 int 3529 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, 3530 struct ifnet *ifp, struct thread *td) 3531 { 3532 3533 return EOPNOTSUPP; 3534 } 3535 3536 int 3537 pru_disconnect_notsupp(struct socket *so) 3538 { 3539 3540 return EOPNOTSUPP; 3541 } 3542 3543 int 3544 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) 3545 { 3546 3547 return EOPNOTSUPP; 3548 } 3549 3550 int 3551 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) 3552 { 3553 3554 return EOPNOTSUPP; 3555 } 3556 3557 int 3558 pru_rcvd_notsupp(struct socket *so, int flags) 3559 { 3560 3561 return EOPNOTSUPP; 3562 } 3563 3564 int 3565 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) 3566 { 3567 3568 return EOPNOTSUPP; 3569 } 3570 3571 int 3572 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, 3573 struct sockaddr *addr, struct mbuf *control, struct thread *td) 3574 { 3575 3576 return EOPNOTSUPP; 3577 } 3578 3579 int 3580 pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) 3581 { 3582 3583 return (EOPNOTSUPP); 3584 } 3585 3586 /* 3587 * This isn't really a ``null'' operation, but it's the default one and 3588 * doesn't do anything destructive. 
3589 */ 3590 int 3591 pru_sense_null(struct socket *so, struct stat *sb) 3592 { 3593 3594 sb->st_blksize = so->so_snd.sb_hiwat; 3595 return 0; 3596 } 3597 3598 int 3599 pru_shutdown_notsupp(struct socket *so) 3600 { 3601 3602 return EOPNOTSUPP; 3603 } 3604 3605 int 3606 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) 3607 { 3608 3609 return EOPNOTSUPP; 3610 } 3611 3612 int 3613 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, 3614 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 3615 { 3616 3617 return EOPNOTSUPP; 3618 } 3619 3620 int 3621 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, 3622 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3623 { 3624 3625 return EOPNOTSUPP; 3626 } 3627 3628 int 3629 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, 3630 struct thread *td) 3631 { 3632 3633 return EOPNOTSUPP; 3634 } 3635 3636 static void 3637 filt_sordetach(struct knote *kn) 3638 { 3639 struct socket *so = kn->kn_fp->f_data; 3640 3641 so_rdknl_lock(so); 3642 knlist_remove(&so->so_rdsel.si_note, kn, 1); 3643 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 3644 so->so_rcv.sb_flags &= ~SB_KNOTE; 3645 so_rdknl_unlock(so); 3646 } 3647 3648 /*ARGSUSED*/ 3649 static int 3650 filt_soread(struct knote *kn, long hint) 3651 { 3652 struct socket *so; 3653 3654 so = kn->kn_fp->f_data; 3655 3656 if (SOLISTENING(so)) { 3657 SOCK_LOCK_ASSERT(so); 3658 kn->kn_data = so->sol_qlen; 3659 if (so->so_error) { 3660 kn->kn_flags |= EV_EOF; 3661 kn->kn_fflags = so->so_error; 3662 return (1); 3663 } 3664 return (!TAILQ_EMPTY(&so->sol_comp)); 3665 } 3666 3667 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3668 3669 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 3670 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3671 kn->kn_flags |= EV_EOF; 3672 kn->kn_fflags = so->so_error; 3673 return (1); 3674 } else if (so->so_error) /* temporary udp error */ 3675 return (1); 3676 3677 if (kn->kn_sfflags & NOTE_LOWAT) { 3678 if (kn->kn_data >= kn->kn_sdata) 3679 return (1); 3680 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 3681 return (1); 3682 3683 /* This hook returning non-zero indicates an event, not error */ 3684 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 3685 } 3686 3687 static void 3688 filt_sowdetach(struct knote *kn) 3689 { 3690 struct socket *so = kn->kn_fp->f_data; 3691 3692 so_wrknl_lock(so); 3693 knlist_remove(&so->so_wrsel.si_note, kn, 1); 3694 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 3695 so->so_snd.sb_flags &= ~SB_KNOTE; 3696 so_wrknl_unlock(so); 3697 } 3698 3699 /*ARGSUSED*/ 3700 static int 3701 filt_sowrite(struct knote *kn, long hint) 3702 { 3703 struct socket *so; 3704 3705 so = kn->kn_fp->f_data; 3706 3707 if (SOLISTENING(so)) 3708 return (0); 3709 3710 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3711 kn->kn_data = sbspace(&so->so_snd); 3712 3713 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 3714 3715 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3716 kn->kn_flags |= EV_EOF; 3717 kn->kn_fflags = so->so_error; 3718 return (1); 3719 } else if (so->so_error) /* temporary udp error */ 3720 return (1); 3721 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3722 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3723 return (0); 3724 else if (kn->kn_sfflags & NOTE_LOWAT) 3725 return (kn->kn_data >= kn->kn_sdata); 3726 else 3727 return (kn->kn_data >= so->so_snd.sb_lowat); 3728 } 3729 3730 static int 3731 filt_soempty(struct knote *kn, long hint) 3732 { 
3733 struct socket *so; 3734 3735 so = kn->kn_fp->f_data; 3736 3737 if (SOLISTENING(so)) 3738 return (1); 3739 3740 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3741 kn->kn_data = sbused(&so->so_snd); 3742 3743 if (kn->kn_data == 0) 3744 return (1); 3745 else 3746 return (0); 3747 } 3748 3749 int 3750 socheckuid(struct socket *so, uid_t uid) 3751 { 3752 3753 if (so == NULL) 3754 return (EPERM); 3755 if (so->so_cred->cr_uid != uid) 3756 return (EPERM); 3757 return (0); 3758 } 3759 3760 /* 3761 * These functions are used by protocols to notify the socket layer (and its 3762 * consumers) of state changes in the sockets driven by protocol-side events. 3763 */ 3764 3765 /* 3766 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3767 * 3768 * Normal sequence from the active (originating) side is that 3769 * soisconnecting() is called during processing of connect() call, resulting 3770 * in an eventual call to soisconnected() if/when the connection is 3771 * established. When the connection is torn down soisdisconnecting() is 3772 * called during processing of disconnect() call, and soisdisconnected() is 3773 * called when the connection to the peer is totally severed. The semantics 3774 * of these routines are such that connectionless protocols can call 3775 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3776 * calls when setting up a ``connection'' takes no time. 3777 * 3778 * From the passive side, a socket is created with two queues of sockets: 3779 * so_incomp for connections in progress and so_comp for connections already 3780 * made and awaiting user acceptance. As a protocol is preparing incoming 3781 * connections, it creates a socket structure queued on so_incomp by calling 3782 * sonewconn(). When the connection is established, soisconnected() is 3783 * called, and transfers the socket structure to so_comp, making it available 3784 * to accept(). 3785 * 3786 * If a socket is closed with sockets on either so_incomp or so_comp, these 3787 * sockets are dropped. 3788 * 3789 * If higher-level protocols are implemented in the kernel, the wakeups done 3790 * here will sometimes cause software-interrupt process scheduling. 3791 */ 3792 void 3793 soisconnecting(struct socket *so) 3794 { 3795 3796 SOCK_LOCK(so); 3797 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3798 so->so_state |= SS_ISCONNECTING; 3799 SOCK_UNLOCK(so); 3800 } 3801 3802 void 3803 soisconnected(struct socket *so) 3804 { 3805 3806 SOCK_LOCK(so); 3807 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3808 so->so_state |= SS_ISCONNECTED; 3809 3810 if (so->so_qstate == SQ_INCOMP) { 3811 struct socket *head = so->so_listen; 3812 int ret; 3813 3814 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 3815 /* 3816 * Promoting a socket from incomplete queue to complete, we 3817 * need to go through reverse order of locking. We first do 3818 * trylock, and if that doesn't succeed, we go the hard way 3819 * leaving a reference and rechecking consistency after proper 3820 * locking. 3821 */ 3822 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 3823 soref(head); 3824 SOCK_UNLOCK(so); 3825 SOLISTEN_LOCK(head); 3826 SOCK_LOCK(so); 3827 if (__predict_false(head != so->so_listen)) { 3828 /* 3829 * The socket went off the listen queue, 3830 * should be lost race to close(2) of sol. 3831 * The socket is about to soabort(). 3832 */ 3833 SOCK_UNLOCK(so); 3834 sorele(head); 3835 return; 3836 } 3837 /* Not the last one, as so holds a ref. 
*/ 3838 refcount_release(&head->so_count); 3839 } 3840 again: 3841 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3842 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 3843 head->sol_incqlen--; 3844 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 3845 head->sol_qlen++; 3846 so->so_qstate = SQ_COMP; 3847 SOCK_UNLOCK(so); 3848 solisten_wakeup(head); /* unlocks */ 3849 } else { 3850 SOCKBUF_LOCK(&so->so_rcv); 3851 soupcall_set(so, SO_RCV, 3852 head->sol_accept_filter->accf_callback, 3853 head->sol_accept_filter_arg); 3854 so->so_options &= ~SO_ACCEPTFILTER; 3855 ret = head->sol_accept_filter->accf_callback(so, 3856 head->sol_accept_filter_arg, M_NOWAIT); 3857 if (ret == SU_ISCONNECTED) { 3858 soupcall_clear(so, SO_RCV); 3859 SOCKBUF_UNLOCK(&so->so_rcv); 3860 goto again; 3861 } 3862 SOCKBUF_UNLOCK(&so->so_rcv); 3863 SOCK_UNLOCK(so); 3864 SOLISTEN_UNLOCK(head); 3865 } 3866 return; 3867 } 3868 SOCK_UNLOCK(so); 3869 wakeup(&so->so_timeo); 3870 sorwakeup(so); 3871 sowwakeup(so); 3872 } 3873 3874 void 3875 soisdisconnecting(struct socket *so) 3876 { 3877 3878 SOCK_LOCK(so); 3879 so->so_state &= ~SS_ISCONNECTING; 3880 so->so_state |= SS_ISDISCONNECTING; 3881 3882 if (!SOLISTENING(so)) { 3883 SOCKBUF_LOCK(&so->so_rcv); 3884 socantrcvmore_locked(so); 3885 SOCKBUF_LOCK(&so->so_snd); 3886 socantsendmore_locked(so); 3887 } 3888 SOCK_UNLOCK(so); 3889 wakeup(&so->so_timeo); 3890 } 3891 3892 void 3893 soisdisconnected(struct socket *so) 3894 { 3895 3896 SOCK_LOCK(so); 3897 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3898 so->so_state |= SS_ISDISCONNECTED; 3899 3900 if (!SOLISTENING(so)) { 3901 SOCK_UNLOCK(so); 3902 SOCKBUF_LOCK(&so->so_rcv); 3903 socantrcvmore_locked(so); 3904 SOCKBUF_LOCK(&so->so_snd); 3905 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 3906 socantsendmore_locked(so); 3907 } else 3908 SOCK_UNLOCK(so); 3909 wakeup(&so->so_timeo); 3910 } 3911 3912 /* 3913 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3914 */ 3915 struct sockaddr * 3916 sodupsockaddr(const struct sockaddr *sa, int mflags) 3917 { 3918 struct sockaddr *sa2; 3919 3920 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3921 if (sa2) 3922 bcopy(sa, sa2, sa->sa_len); 3923 return sa2; 3924 } 3925 3926 /* 3927 * Register per-socket destructor. 3928 */ 3929 void 3930 sodtor_set(struct socket *so, so_dtor_t *func) 3931 { 3932 3933 SOCK_LOCK_ASSERT(so); 3934 so->so_dtor = func; 3935 } 3936 3937 /* 3938 * Register per-socket buffer upcalls. 
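 *
 * The corresponding socket buffer lock must be held across these calls (see
 * the SOCKBUF_LOCK_ASSERT() below).  A sketch of registering a receive
 * upcall, with a hypothetical callback and argument:
 *
 *     SOCKBUF_LOCK(&so->so_rcv);
 *     soupcall_set(so, SO_RCV, my_rcv_upcall, arg);
 *     SOCKBUF_UNLOCK(&so->so_rcv);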
static void
so_rdknl_lock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_LOCK(so);
	else
		SOCKBUF_LOCK(&so->so_rcv);
}

static void
so_rdknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_UNLOCK(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

static void
so_rdknl_assert_locked(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_LOCK_ASSERT(so);
	else
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
}

static void
so_rdknl_assert_unlocked(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_UNLOCK_ASSERT(so);
	else
		SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
}

static void
so_wrknl_lock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_LOCK(so);
	else
		SOCKBUF_LOCK(&so->so_snd);
}

static void
so_wrknl_unlock(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_UNLOCK(so);
	else
		SOCKBUF_UNLOCK(&so->so_snd);
}

static void
so_wrknl_assert_locked(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_LOCK_ASSERT(so);
	else
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
}

static void
so_wrknl_assert_unlocked(void *arg)
{
	struct socket *so = arg;

	if (SOLISTENING(so))
		SOCK_UNLOCK_ASSERT(so);
	else
		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
}

/*
 * Create an external-format (``xsocket'') structure using the information in
 * the kernel-format socket structure pointed to by so.  This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{

	bzero(xso, sizeof(*xso));
	xso->xso_len = sizeof(*xso);
	xso->xso_so = (uintptr_t)so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (uintptr_t)so->so_pcb;
	xso->xso_protocol = so->so_proto->pr_protocol;
	xso->xso_family = so->so_proto->pr_domain->dom_family;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_uid = so->so_cred->cr_uid;
	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
	if (SOLISTENING(so)) {
		xso->so_qlen = so->sol_qlen;
		xso->so_incqlen = so->sol_incqlen;
		xso->so_qlimit = so->sol_qlimit;
		xso->so_oobmark = 0;
	} else {
		xso->so_state |= so->so_qstate;
		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
		xso->so_oobmark = so->so_oobmark;
		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	}
}
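/*
 * Illustrative sketch, not part of this file: sotoxsocket() is typically
 * used when exporting socket state to userland, for example from a sysctl
 * handler walking a protocol's pcb list.  Only the conversion and copy-out
 * step is shown; example_export_socket is a hypothetical helper, and any
 * pcb/socket locking is assumed to be handled by the caller.
 */
#if 0
static int
example_export_socket(struct socket *so, struct sysctl_req *req)
{
	struct xsocket xso;

	sotoxsocket(so, &xso);
	return (SYSCTL_OUT(req, &xso, sizeof(xso)));
}
#endif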
struct sockbuf *
so_sockbuf_rcv(struct socket *so)
{

	return (&so->so_rcv);
}

struct sockbuf *
so_sockbuf_snd(struct socket *so)
{

	return (&so->so_snd);
}

int
so_state_get(const struct socket *so)
{

	return (so->so_state);
}

void
so_state_set(struct socket *so, int val)
{

	so->so_state = val;
}

int
so_options_get(const struct socket *so)
{

	return (so->so_options);
}

void
so_options_set(struct socket *so, int val)
{

	so->so_options = val;
}

int
so_error_get(const struct socket *so)
{

	return (so->so_error);
}

void
so_error_set(struct socket *so, int val)
{

	so->so_error = val;
}

int
so_linger_get(const struct socket *so)
{

	return (so->so_linger);
}

void
so_linger_set(struct socket *so, int val)
{

	KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
	    ("%s: val %d out of range", __func__, val));

	so->so_linger = val;
}

struct protosw *
so_protosw_get(const struct socket *so)
{

	return (so->so_proto);
}

void
so_protosw_set(struct socket *so, struct protosw *val)
{

	so->so_proto = val;
}

void
so_sorwakeup(struct socket *so)
{

	sorwakeup(so);
}

void
so_sowwakeup(struct socket *so)
{

	sowwakeup(so);
}

void
so_sorwakeup_locked(struct socket *so)
{

	sorwakeup_locked(so);
}

void
so_sowwakeup_locked(struct socket *so)
{

	sowwakeup_locked(so);
}

void
so_lock(struct socket *so)
{

	SOCK_LOCK(so);
}

void
so_unlock(struct socket *so)
{

	SOCK_UNLOCK(so);
}
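/*
 * Illustrative sketch, not part of this file: the thin accessor and wakeup
 * wrappers above let code that treats struct socket as mostly opaque (for
 * example an offload driver) reach the socket buffers and issue wakeups
 * without open-coding the structure layout.  example_deliver_data and its
 * parameters are invented names for illustration only.
 */
#if 0
static void
example_deliver_data(struct socket *so, struct mbuf *m)
{
	struct sockbuf *sb;

	sb = so_sockbuf_rcv(so);
	SOCKBUF_LOCK(sb);
	sbappendstream_locked(sb, m, 0);
	so_sorwakeup_locked(so);	/* Consumes the sockbuf lock. */
}
#endif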