/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used to attempt to
 * free a socket when a reference is removed.  This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref(), and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs the general rule is that callers do not set
 * curvnet.  Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_sctp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/hhook.h>
#include <sys/kernel.h>
#include <sys/khelp.h>
#include <sys/ktls.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <sys/syslog.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#include <net/vnet.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);
static void	so_rdknl_lock(void *);
static void	so_rdknl_unlock(void *);
static void	so_rdknl_assert_locked(void *);
static void	so_rdknl_assert_unlocked(void *);
static void	so_wrknl_lock(void *);
static void	so_wrknl_unlock(void *);
static void	so_wrknl_assert_locked(void *);
static void	so_wrknl_assert_unlocked(void *);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_soempty(struct knote *kn, long hint);
static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
fo_kqfilter_t	soo_kqfilter;

static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};
static struct filterops soempty_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_soempty,
};

so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
#define	V_socket_hhh	VNET(socket_hhh)

/*
 * Limit on the number of connections in the listen queue waiting
 * for accept(2).
 * NB: The original sysctl somaxconn is still available but hidden
 * to prevent confusion about the actual purpose of this number.
 */
static u_int somaxconn = SOMAXCONN;

static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	/*
	 * The purpose of the UINT_MAX / 3 limit is so that the formula
	 *	3 * so_qlimit / 2
	 * below will not overflow.
	 */

	if (val < 1 || val > UINT_MAX / 3)
		return (EINVAL);

	somaxconn = val;
	return (0);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int),
    sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size");
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0,
    sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size (compat)");
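/*
 * Worked example: with the default SOMAXCONN of 128, a listen socket using
 * that backlog is treated by sonewconn() below as overflowing once sol_qlen
 * exceeds 3 * 128 / 2 = 192.  A larger queue limit can be configured at run
 * time with sysctl(8), e.g.:
 *
 *	sysctl kern.ipc.soacceptqueue=1024
 */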
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPC");

/*
 * Initialize the socket subsystem and set up the socket
 * memory allocator.
 */
static uma_zone_t socket_zone;
int	maxsockets;

static void
socket_zone_change(void *tag)
{

	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
}

static void
socket_hhook_register(int subtype)
{

	if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
	    &V_socket_hhh[subtype],
	    HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
		printf("%s: WARNING: unable to register hook\n", __func__);
}

static void
socket_hhook_deregister(int subtype)
{

	if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
		printf("%s: WARNING: unable to deregister hook\n", __func__);
}

static void
socket_init(void *tag)
{

	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);

static void
socket_vnet_init(const void *unused __unused)
{
	int i;

	/* We expect a contiguous range */
	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
		socket_hhook_register(i);
}
VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
    socket_vnet_init, NULL);

static void
socket_vnet_uninit(const void *unused __unused)
{
	int i;

	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
		socket_hhook_deregister(i);
}
VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
    socket_vnet_uninit, NULL);

/*
 * Initialise maxsockets.  This SYSINIT must be run after
 * tunable_mbinit().
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, maxfiles);
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets &&
		    newmaxsockets <= maxfiles) {
			maxsockets = newmaxsockets;
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0,
    sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
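/*
 * kern.ipc.maxsockets may also be set as a loader tunable (it is fetched by
 * init_maxsockets() above).  At run time the handler only allows the limit
 * to be raised, and rejects values above maxfiles, e.g.:
 *
 *	sysctl kern.ipc.maxsockets=262144
 */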
/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}

	/*
	 * The socket locking protocol allows locking two sockets at a time,
	 * however, the first one must be a listening socket.  WITNESS lacks
	 * a feature to change class of an existing lock, so we use DUPOK.
	 */
	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	so->so_rcv.sb_sel = &so->so_rdsel;
	so->so_snd.sb_sel = &so->so_wrsel;
	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_snd.sb_aiojobq);
	TAILQ_INIT(&so->so_rcv.sb_aiojobq);
	TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
	TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
#ifdef VIMAGE
	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet = vnet;
#endif
	/* We shouldn't need the so_global_mtx */
	if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
		/* Do we need more comprehensive error returns? */
		uma_zfree(socket_zone, so);
		return (NULL);
	}
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	vnet->vnet_sockcnt++;
#endif
	mtx_unlock(&so_global_mtx);

	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
#ifdef MAC
	mac_socket_destroy(so);
#endif
	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);

	crfree(so->so_cred);
	khelp_destroy_osd(&so->osd);
	if (SOLISTENING(so)) {
		if (so->sol_accept_filter != NULL)
			accept_filt_setopt(so, NULL);
	} else {
		if (so->so_rcv.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
		if (so->so_snd.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
		sx_destroy(&so->so_snd.sb_sx);
		sx_destroy(&so->so_rcv.sb_sx);
		SOCKBUF_LOCK_DESTROY(&so->so_snd);
		SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	}
	mtx_destroy(&so->so_lock);
	uma_zfree(socket_zone, so);
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL) {
		/* No support for domain. */
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		/* No support for socket type. */
		if (proto == 0 && type != 0)
			return (EPROTOTYPE);
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		sodealloc(so);
		return (error);
	}
	soref(so);
	*aso = so;
	return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif
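/*
 * A minimal sketch of the consumer-facing life cycle described at the top of
 * the file: socreate() hands back a referenced socket and soclose() releases
 * it.  The helper and its SOCKET_LIFECYCLE_EXAMPLE guard are illustrative
 * only and are not part of the kernel's interfaces.
 */
#ifdef SOCKET_LIFECYCLE_EXAMPLE
static int
socket_lifecycle_example(struct thread *td)
{
	struct socket *so;
	int error;

	/* Create a TCP socket owned by the calling thread's credentials. */
	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
	    td->td_ucred, td);
	if (error != 0)
		return (error);

	/* ... use the socket via sobind(), soconnect(), sosend(), ... */

	/* Drop the reference obtained from socreate() and tear down. */
	return (soclose(so));
}
#endif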
/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	static struct timeval lastover;
	static struct timeval overinterval = { 60, 0 };
	static int overcount;

	struct socket *so;
	u_int over;

	SOLISTEN_LOCK(head);
	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
	SOLISTEN_UNLOCK(head);
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over) {
#else
	if (over) {
#endif
		overcount++;

		if (ratecheck(&lastover, &overinterval)) {
			log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
			    "%i already in queue awaiting acceptance "
			    "(%d occurrences)\n",
			    __func__, head->so_pcb, head->sol_qlen, overcount);

			overcount = 0;
		}

		return (NULL);
	}
	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
	    __func__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_listen = head;
	so->so_type = head->so_type;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
	so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;

	SOLISTEN_LOCK(head);
	if (head->sol_accept_filter != NULL)
		connstatus = 0;
	so->so_state |= connstatus;
	so->so_options = head->so_options & ~SO_ACCEPTCONN;
	soref(head); /* A socket on (in)complete queue refs head. */
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
		so->so_qstate = SQ_COMP;
		head->sol_qlen++;
		solisten_wakeup(head);	/* unlocks */
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->sol_incqlen > head->sol_qlimit) {
			struct socket *sp;

			sp = TAILQ_FIRST(&head->sol_incomp);
			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
			head->sol_incqlen--;
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			sorele(head);	/* does SOLISTEN_UNLOCK, head stays */
			soabort(sp);
			SOLISTEN_LOCK(head);
		}
		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
		so->so_qstate = SQ_INCOMP;
		head->sol_incqlen++;
		SOLISTEN_UNLOCK(head);
	}
	return (so);
}

#ifdef SCTP
/*
 * Socket part of sctp_peeloff().  Detach a new socket from an
 * association.  The new socket is returned with a reference.
 */
struct socket *
sopeeloff(struct socket *head)
{
	struct socket *so;

	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_type = head->so_type;
	so->so_options = head->so_options;
	so->so_linger = head->so_linger;
	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;

	soref(so);

	return (so);
}
#endif	/* SCTP */

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

int
sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.
 * Callbacks are used so that the protocol can acquire both protocol and
 * socket layer locks in whatever order is required by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
	CURVNET_RESTORE();
	return (error);
}

int
solisten_proto_check(struct socket *so)
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(struct socket *so, int backlog)
{
	int sbrcv_lowat, sbsnd_lowat;
	u_int sbrcv_hiwat, sbsnd_hiwat;
	short sbrcv_flags, sbsnd_flags;
	sbintime_t sbrcv_timeo, sbsnd_timeo;

	SOCK_LOCK_ASSERT(so);

	if (SOLISTENING(so))
		goto listening;

	/*
	 * Change this socket to listening state.
	 */
	sbrcv_lowat = so->so_rcv.sb_lowat;
	sbsnd_lowat = so->so_snd.sb_lowat;
	sbrcv_hiwat = so->so_rcv.sb_hiwat;
	sbsnd_hiwat = so->so_snd.sb_hiwat;
	sbrcv_flags = so->so_rcv.sb_flags;
	sbsnd_flags = so->so_snd.sb_flags;
	sbrcv_timeo = so->so_rcv.sb_timeo;
	sbsnd_timeo = so->so_snd.sb_timeo;

	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);

#ifdef INVARIANTS
	bzero(&so->so_rcv,
	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
#endif

	so->sol_sbrcv_lowat = sbrcv_lowat;
	so->sol_sbsnd_lowat = sbsnd_lowat;
	so->sol_sbrcv_hiwat = sbrcv_hiwat;
	so->sol_sbsnd_hiwat = sbsnd_hiwat;
	so->sol_sbrcv_flags = sbrcv_flags;
	so->sol_sbsnd_flags = sbsnd_flags;
	so->sol_sbrcv_timeo = sbrcv_timeo;
	so->sol_sbsnd_timeo = sbsnd_timeo;

	so->sol_qlen = so->sol_incqlen = 0;
	TAILQ_INIT(&so->sol_incomp);
	TAILQ_INIT(&so->sol_comp);

	so->sol_accept_filter = NULL;
	so->sol_accept_filter_arg = NULL;
	so->sol_accept_filter_str = NULL;

	so->sol_upcall = NULL;
	so->sol_upcallarg = NULL;

	so->so_options |= SO_ACCEPTCONN;

listening:
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->sol_qlimit = backlog;
}

/*
 * Wakeup listeners/subsystems once we have a complete connection.
 * Enters with lock, returns unlocked.
 */
void
solisten_wakeup(struct socket *sol)
{

	if (sol->sol_upcall != NULL)
		(void)sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
	else {
		selwakeuppri(&sol->so_rdsel, PSOCK);
		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
	}
	SOLISTEN_UNLOCK(sol);
	wakeup_one(&sol->sol_comp);
	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
		pgsigio(&sol->so_sigio, SIGIO, 0);
}
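/*
 * A sketch of how an in-kernel consumer drains one completed connection,
 * following the solisten_dequeue() contract documented below: the listening
 * socket is locked on entry and comes back unlocked.  The helper and its
 * SOCKET_ACCEPT_EXAMPLE guard are illustrative only.
 */
#ifdef SOCKET_ACCEPT_EXAMPLE
static int
socket_accept_example(struct socket *head, struct socket **sop)
{
	int error;

	SOLISTEN_LOCK(head);
	/* Sleeps unless SS_NBIO is set; on success *sop is the new socket. */
	error = solisten_dequeue(head, sop, 0);
	return (error);
}
#endif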
/*
 * Return a single connection off a listening socket queue.  The main
 * consumer of the function is kern_accept4().  Some modules that do their
 * own accept management also use the function.
 *
 * Listening socket must be locked on entry and is returned unlocked on
 * return.
 * The flags argument is a set of accept4(2) flags and ACCEPT4_INHERIT.
 */
int
solisten_dequeue(struct socket *head, struct socket **ret, int flags)
{
	struct socket *so;
	int error;

	SOLISTEN_LOCK_ASSERT(head);

	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
	    head->so_error == 0) {
		error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			SOLISTEN_UNLOCK(head);
			return (error);
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
		error = EWOULDBLOCK;
	else
		error = 0;
	if (error) {
		SOLISTEN_UNLOCK(head);
		return (error);
	}
	so = TAILQ_FIRST(&head->sol_comp);
	SOCK_LOCK(so);
	KASSERT(so->so_qstate == SQ_COMP,
	    ("%s: so %p not SQ_COMP", __func__, so));
	soref(so);
	head->sol_qlen--;
	so->so_qstate = SQ_NONE;
	so->so_listen = NULL;
	TAILQ_REMOVE(&head->sol_comp, so, so_list);
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	SOCK_UNLOCK(so);
	sorele(head);

	*ret = so;
	return (0);
}

/*
 * Evaluate the reference count and named references on a socket; if no
 * references remain, free it.  This should be called whenever a reference is
 * released, such as in sorele(), but also when named reference flags are
 * cleared in socket or protocol code.
 *
 * sofree() will free the socket if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, so a process has been
 *   notified that it is present.  If it is removed, the user process may
 *   block in accept() despite select() saying the socket was ready.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;

	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
		SOCK_UNLOCK(so);
		return;
	}

	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
		struct socket *sol;

		sol = so->so_listen;
		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));

		/*
		 * To solve a race between close of a listening socket and
		 * a socket on its incomplete queue, we need to lock both.
		 * The order is first listening socket, then regular.
		 * Since we don't have SS_NOFDREF nor SS_PROTOREF, this
		 * function and the listening socket are the only pointers
		 * to so.  To preserve so and sol, we reference both and then
		 * relock.
		 * After relock the socket may not move to so_comp since it
		 * doesn't have PCB already, but it may be removed from
		 * so_incomp.  If that happens, we share responsibility on
		 * freeing the socket, but soclose() has already removed
		 * it from queue.
		 */
		soref(sol);
		soref(so);
		SOCK_UNLOCK(so);
		SOLISTEN_LOCK(sol);
		SOCK_LOCK(so);
		if (so->so_qstate == SQ_INCOMP) {
			KASSERT(so->so_listen == sol,
			    ("%s: so %p migrated out of sol %p",
			    __func__, so, sol));
			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
			sol->sol_incqlen--;
			/* This is guaranteed not to be the last. */
			refcount_release(&sol->so_count);
			so->so_qstate = SQ_NONE;
			so->so_listen = NULL;
		} else
			KASSERT(so->so_listen == NULL,
			    ("%s: so %p not on (in)comp with so_listen",
			    __func__, so));
		sorele(sol);
		KASSERT(so->so_count == 1,
		    ("%s: so %p count %u", __func__, so, so->so_count));
		so->so_count = 0;
	}
	if (SOLISTENING(so))
		so->so_error = ECONNABORTED;
	SOCK_UNLOCK(so);

	if (so->so_dtor != NULL)
		so->so_dtor(so);

	VNET_SO_ASSERT(so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct call to
	 * dom_dispose() and sbdestroy() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	if (!SOLISTENING(so)) {
		sbdestroy(&so->so_snd, so);
		sbdestroy(&so->so_rcv, so);
	}
	seldrain(&so->so_rdsel);
	seldrain(&so->so_wrsel);
	knlist_destroy(&so->so_rdsel.si_note);
	knlist_destroy(&so->so_wrsel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	struct accept_queue lqueue;
	bool listening;
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);

	SOCK_LOCK(so);
	if ((listening = (so->so_options & SO_ACCEPTCONN))) {
		struct socket *sp;

		TAILQ_INIT(&lqueue);
		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);

		so->sol_qlen = so->sol_incqlen = 0;

		TAILQ_FOREACH(sp, &lqueue, so_list) {
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			/* Guaranteed not to be the last. */
			refcount_release(&so->so_count);
		}
	}
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	if (listening) {
		struct socket *sp, *tsp;

		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
			SOCK_LOCK(sp);
			if (sp->so_count == 0) {
				SOCK_UNLOCK(sp);
				soabort(sp);
			} else
				/* sp is now in sofree() */
				SOCK_UNLOCK(sp);
		}
	}
	CURVNET_RESTORE();
	return (error);
}

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 */
void
soabort(struct socket *so)
{

	/*
	 * In as much as is possible, assert that no references to this
	 * socket are held.  This is not quite the same as asserting that the
	 * current thread is responsible for arranging for no references, but
	 * is as close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	VNET_SO_ASSERT(so);

	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
		(*so->so_proto->pr_usrreqs->pru_abort)(so);
	SOCK_LOCK(so);
	sofree(so);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	CURVNET_RESTORE();
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{

	return (soconnectat(AT_FDCWD, so, nam, td));
}

int
soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);

	CURVNET_SET(so->so_vnet);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		if (fd == AT_FDCWD) {
			error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
			    nam, td);
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
			    so, nam, td);
		}
	}
	CURVNET_RESTORE();

	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	CURVNET_SET(so1->so_vnet);
	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
	CURVNET_RESTORE();
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag, and
	 * there is nothing left to send, then use PRU_SEND_EOF instead of
	 * PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
	/* If there is more to send set PRUS_MORETOCOME */
	    (flags & MSG_MORETOCOME) ||
	    (resid > 0 && space > 0) ?
		PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
	int pru_flag;
#ifdef KERN_TLS
	struct ktls_session *tls;
	int tls_enq_cnt, tls_pruflag;
	uint8_t tls_rtype;

	tls = NULL;
	tls_rtype = TLS_RLTYPE_APP;
#endif
	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

#ifdef KERN_TLS
	tls_pruflag = 0;
	tls = ktls_hold(so->so_snd.sb_tls_info);
	if (tls != NULL) {
		if (tls->mode == TCP_TLS_MODE_SW)
			tls_pruflag = PRUS_NOTREADY;

		if (control != NULL) {
			struct cmsghdr *cm = mtod(control, struct cmsghdr *);

			if (clen >= sizeof(*cm) &&
			    cm->cmsg_type == TLS_SET_RECORD_TYPE) {
				tls_rtype = *((uint8_t *)CMSG_DATA(cm));
				clen = 0;
				m_freem(control);
				control = NULL;
				atomic = 1;
			}
		}
	}
#endif

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If resid is 0, which can happen
				 * only if we have control to send, then
				 * a single empty mbuf is returned.  This
				 * is a workaround to prevent protocol send
				 * methods from panicking.
				 */
#ifdef KERN_TLS
				if (tls != NULL) {
					top = m_uiotombuf(uio, M_WAITOK, space,
					    tls->params.max_frame_len,
					    M_NOMAP |
					    ((flags & MSG_EOR) ? M_EOR : 0));
					if (top != NULL) {
						ktls_frame(top, tls,
						    &tls_enq_cnt, tls_rtype);
					}
					tls_rtype = TLS_RLTYPE_APP;
				} else
#endif
					top = m_uiotombuf(uio, M_WAITOK, space,
					    (atomic ? max_hdr : 0),
					    (atomic ? M_PKTHDR : 0) |
					    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT;	/* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);

			pru_flag = (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (flags & MSG_MORETOCOME) ||
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

#ifdef KERN_TLS
			pru_flag |= tls_pruflag;
#endif

			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    pru_flag, top, addr, control, td);

			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}

#ifdef KERN_TLS
			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
				/*
				 * Note that error is intentionally
				 * ignored.
				 *
				 * Like sendfile(), we rely on the
				 * completion routine (pru_ready())
				 * to free the mbufs in the event that
				 * pru_send() encountered an error and
				 * did not append them to the sockbuf.
				 */
				soref(so);
				ktls_enqueue(top, so, tls_enq_cnt);
			}
#endif
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
#ifdef KERN_TLS
	if (tls != NULL)
		ktls_free(tls);
#endif
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	if (!SOLISTENING(so))
		error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
		    top, control, flags, td);
	else {
		m_freem(top);
		m_freem(control);
		error = ENOTCONN;
	}
	CURVNET_RESTORE();
	return (error);
}
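/*
 * A sketch of driving sosend() above from kernel code with a single-segment
 * system-space uio.  The helper name and its SOCKET_SEND_EXAMPLE guard are
 * illustrative only.
 */
#ifdef SOCKET_SEND_EXAMPLE
static int
socket_send_example(struct socket *so, void *buf, size_t len,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;

	/* No destination address, mbuf chain or control data. */
	return (sosend(so, NULL, &auio, NULL, NULL, 0, td));
}
#endif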
/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
	VNET_SO_ASSERT(so);

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
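/*
 * The receive side is driven symmetrically to the send sketch above.  This
 * assumes the soreceive() wrapper that pairs with sosend() (it dispatches to
 * pru_soreceive and is not shown in this excerpt); the helper name and its
 * SOCKET_RECEIVE_EXAMPLE guard are illustrative only.
 */
#ifdef SOCKET_RECEIVE_EXAMPLE
static int
socket_receive_example(struct socket *so, void *buf, size_t len,
    struct thread *td)
{
	struct uio auio;
	struct iovec aiov;
	int flags;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	flags = 0;

	/* No source address or control mbufs are requested back. */
	return (soreceive(so, NULL, &auio, NULL, NULL, &flags));
}
#endif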
1798 */ 1799 int 1800 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 1801 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1802 { 1803 struct mbuf *m, **mp; 1804 int flags, error, offset; 1805 ssize_t len; 1806 struct protosw *pr = so->so_proto; 1807 struct mbuf *nextrecord; 1808 int moff, type = 0; 1809 ssize_t orig_resid = uio->uio_resid; 1810 1811 mp = mp0; 1812 if (psa != NULL) 1813 *psa = NULL; 1814 if (controlp != NULL) 1815 *controlp = NULL; 1816 if (flagsp != NULL) 1817 flags = *flagsp &~ MSG_EOR; 1818 else 1819 flags = 0; 1820 if (flags & MSG_OOB) 1821 return (soreceive_rcvoob(so, uio, flags)); 1822 if (mp != NULL) 1823 *mp = NULL; 1824 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1825 && uio->uio_resid) { 1826 VNET_SO_ASSERT(so); 1827 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 1828 } 1829 1830 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 1831 if (error) 1832 return (error); 1833 1834 restart: 1835 SOCKBUF_LOCK(&so->so_rcv); 1836 m = so->so_rcv.sb_mb; 1837 /* 1838 * If we have less data than requested, block awaiting more (subject 1839 * to any timeout) if: 1840 * 1. the current count is less than the low water mark, or 1841 * 2. MSG_DONTWAIT is not set 1842 */ 1843 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1844 sbavail(&so->so_rcv) < uio->uio_resid) && 1845 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 1846 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1847 KASSERT(m != NULL || !sbavail(&so->so_rcv), 1848 ("receive: m == %p sbavail == %u", 1849 m, sbavail(&so->so_rcv))); 1850 if (so->so_error) { 1851 if (m != NULL) 1852 goto dontblock; 1853 error = so->so_error; 1854 if ((flags & MSG_PEEK) == 0) 1855 so->so_error = 0; 1856 SOCKBUF_UNLOCK(&so->so_rcv); 1857 goto release; 1858 } 1859 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1860 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1861 if (m == NULL) { 1862 SOCKBUF_UNLOCK(&so->so_rcv); 1863 goto release; 1864 } else 1865 goto dontblock; 1866 } 1867 for (; m != NULL; m = m->m_next) 1868 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1869 m = so->so_rcv.sb_mb; 1870 goto dontblock; 1871 } 1872 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1873 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1874 SOCKBUF_UNLOCK(&so->so_rcv); 1875 error = ENOTCONN; 1876 goto release; 1877 } 1878 if (uio->uio_resid == 0) { 1879 SOCKBUF_UNLOCK(&so->so_rcv); 1880 goto release; 1881 } 1882 if ((so->so_state & SS_NBIO) || 1883 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1884 SOCKBUF_UNLOCK(&so->so_rcv); 1885 error = EWOULDBLOCK; 1886 goto release; 1887 } 1888 SBLASTRECORDCHK(&so->so_rcv); 1889 SBLASTMBUFCHK(&so->so_rcv); 1890 error = sbwait(&so->so_rcv); 1891 SOCKBUF_UNLOCK(&so->so_rcv); 1892 if (error) 1893 goto release; 1894 goto restart; 1895 } 1896 dontblock: 1897 /* 1898 * From this point onward, we maintain 'nextrecord' as a cache of the 1899 * pointer to the next record in the socket buffer. We must keep the 1900 * various socket buffer pointers and local stack versions of the 1901 * pointers in sync, pushing out modifications before dropping the 1902 * socket buffer mutex, and re-reading them when picking it up. 1903 * 1904 * Otherwise, we will race with the network stack appending new data 1905 * or records onto the socket buffer by using inconsistent/stale 1906 * versions of the field, possibly resulting in socket buffer 1907 * corruption. 
1908 * 1909 * By holding the high-level sblock(), we prevent simultaneous 1910 * readers from pulling off the front of the socket buffer. 1911 */ 1912 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1913 if (uio->uio_td) 1914 uio->uio_td->td_ru.ru_msgrcv++; 1915 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1916 SBLASTRECORDCHK(&so->so_rcv); 1917 SBLASTMBUFCHK(&so->so_rcv); 1918 nextrecord = m->m_nextpkt; 1919 if (pr->pr_flags & PR_ADDR) { 1920 KASSERT(m->m_type == MT_SONAME, 1921 ("m->m_type == %d", m->m_type)); 1922 orig_resid = 0; 1923 if (psa != NULL) 1924 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1925 M_NOWAIT); 1926 if (flags & MSG_PEEK) { 1927 m = m->m_next; 1928 } else { 1929 sbfree(&so->so_rcv, m); 1930 so->so_rcv.sb_mb = m_free(m); 1931 m = so->so_rcv.sb_mb; 1932 sockbuf_pushsync(&so->so_rcv, nextrecord); 1933 } 1934 } 1935 1936 /* 1937 * Process one or more MT_CONTROL mbufs present before any data mbufs 1938 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1939 * just copy the data; if !MSG_PEEK, we call into the protocol to 1940 * perform externalization (or freeing if controlp == NULL). 1941 */ 1942 if (m != NULL && m->m_type == MT_CONTROL) { 1943 struct mbuf *cm = NULL, *cmn; 1944 struct mbuf **cme = &cm; 1945 1946 do { 1947 if (flags & MSG_PEEK) { 1948 if (controlp != NULL) { 1949 *controlp = m_copym(m, 0, m->m_len, 1950 M_NOWAIT); 1951 controlp = &(*controlp)->m_next; 1952 } 1953 m = m->m_next; 1954 } else { 1955 sbfree(&so->so_rcv, m); 1956 so->so_rcv.sb_mb = m->m_next; 1957 m->m_next = NULL; 1958 *cme = m; 1959 cme = &(*cme)->m_next; 1960 m = so->so_rcv.sb_mb; 1961 } 1962 } while (m != NULL && m->m_type == MT_CONTROL); 1963 if ((flags & MSG_PEEK) == 0) 1964 sockbuf_pushsync(&so->so_rcv, nextrecord); 1965 while (cm != NULL) { 1966 cmn = cm->m_next; 1967 cm->m_next = NULL; 1968 if (pr->pr_domain->dom_externalize != NULL) { 1969 SOCKBUF_UNLOCK(&so->so_rcv); 1970 VNET_SO_ASSERT(so); 1971 error = (*pr->pr_domain->dom_externalize) 1972 (cm, controlp, flags); 1973 SOCKBUF_LOCK(&so->so_rcv); 1974 } else if (controlp != NULL) 1975 *controlp = cm; 1976 else 1977 m_freem(cm); 1978 if (controlp != NULL) { 1979 orig_resid = 0; 1980 while (*controlp != NULL) 1981 controlp = &(*controlp)->m_next; 1982 } 1983 cm = cmn; 1984 } 1985 if (m != NULL) 1986 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1987 else 1988 nextrecord = so->so_rcv.sb_mb; 1989 orig_resid = 0; 1990 } 1991 if (m != NULL) { 1992 if ((flags & MSG_PEEK) == 0) { 1993 KASSERT(m->m_nextpkt == nextrecord, 1994 ("soreceive: post-control, nextrecord !sync")); 1995 if (nextrecord == NULL) { 1996 KASSERT(so->so_rcv.sb_mb == m, 1997 ("soreceive: post-control, sb_mb!=m")); 1998 KASSERT(so->so_rcv.sb_lastrecord == m, 1999 ("soreceive: post-control, lastrecord!=m")); 2000 } 2001 } 2002 type = m->m_type; 2003 if (type == MT_OOBDATA) 2004 flags |= MSG_OOB; 2005 } else { 2006 if ((flags & MSG_PEEK) == 0) { 2007 KASSERT(so->so_rcv.sb_mb == nextrecord, 2008 ("soreceive: sb_mb != nextrecord")); 2009 if (so->so_rcv.sb_mb == NULL) { 2010 KASSERT(so->so_rcv.sb_lastrecord == NULL, 2011 ("soreceive: sb_lastercord != NULL")); 2012 } 2013 } 2014 } 2015 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2016 SBLASTRECORDCHK(&so->so_rcv); 2017 SBLASTMBUFCHK(&so->so_rcv); 2018 2019 /* 2020 * Now continue to read any data mbufs off of the head of the socket 2021 * buffer until the read request is satisfied. 
Note that 'type' is 2022 * used to store the type of any mbuf reads that have happened so far 2023 * such that soreceive() can stop reading if the type changes, which 2024 * causes soreceive() to return only one of regular data and inline 2025 * out-of-band data in a single socket receive operation. 2026 */ 2027 moff = 0; 2028 offset = 0; 2029 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 2030 && error == 0) { 2031 /* 2032 * If the type of mbuf has changed since the last mbuf 2033 * examined ('type'), end the receive operation. 2034 */ 2035 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2036 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 2037 if (type != m->m_type) 2038 break; 2039 } else if (type == MT_OOBDATA) 2040 break; 2041 else 2042 KASSERT(m->m_type == MT_DATA, 2043 ("m->m_type == %d", m->m_type)); 2044 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 2045 len = uio->uio_resid; 2046 if (so->so_oobmark && len > so->so_oobmark - offset) 2047 len = so->so_oobmark - offset; 2048 if (len > m->m_len - moff) 2049 len = m->m_len - moff; 2050 /* 2051 * If mp is set, just pass back the mbufs. Otherwise copy 2052 * them out via the uio, then free. Sockbuf must be 2053 * consistent here (points to current mbuf, it points to next 2054 * record) when we drop priority; we must note any additions 2055 * to the sockbuf when we block interrupts again. 2056 */ 2057 if (mp == NULL) { 2058 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2059 SBLASTRECORDCHK(&so->so_rcv); 2060 SBLASTMBUFCHK(&so->so_rcv); 2061 SOCKBUF_UNLOCK(&so->so_rcv); 2062 if ((m->m_flags & M_NOMAP) != 0) 2063 error = m_unmappedtouio(m, moff, uio, (int)len); 2064 else 2065 error = uiomove(mtod(m, char *) + moff, 2066 (int)len, uio); 2067 SOCKBUF_LOCK(&so->so_rcv); 2068 if (error) { 2069 /* 2070 * The MT_SONAME mbuf has already been removed 2071 * from the record, so it is necessary to 2072 * remove the data mbufs, if any, to preserve 2073 * the invariant in the case of PR_ADDR that 2074 * requires MT_SONAME mbufs at the head of 2075 * each record. 2076 */ 2077 if (pr->pr_flags & PR_ATOMIC && 2078 ((flags & MSG_PEEK) == 0)) 2079 (void)sbdroprecord_locked(&so->so_rcv); 2080 SOCKBUF_UNLOCK(&so->so_rcv); 2081 goto release; 2082 } 2083 } else 2084 uio->uio_resid -= len; 2085 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2086 if (len == m->m_len - moff) { 2087 if (m->m_flags & M_EOR) 2088 flags |= MSG_EOR; 2089 if (flags & MSG_PEEK) { 2090 m = m->m_next; 2091 moff = 0; 2092 } else { 2093 nextrecord = m->m_nextpkt; 2094 sbfree(&so->so_rcv, m); 2095 if (mp != NULL) { 2096 m->m_nextpkt = NULL; 2097 *mp = m; 2098 mp = &m->m_next; 2099 so->so_rcv.sb_mb = m = m->m_next; 2100 *mp = NULL; 2101 } else { 2102 so->so_rcv.sb_mb = m_free(m); 2103 m = so->so_rcv.sb_mb; 2104 } 2105 sockbuf_pushsync(&so->so_rcv, nextrecord); 2106 SBLASTRECORDCHK(&so->so_rcv); 2107 SBLASTMBUFCHK(&so->so_rcv); 2108 } 2109 } else { 2110 if (flags & MSG_PEEK) 2111 moff += len; 2112 else { 2113 if (mp != NULL) { 2114 if (flags & MSG_DONTWAIT) { 2115 *mp = m_copym(m, 0, len, 2116 M_NOWAIT); 2117 if (*mp == NULL) { 2118 /* 2119 * m_copym() couldn't 2120 * allocate an mbuf. 2121 * Adjust uio_resid back 2122 * (it was adjusted 2123 * down by len bytes, 2124 * which we didn't end 2125 * up "copying" over). 
2126 */ 2127 uio->uio_resid += len; 2128 break; 2129 } 2130 } else { 2131 SOCKBUF_UNLOCK(&so->so_rcv); 2132 *mp = m_copym(m, 0, len, 2133 M_WAITOK); 2134 SOCKBUF_LOCK(&so->so_rcv); 2135 } 2136 } 2137 sbcut_locked(&so->so_rcv, len); 2138 } 2139 } 2140 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2141 if (so->so_oobmark) { 2142 if ((flags & MSG_PEEK) == 0) { 2143 so->so_oobmark -= len; 2144 if (so->so_oobmark == 0) { 2145 so->so_rcv.sb_state |= SBS_RCVATMARK; 2146 break; 2147 } 2148 } else { 2149 offset += len; 2150 if (offset == so->so_oobmark) 2151 break; 2152 } 2153 } 2154 if (flags & MSG_EOR) 2155 break; 2156 /* 2157 * If the MSG_WAITALL flag is set (for non-atomic socket), we 2158 * must not quit until "uio->uio_resid == 0" or an error 2159 * termination. If a signal/timeout occurs, return with a 2160 * short count but without error. Keep sockbuf locked 2161 * against other readers. 2162 */ 2163 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 2164 !sosendallatonce(so) && nextrecord == NULL) { 2165 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2166 if (so->so_error || 2167 so->so_rcv.sb_state & SBS_CANTRCVMORE) 2168 break; 2169 /* 2170 * Notify the protocol that some data has been 2171 * drained before blocking. 2172 */ 2173 if (pr->pr_flags & PR_WANTRCVD) { 2174 SOCKBUF_UNLOCK(&so->so_rcv); 2175 VNET_SO_ASSERT(so); 2176 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 2177 SOCKBUF_LOCK(&so->so_rcv); 2178 } 2179 SBLASTRECORDCHK(&so->so_rcv); 2180 SBLASTMBUFCHK(&so->so_rcv); 2181 /* 2182 * We could receive some data while was notifying 2183 * the protocol. Skip blocking in this case. 2184 */ 2185 if (so->so_rcv.sb_mb == NULL) { 2186 error = sbwait(&so->so_rcv); 2187 if (error) { 2188 SOCKBUF_UNLOCK(&so->so_rcv); 2189 goto release; 2190 } 2191 } 2192 m = so->so_rcv.sb_mb; 2193 if (m != NULL) 2194 nextrecord = m->m_nextpkt; 2195 } 2196 } 2197 2198 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2199 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 2200 flags |= MSG_TRUNC; 2201 if ((flags & MSG_PEEK) == 0) 2202 (void) sbdroprecord_locked(&so->so_rcv); 2203 } 2204 if ((flags & MSG_PEEK) == 0) { 2205 if (m == NULL) { 2206 /* 2207 * First part is an inline SB_EMPTY_FIXUP(). Second 2208 * part makes sure sb_lastrecord is up-to-date if 2209 * there is still data in the socket buffer. 2210 */ 2211 so->so_rcv.sb_mb = nextrecord; 2212 if (so->so_rcv.sb_mb == NULL) { 2213 so->so_rcv.sb_mbtail = NULL; 2214 so->so_rcv.sb_lastrecord = NULL; 2215 } else if (nextrecord->m_nextpkt == NULL) 2216 so->so_rcv.sb_lastrecord = nextrecord; 2217 } 2218 SBLASTRECORDCHK(&so->so_rcv); 2219 SBLASTMBUFCHK(&so->so_rcv); 2220 /* 2221 * If soreceive() is being done from the socket callback, 2222 * then don't need to generate ACK to peer to update window, 2223 * since ACK will be generated on return to TCP. 2224 */ 2225 if (!(flags & MSG_SOCALLBCK) && 2226 (pr->pr_flags & PR_WANTRCVD)) { 2227 SOCKBUF_UNLOCK(&so->so_rcv); 2228 VNET_SO_ASSERT(so); 2229 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 2230 SOCKBUF_LOCK(&so->so_rcv); 2231 } 2232 } 2233 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2234 if (orig_resid == uio->uio_resid && orig_resid && 2235 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 2236 SOCKBUF_UNLOCK(&so->so_rcv); 2237 goto restart; 2238 } 2239 SOCKBUF_UNLOCK(&so->so_rcv); 2240 2241 if (flagsp != NULL) 2242 *flagsp |= flags; 2243 release: 2244 sbunlock(&so->so_rcv); 2245 return (error); 2246 } 2247 2248 /* 2249 * Optimized version of soreceive() for stream (TCP) sockets. 
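 *
 * Illustrative sketch only (the structure name is hypothetical, not from
 * this file): a stream protocol opts into this fast path by pointing its
 * user-request table at it, e.g.
 *
 *	static struct pr_usrreqs hypothetical_stream_usrreqs = {
 *		.pru_soreceive =	soreceive_stream,
 *	};
 *
 * after which the soreceive() wrapper dispatches here rather than to
 * soreceive_generic() for sockets of that protocol.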
2250 */ 2251 int 2252 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 2253 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2254 { 2255 int len = 0, error = 0, flags, oresid; 2256 struct sockbuf *sb; 2257 struct mbuf *m, *n = NULL; 2258 2259 /* We only do stream sockets. */ 2260 if (so->so_type != SOCK_STREAM) 2261 return (EINVAL); 2262 if (psa != NULL) 2263 *psa = NULL; 2264 if (flagsp != NULL) 2265 flags = *flagsp &~ MSG_EOR; 2266 else 2267 flags = 0; 2268 if (controlp != NULL) 2269 *controlp = NULL; 2270 if (flags & MSG_OOB) 2271 return (soreceive_rcvoob(so, uio, flags)); 2272 if (mp0 != NULL) 2273 *mp0 = NULL; 2274 2275 sb = &so->so_rcv; 2276 2277 /* Prevent other readers from entering the socket. */ 2278 error = sblock(sb, SBLOCKWAIT(flags)); 2279 if (error) 2280 return (error); 2281 SOCKBUF_LOCK(sb); 2282 2283 /* Easy one, no space to copyout anything. */ 2284 if (uio->uio_resid == 0) { 2285 error = EINVAL; 2286 goto out; 2287 } 2288 oresid = uio->uio_resid; 2289 2290 /* We will never ever get anything unless we are or were connected. */ 2291 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 2292 error = ENOTCONN; 2293 goto out; 2294 } 2295 2296 restart: 2297 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2298 2299 /* Abort if socket has reported problems. */ 2300 if (so->so_error) { 2301 if (sbavail(sb) > 0) 2302 goto deliver; 2303 if (oresid > uio->uio_resid) 2304 goto out; 2305 error = so->so_error; 2306 if (!(flags & MSG_PEEK)) 2307 so->so_error = 0; 2308 goto out; 2309 } 2310 2311 /* Door is closed. Deliver what is left, if any. */ 2312 if (sb->sb_state & SBS_CANTRCVMORE) { 2313 if (sbavail(sb) > 0) 2314 goto deliver; 2315 else 2316 goto out; 2317 } 2318 2319 /* Socket buffer is empty and we shall not block. */ 2320 if (sbavail(sb) == 0 && 2321 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 2322 error = EAGAIN; 2323 goto out; 2324 } 2325 2326 /* Socket buffer got some data that we shall deliver now. */ 2327 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 2328 ((so->so_state & SS_NBIO) || 2329 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 2330 sbavail(sb) >= sb->sb_lowat || 2331 sbavail(sb) >= uio->uio_resid || 2332 sbavail(sb) >= sb->sb_hiwat) ) { 2333 goto deliver; 2334 } 2335 2336 /* On MSG_WAITALL we must wait until all data or error arrives. */ 2337 if ((flags & MSG_WAITALL) && 2338 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 2339 goto deliver; 2340 2341 /* 2342 * Wait and block until (more) data comes in. 2343 * NB: Drops the sockbuf lock during wait. 2344 */ 2345 error = sbwait(sb); 2346 if (error) 2347 goto out; 2348 goto restart; 2349 2350 deliver: 2351 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2352 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 2353 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 2354 2355 /* Statistics. */ 2356 if (uio->uio_td) 2357 uio->uio_td->td_ru.ru_msgrcv++; 2358 2359 /* Fill uio until full or current end of socket buffer is reached. */ 2360 len = min(uio->uio_resid, sbavail(sb)); 2361 if (mp0 != NULL) { 2362 /* Dequeue as many mbufs as possible. 
*/ 2363 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 2364 if (*mp0 == NULL) 2365 *mp0 = sb->sb_mb; 2366 else 2367 m_cat(*mp0, sb->sb_mb); 2368 for (m = sb->sb_mb; 2369 m != NULL && m->m_len <= len; 2370 m = m->m_next) { 2371 KASSERT(!(m->m_flags & M_NOTAVAIL), 2372 ("%s: m %p not available", __func__, m)); 2373 len -= m->m_len; 2374 uio->uio_resid -= m->m_len; 2375 sbfree(sb, m); 2376 n = m; 2377 } 2378 n->m_next = NULL; 2379 sb->sb_mb = m; 2380 sb->sb_lastrecord = sb->sb_mb; 2381 if (sb->sb_mb == NULL) 2382 SB_EMPTY_FIXUP(sb); 2383 } 2384 /* Copy the remainder. */ 2385 if (len > 0) { 2386 KASSERT(sb->sb_mb != NULL, 2387 ("%s: len > 0 && sb->sb_mb empty", __func__)); 2388 2389 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 2390 if (m == NULL) 2391 len = 0; /* Don't flush data from sockbuf. */ 2392 else 2393 uio->uio_resid -= len; 2394 if (*mp0 != NULL) 2395 m_cat(*mp0, m); 2396 else 2397 *mp0 = m; 2398 if (*mp0 == NULL) { 2399 error = ENOBUFS; 2400 goto out; 2401 } 2402 } 2403 } else { 2404 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2405 SOCKBUF_UNLOCK(sb); 2406 error = m_mbuftouio(uio, sb->sb_mb, len); 2407 SOCKBUF_LOCK(sb); 2408 if (error) 2409 goto out; 2410 } 2411 SBLASTRECORDCHK(sb); 2412 SBLASTMBUFCHK(sb); 2413 2414 /* 2415 * Remove the delivered data from the socket buffer unless we 2416 * were only peeking. 2417 */ 2418 if (!(flags & MSG_PEEK)) { 2419 if (len > 0) 2420 sbdrop_locked(sb, len); 2421 2422 /* Notify protocol that we drained some data. */ 2423 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2424 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2425 !(flags & MSG_SOCALLBCK))) { 2426 SOCKBUF_UNLOCK(sb); 2427 VNET_SO_ASSERT(so); 2428 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 2429 SOCKBUF_LOCK(sb); 2430 } 2431 } 2432 2433 /* 2434 * For MSG_WAITALL we may have to loop again and wait for 2435 * more data to come in. 2436 */ 2437 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2438 goto restart; 2439 out: 2440 SOCKBUF_LOCK_ASSERT(sb); 2441 SBLASTRECORDCHK(sb); 2442 SBLASTMBUFCHK(sb); 2443 SOCKBUF_UNLOCK(sb); 2444 sbunlock(sb); 2445 return (error); 2446 } 2447 2448 /* 2449 * Optimized version of soreceive() for simple datagram cases from userspace. 2450 * Unlike in the stream case, we're able to drop a datagram if copyout() 2451 * fails, and because we handle datagrams atomically, we don't need to use a 2452 * sleep lock to prevent I/O interlacing. 2453 */ 2454 int 2455 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 2456 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2457 { 2458 struct mbuf *m, *m2; 2459 int flags, error; 2460 ssize_t len; 2461 struct protosw *pr = so->so_proto; 2462 struct mbuf *nextrecord; 2463 2464 if (psa != NULL) 2465 *psa = NULL; 2466 if (controlp != NULL) 2467 *controlp = NULL; 2468 if (flagsp != NULL) 2469 flags = *flagsp &~ MSG_EOR; 2470 else 2471 flags = 0; 2472 2473 /* 2474 * For any complicated cases, fall back to the full 2475 * soreceive_generic(). 2476 */ 2477 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) 2478 return (soreceive_generic(so, psa, uio, mp0, controlp, 2479 flagsp)); 2480 2481 /* 2482 * Enforce restrictions on use. 
2483 */ 2484 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 2485 ("soreceive_dgram: wantrcvd")); 2486 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 2487 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 2488 ("soreceive_dgram: SBS_RCVATMARK")); 2489 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 2490 ("soreceive_dgram: P_CONNREQUIRED")); 2491 2492 /* 2493 * Loop blocking while waiting for a datagram. 2494 */ 2495 SOCKBUF_LOCK(&so->so_rcv); 2496 while ((m = so->so_rcv.sb_mb) == NULL) { 2497 KASSERT(sbavail(&so->so_rcv) == 0, 2498 ("soreceive_dgram: sb_mb NULL but sbavail %u", 2499 sbavail(&so->so_rcv))); 2500 if (so->so_error) { 2501 error = so->so_error; 2502 so->so_error = 0; 2503 SOCKBUF_UNLOCK(&so->so_rcv); 2504 return (error); 2505 } 2506 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 2507 uio->uio_resid == 0) { 2508 SOCKBUF_UNLOCK(&so->so_rcv); 2509 return (0); 2510 } 2511 if ((so->so_state & SS_NBIO) || 2512 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2513 SOCKBUF_UNLOCK(&so->so_rcv); 2514 return (EWOULDBLOCK); 2515 } 2516 SBLASTRECORDCHK(&so->so_rcv); 2517 SBLASTMBUFCHK(&so->so_rcv); 2518 error = sbwait(&so->so_rcv); 2519 if (error) { 2520 SOCKBUF_UNLOCK(&so->so_rcv); 2521 return (error); 2522 } 2523 } 2524 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2525 2526 if (uio->uio_td) 2527 uio->uio_td->td_ru.ru_msgrcv++; 2528 SBLASTRECORDCHK(&so->so_rcv); 2529 SBLASTMBUFCHK(&so->so_rcv); 2530 nextrecord = m->m_nextpkt; 2531 if (nextrecord == NULL) { 2532 KASSERT(so->so_rcv.sb_lastrecord == m, 2533 ("soreceive_dgram: lastrecord != m")); 2534 } 2535 2536 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 2537 ("soreceive_dgram: m_nextpkt != nextrecord")); 2538 2539 /* 2540 * Pull 'm' and its chain off the front of the packet queue. 2541 */ 2542 so->so_rcv.sb_mb = NULL; 2543 sockbuf_pushsync(&so->so_rcv, nextrecord); 2544 2545 /* 2546 * Walk 'm's chain and free that many bytes from the socket buffer. 2547 */ 2548 for (m2 = m; m2 != NULL; m2 = m2->m_next) 2549 sbfree(&so->so_rcv, m2); 2550 2551 /* 2552 * Do a few last checks before we let go of the lock. 2553 */ 2554 SBLASTRECORDCHK(&so->so_rcv); 2555 SBLASTMBUFCHK(&so->so_rcv); 2556 SOCKBUF_UNLOCK(&so->so_rcv); 2557 2558 if (pr->pr_flags & PR_ADDR) { 2559 KASSERT(m->m_type == MT_SONAME, 2560 ("m->m_type == %d", m->m_type)); 2561 if (psa != NULL) 2562 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2563 M_NOWAIT); 2564 m = m_free(m); 2565 } 2566 if (m == NULL) { 2567 /* XXXRW: Can this happen? */ 2568 return (0); 2569 } 2570 2571 /* 2572 * Packet to copyout() is now in 'm' and it is disconnected from the 2573 * queue. 2574 * 2575 * Process one or more MT_CONTROL mbufs present before any data mbufs 2576 * in the first mbuf chain on the socket buffer. We call into the 2577 * protocol to perform externalization (or freeing if controlp == 2578 * NULL). In some cases there can be only MT_CONTROL mbufs without 2579 * MT_DATA mbufs. 
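 *
 * (Illustrative aside, not from the original comment: such a record can
 * arise when a sender passes only ancillary data, e.g. a sendmsg(2) whose
 * msg_control carries descriptors while its data iovec is empty; the
 * control mbufs are still externalized or freed below.)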
2580 */ 2581 if (m->m_type == MT_CONTROL) { 2582 struct mbuf *cm = NULL, *cmn; 2583 struct mbuf **cme = &cm; 2584 2585 do { 2586 m2 = m->m_next; 2587 m->m_next = NULL; 2588 *cme = m; 2589 cme = &(*cme)->m_next; 2590 m = m2; 2591 } while (m != NULL && m->m_type == MT_CONTROL); 2592 while (cm != NULL) { 2593 cmn = cm->m_next; 2594 cm->m_next = NULL; 2595 if (pr->pr_domain->dom_externalize != NULL) { 2596 error = (*pr->pr_domain->dom_externalize) 2597 (cm, controlp, flags); 2598 } else if (controlp != NULL) 2599 *controlp = cm; 2600 else 2601 m_freem(cm); 2602 if (controlp != NULL) { 2603 while (*controlp != NULL) 2604 controlp = &(*controlp)->m_next; 2605 } 2606 cm = cmn; 2607 } 2608 } 2609 KASSERT(m == NULL || m->m_type == MT_DATA, 2610 ("soreceive_dgram: !data")); 2611 while (m != NULL && uio->uio_resid > 0) { 2612 len = uio->uio_resid; 2613 if (len > m->m_len) 2614 len = m->m_len; 2615 error = uiomove(mtod(m, char *), (int)len, uio); 2616 if (error) { 2617 m_freem(m); 2618 return (error); 2619 } 2620 if (len == m->m_len) 2621 m = m_free(m); 2622 else { 2623 m->m_data += len; 2624 m->m_len -= len; 2625 } 2626 } 2627 if (m != NULL) { 2628 flags |= MSG_TRUNC; 2629 m_freem(m); 2630 } 2631 if (flagsp != NULL) 2632 *flagsp |= flags; 2633 return (0); 2634 } 2635 2636 int 2637 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2638 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2639 { 2640 int error; 2641 2642 CURVNET_SET(so->so_vnet); 2643 if (!SOLISTENING(so)) 2644 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, 2645 mp0, controlp, flagsp)); 2646 else 2647 error = ENOTCONN; 2648 CURVNET_RESTORE(); 2649 return (error); 2650 } 2651 2652 int 2653 soshutdown(struct socket *so, int how) 2654 { 2655 struct protosw *pr = so->so_proto; 2656 int error, soerror_enotconn; 2657 2658 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 2659 return (EINVAL); 2660 2661 soerror_enotconn = 0; 2662 if ((so->so_state & 2663 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { 2664 /* 2665 * POSIX mandates us to return ENOTCONN when shutdown(2) is 2666 * invoked on a datagram sockets, however historically we would 2667 * actually tear socket down. This is known to be leveraged by 2668 * some applications to unblock process waiting in recvXXX(2) 2669 * by other process that it shares that socket with. Try to meet 2670 * both backward-compatibility and POSIX requirements by forcing 2671 * ENOTCONN but still asking protocol to perform pru_shutdown(). 2672 */ 2673 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) 2674 return (ENOTCONN); 2675 soerror_enotconn = 1; 2676 } 2677 2678 if (SOLISTENING(so)) { 2679 if (how != SHUT_WR) { 2680 SOLISTEN_LOCK(so); 2681 so->so_error = ECONNABORTED; 2682 solisten_wakeup(so); /* unlocks so */ 2683 } 2684 goto done; 2685 } 2686 2687 CURVNET_SET(so->so_vnet); 2688 if (pr->pr_usrreqs->pru_flush != NULL) 2689 (*pr->pr_usrreqs->pru_flush)(so, how); 2690 if (how != SHUT_WR) 2691 sorflush(so); 2692 if (how != SHUT_RD) { 2693 error = (*pr->pr_usrreqs->pru_shutdown)(so); 2694 wakeup(&so->so_timeo); 2695 CURVNET_RESTORE(); 2696 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); 2697 } 2698 wakeup(&so->so_timeo); 2699 CURVNET_RESTORE(); 2700 2701 done: 2702 return (soerror_enotconn ? 
ENOTCONN : 0); 2703 } 2704 2705 void 2706 sorflush(struct socket *so) 2707 { 2708 struct sockbuf *sb = &so->so_rcv; 2709 struct protosw *pr = so->so_proto; 2710 struct socket aso; 2711 2712 VNET_SO_ASSERT(so); 2713 2714 /* 2715 * In order to avoid calling dom_dispose with the socket buffer mutex 2716 * held, and in order to generally avoid holding the lock for a long 2717 * time, we make a copy of the socket buffer and clear the original 2718 * (except locks, state). The new socket buffer copy won't have 2719 * initialized locks so we can only call routines that won't use or 2720 * assert those locks. 2721 * 2722 * Dislodge threads currently blocked in receive and wait to acquire 2723 * a lock against other simultaneous readers before clearing the 2724 * socket buffer. Don't let our acquire be interrupted by a signal 2725 * despite any existing socket disposition on interruptable waiting. 2726 */ 2727 socantrcvmore(so); 2728 (void) sblock(sb, SBL_WAIT | SBL_NOINTR); 2729 2730 /* 2731 * Invalidate/clear most of the sockbuf structure, but leave selinfo 2732 * and mutex data unchanged. 2733 */ 2734 SOCKBUF_LOCK(sb); 2735 bzero(&aso, sizeof(aso)); 2736 aso.so_pcb = so->so_pcb; 2737 bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, 2738 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2739 bzero(&sb->sb_startzero, 2740 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2741 SOCKBUF_UNLOCK(sb); 2742 sbunlock(sb); 2743 2744 /* 2745 * Dispose of special rights and flush the copied socket. Don't call 2746 * any unsafe routines (that rely on locks being initialized) on aso. 2747 */ 2748 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 2749 (*pr->pr_domain->dom_dispose)(&aso); 2750 sbrelease_internal(&aso.so_rcv, so); 2751 } 2752 2753 /* 2754 * Wrapper for Socket established helper hook. 2755 * Parameters: socket, context of the hook point, hook id. 2756 */ 2757 static int inline 2758 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 2759 { 2760 struct socket_hhook_data hhook_data = { 2761 .so = so, 2762 .hctx = hctx, 2763 .m = NULL, 2764 .status = 0 2765 }; 2766 2767 CURVNET_SET(so->so_vnet); 2768 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 2769 CURVNET_RESTORE(); 2770 2771 /* Ugly but needed, since hhooks return void for now */ 2772 return (hhook_data.status); 2773 } 2774 2775 /* 2776 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 2777 * additional variant to handle the case where the option value needs to be 2778 * some kind of integer, but not a specific size. In addition to their use 2779 * here, these functions are also called by the protocol-level pr_ctloutput() 2780 * routines. 2781 */ 2782 int 2783 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 2784 { 2785 size_t valsize; 2786 2787 /* 2788 * If the user gives us more than we wanted, we ignore it, but if we 2789 * don't get the minimum length the caller wants, we return EINVAL. 2790 * On success, sopt->sopt_valsize is set to however much we actually 2791 * retrieved. 2792 */ 2793 if ((valsize = sopt->sopt_valsize) < minlen) 2794 return EINVAL; 2795 if (valsize > len) 2796 sopt->sopt_valsize = valsize = len; 2797 2798 if (sopt->sopt_td != NULL) 2799 return (copyin(sopt->sopt_val, buf, valsize)); 2800 2801 bcopy(sopt->sopt_val, buf, valsize); 2802 return (0); 2803 } 2804 2805 /* 2806 * Kernel version of setsockopt(2). 
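 *
 * A minimal usage sketch (not part of the original comment): an in-kernel
 * consumer holding a socket reference can enable keepalives with
 *
 *	int one = 1;
 *	int error;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
 *
 * Because sopt_td is left NULL by this helper, the option value is copied
 * with bcopy() rather than copyin(), so a kernel address is expected.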
2807 * 2808 * XXX: optlen is size_t, not socklen_t 2809 */ 2810 int 2811 so_setsockopt(struct socket *so, int level, int optname, void *optval, 2812 size_t optlen) 2813 { 2814 struct sockopt sopt; 2815 2816 sopt.sopt_level = level; 2817 sopt.sopt_name = optname; 2818 sopt.sopt_dir = SOPT_SET; 2819 sopt.sopt_val = optval; 2820 sopt.sopt_valsize = optlen; 2821 sopt.sopt_td = NULL; 2822 return (sosetopt(so, &sopt)); 2823 } 2824 2825 int 2826 sosetopt(struct socket *so, struct sockopt *sopt) 2827 { 2828 int error, optval; 2829 struct linger l; 2830 struct timeval tv; 2831 sbintime_t val; 2832 uint32_t val32; 2833 #ifdef MAC 2834 struct mac extmac; 2835 #endif 2836 2837 CURVNET_SET(so->so_vnet); 2838 error = 0; 2839 if (sopt->sopt_level != SOL_SOCKET) { 2840 if (so->so_proto->pr_ctloutput != NULL) 2841 error = (*so->so_proto->pr_ctloutput)(so, sopt); 2842 else 2843 error = ENOPROTOOPT; 2844 } else { 2845 switch (sopt->sopt_name) { 2846 case SO_ACCEPTFILTER: 2847 error = accept_filt_setopt(so, sopt); 2848 if (error) 2849 goto bad; 2850 break; 2851 2852 case SO_LINGER: 2853 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 2854 if (error) 2855 goto bad; 2856 if (l.l_linger < 0 || 2857 l.l_linger > USHRT_MAX || 2858 l.l_linger > (INT_MAX / hz)) { 2859 error = EDOM; 2860 goto bad; 2861 } 2862 SOCK_LOCK(so); 2863 so->so_linger = l.l_linger; 2864 if (l.l_onoff) 2865 so->so_options |= SO_LINGER; 2866 else 2867 so->so_options &= ~SO_LINGER; 2868 SOCK_UNLOCK(so); 2869 break; 2870 2871 case SO_DEBUG: 2872 case SO_KEEPALIVE: 2873 case SO_DONTROUTE: 2874 case SO_USELOOPBACK: 2875 case SO_BROADCAST: 2876 case SO_REUSEADDR: 2877 case SO_REUSEPORT: 2878 case SO_REUSEPORT_LB: 2879 case SO_OOBINLINE: 2880 case SO_TIMESTAMP: 2881 case SO_BINTIME: 2882 case SO_NOSIGPIPE: 2883 case SO_NO_DDP: 2884 case SO_NO_OFFLOAD: 2885 error = sooptcopyin(sopt, &optval, sizeof optval, 2886 sizeof optval); 2887 if (error) 2888 goto bad; 2889 SOCK_LOCK(so); 2890 if (optval) 2891 so->so_options |= sopt->sopt_name; 2892 else 2893 so->so_options &= ~sopt->sopt_name; 2894 SOCK_UNLOCK(so); 2895 break; 2896 2897 case SO_SETFIB: 2898 error = sooptcopyin(sopt, &optval, sizeof optval, 2899 sizeof optval); 2900 if (error) 2901 goto bad; 2902 2903 if (optval < 0 || optval >= rt_numfibs) { 2904 error = EINVAL; 2905 goto bad; 2906 } 2907 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 2908 (so->so_proto->pr_domain->dom_family == PF_INET6) || 2909 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 2910 so->so_fibnum = optval; 2911 else 2912 so->so_fibnum = 0; 2913 break; 2914 2915 case SO_USER_COOKIE: 2916 error = sooptcopyin(sopt, &val32, sizeof val32, 2917 sizeof val32); 2918 if (error) 2919 goto bad; 2920 so->so_user_cookie = val32; 2921 break; 2922 2923 case SO_SNDBUF: 2924 case SO_RCVBUF: 2925 case SO_SNDLOWAT: 2926 case SO_RCVLOWAT: 2927 error = sooptcopyin(sopt, &optval, sizeof optval, 2928 sizeof optval); 2929 if (error) 2930 goto bad; 2931 2932 /* 2933 * Values < 1 make no sense for any of these options, 2934 * so disallow them. 
2935 */ 2936 if (optval < 1) { 2937 error = EINVAL; 2938 goto bad; 2939 } 2940 2941 error = sbsetopt(so, sopt->sopt_name, optval); 2942 break; 2943 2944 case SO_SNDTIMEO: 2945 case SO_RCVTIMEO: 2946 #ifdef COMPAT_FREEBSD32 2947 if (SV_CURPROC_FLAG(SV_ILP32)) { 2948 struct timeval32 tv32; 2949 2950 error = sooptcopyin(sopt, &tv32, sizeof tv32, 2951 sizeof tv32); 2952 CP(tv32, tv, tv_sec); 2953 CP(tv32, tv, tv_usec); 2954 } else 2955 #endif 2956 error = sooptcopyin(sopt, &tv, sizeof tv, 2957 sizeof tv); 2958 if (error) 2959 goto bad; 2960 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 2961 tv.tv_usec >= 1000000) { 2962 error = EDOM; 2963 goto bad; 2964 } 2965 if (tv.tv_sec > INT32_MAX) 2966 val = SBT_MAX; 2967 else 2968 val = tvtosbt(tv); 2969 switch (sopt->sopt_name) { 2970 case SO_SNDTIMEO: 2971 so->so_snd.sb_timeo = val; 2972 break; 2973 case SO_RCVTIMEO: 2974 so->so_rcv.sb_timeo = val; 2975 break; 2976 } 2977 break; 2978 2979 case SO_LABEL: 2980 #ifdef MAC 2981 error = sooptcopyin(sopt, &extmac, sizeof extmac, 2982 sizeof extmac); 2983 if (error) 2984 goto bad; 2985 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 2986 so, &extmac); 2987 #else 2988 error = EOPNOTSUPP; 2989 #endif 2990 break; 2991 2992 case SO_TS_CLOCK: 2993 error = sooptcopyin(sopt, &optval, sizeof optval, 2994 sizeof optval); 2995 if (error) 2996 goto bad; 2997 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 2998 error = EINVAL; 2999 goto bad; 3000 } 3001 so->so_ts_clock = optval; 3002 break; 3003 3004 case SO_MAX_PACING_RATE: 3005 error = sooptcopyin(sopt, &val32, sizeof(val32), 3006 sizeof(val32)); 3007 if (error) 3008 goto bad; 3009 so->so_max_pacing_rate = val32; 3010 break; 3011 3012 default: 3013 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3014 error = hhook_run_socket(so, sopt, 3015 HHOOK_SOCKET_OPT); 3016 else 3017 error = ENOPROTOOPT; 3018 break; 3019 } 3020 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 3021 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 3022 } 3023 bad: 3024 CURVNET_RESTORE(); 3025 return (error); 3026 } 3027 3028 /* 3029 * Helper routine for getsockopt. 3030 */ 3031 int 3032 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 3033 { 3034 int error; 3035 size_t valsize; 3036 3037 error = 0; 3038 3039 /* 3040 * Documented get behavior is that we always return a value, possibly 3041 * truncated to fit in the user's buffer. Traditional behavior is 3042 * that we always tell the user precisely how much we copied, rather 3043 * than something useful like the total amount we had available for 3044 * her. Note that this interface is not idempotent; the entire 3045 * answer must be generated ahead of time. 
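 *
 * A short sketch of the common calling pattern (the option-value helper is
 * hypothetical, not taken from this file): a protocol-level getsockopt
 * handler typically finishes by handing its result back through this
 * routine,
 *
 *	int optval;
 *
 *	optval = hypothetical_option_value(so);
 *	return (sooptcopyout(sopt, &optval, sizeof(optval)));
 *
 * so truncation to the caller's buffer and the kernel/user copy choice are
 * handled in one place.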
3046 */ 3047 valsize = min(len, sopt->sopt_valsize); 3048 sopt->sopt_valsize = valsize; 3049 if (sopt->sopt_val != NULL) { 3050 if (sopt->sopt_td != NULL) 3051 error = copyout(buf, sopt->sopt_val, valsize); 3052 else 3053 bcopy(buf, sopt->sopt_val, valsize); 3054 } 3055 return (error); 3056 } 3057 3058 int 3059 sogetopt(struct socket *so, struct sockopt *sopt) 3060 { 3061 int error, optval; 3062 struct linger l; 3063 struct timeval tv; 3064 #ifdef MAC 3065 struct mac extmac; 3066 #endif 3067 3068 CURVNET_SET(so->so_vnet); 3069 error = 0; 3070 if (sopt->sopt_level != SOL_SOCKET) { 3071 if (so->so_proto->pr_ctloutput != NULL) 3072 error = (*so->so_proto->pr_ctloutput)(so, sopt); 3073 else 3074 error = ENOPROTOOPT; 3075 CURVNET_RESTORE(); 3076 return (error); 3077 } else { 3078 switch (sopt->sopt_name) { 3079 case SO_ACCEPTFILTER: 3080 error = accept_filt_getopt(so, sopt); 3081 break; 3082 3083 case SO_LINGER: 3084 SOCK_LOCK(so); 3085 l.l_onoff = so->so_options & SO_LINGER; 3086 l.l_linger = so->so_linger; 3087 SOCK_UNLOCK(so); 3088 error = sooptcopyout(sopt, &l, sizeof l); 3089 break; 3090 3091 case SO_USELOOPBACK: 3092 case SO_DONTROUTE: 3093 case SO_DEBUG: 3094 case SO_KEEPALIVE: 3095 case SO_REUSEADDR: 3096 case SO_REUSEPORT: 3097 case SO_REUSEPORT_LB: 3098 case SO_BROADCAST: 3099 case SO_OOBINLINE: 3100 case SO_ACCEPTCONN: 3101 case SO_TIMESTAMP: 3102 case SO_BINTIME: 3103 case SO_NOSIGPIPE: 3104 optval = so->so_options & sopt->sopt_name; 3105 integer: 3106 error = sooptcopyout(sopt, &optval, sizeof optval); 3107 break; 3108 3109 case SO_DOMAIN: 3110 optval = so->so_proto->pr_domain->dom_family; 3111 goto integer; 3112 3113 case SO_TYPE: 3114 optval = so->so_type; 3115 goto integer; 3116 3117 case SO_PROTOCOL: 3118 optval = so->so_proto->pr_protocol; 3119 goto integer; 3120 3121 case SO_ERROR: 3122 SOCK_LOCK(so); 3123 optval = so->so_error; 3124 so->so_error = 0; 3125 SOCK_UNLOCK(so); 3126 goto integer; 3127 3128 case SO_SNDBUF: 3129 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 3130 so->so_snd.sb_hiwat; 3131 goto integer; 3132 3133 case SO_RCVBUF: 3134 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 3135 so->so_rcv.sb_hiwat; 3136 goto integer; 3137 3138 case SO_SNDLOWAT: 3139 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 3140 so->so_snd.sb_lowat; 3141 goto integer; 3142 3143 case SO_RCVLOWAT: 3144 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 3145 so->so_rcv.sb_lowat; 3146 goto integer; 3147 3148 case SO_SNDTIMEO: 3149 case SO_RCVTIMEO: 3150 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 
3151 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 3152 #ifdef COMPAT_FREEBSD32 3153 if (SV_CURPROC_FLAG(SV_ILP32)) { 3154 struct timeval32 tv32; 3155 3156 CP(tv, tv32, tv_sec); 3157 CP(tv, tv32, tv_usec); 3158 error = sooptcopyout(sopt, &tv32, sizeof tv32); 3159 } else 3160 #endif 3161 error = sooptcopyout(sopt, &tv, sizeof tv); 3162 break; 3163 3164 case SO_LABEL: 3165 #ifdef MAC 3166 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3167 sizeof(extmac)); 3168 if (error) 3169 goto bad; 3170 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 3171 so, &extmac); 3172 if (error) 3173 goto bad; 3174 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3175 #else 3176 error = EOPNOTSUPP; 3177 #endif 3178 break; 3179 3180 case SO_PEERLABEL: 3181 #ifdef MAC 3182 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3183 sizeof(extmac)); 3184 if (error) 3185 goto bad; 3186 error = mac_getsockopt_peerlabel( 3187 sopt->sopt_td->td_ucred, so, &extmac); 3188 if (error) 3189 goto bad; 3190 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3191 #else 3192 error = EOPNOTSUPP; 3193 #endif 3194 break; 3195 3196 case SO_LISTENQLIMIT: 3197 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 3198 goto integer; 3199 3200 case SO_LISTENQLEN: 3201 optval = SOLISTENING(so) ? so->sol_qlen : 0; 3202 goto integer; 3203 3204 case SO_LISTENINCQLEN: 3205 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 3206 goto integer; 3207 3208 case SO_TS_CLOCK: 3209 optval = so->so_ts_clock; 3210 goto integer; 3211 3212 case SO_MAX_PACING_RATE: 3213 optval = so->so_max_pacing_rate; 3214 goto integer; 3215 3216 default: 3217 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3218 error = hhook_run_socket(so, sopt, 3219 HHOOK_SOCKET_OPT); 3220 else 3221 error = ENOPROTOOPT; 3222 break; 3223 } 3224 } 3225 #ifdef MAC 3226 bad: 3227 #endif 3228 CURVNET_RESTORE(); 3229 return (error); 3230 } 3231 3232 int 3233 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 3234 { 3235 struct mbuf *m, *m_prev; 3236 int sopt_size = sopt->sopt_valsize; 3237 3238 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3239 if (m == NULL) 3240 return ENOBUFS; 3241 if (sopt_size > MLEN) { 3242 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 3243 if ((m->m_flags & M_EXT) == 0) { 3244 m_free(m); 3245 return ENOBUFS; 3246 } 3247 m->m_len = min(MCLBYTES, sopt_size); 3248 } else { 3249 m->m_len = min(MLEN, sopt_size); 3250 } 3251 sopt_size -= m->m_len; 3252 *mp = m; 3253 m_prev = m; 3254 3255 while (sopt_size) { 3256 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3257 if (m == NULL) { 3258 m_freem(*mp); 3259 return ENOBUFS; 3260 } 3261 if (sopt_size > MLEN) { 3262 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 3263 M_NOWAIT); 3264 if ((m->m_flags & M_EXT) == 0) { 3265 m_freem(m); 3266 m_freem(*mp); 3267 return ENOBUFS; 3268 } 3269 m->m_len = min(MCLBYTES, sopt_size); 3270 } else { 3271 m->m_len = min(MLEN, sopt_size); 3272 } 3273 sopt_size -= m->m_len; 3274 m_prev->m_next = m; 3275 m_prev = m; 3276 } 3277 return (0); 3278 } 3279 3280 int 3281 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 3282 { 3283 struct mbuf *m0 = m; 3284 3285 if (sopt->sopt_val == NULL) 3286 return (0); 3287 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3288 if (sopt->sopt_td != NULL) { 3289 int error; 3290 3291 error = copyin(sopt->sopt_val, mtod(m, char *), 3292 m->m_len); 3293 if (error != 0) { 3294 m_freem(m0); 3295 return(error); 3296 } 3297 } else 3298 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 3299 sopt->sopt_valsize -= m->m_len; 3300 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3301 m = m->m_next; 3302 } 3303 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 3304 panic("ip6_sooptmcopyin"); 3305 return (0); 3306 } 3307 3308 int 3309 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 3310 { 3311 struct mbuf *m0 = m; 3312 size_t valsize = 0; 3313 3314 if (sopt->sopt_val == NULL) 3315 return (0); 3316 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3317 if (sopt->sopt_td != NULL) { 3318 int error; 3319 3320 error = copyout(mtod(m, char *), sopt->sopt_val, 3321 m->m_len); 3322 if (error != 0) { 3323 m_freem(m0); 3324 return(error); 3325 } 3326 } else 3327 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 3328 sopt->sopt_valsize -= m->m_len; 3329 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3330 valsize += m->m_len; 3331 m = m->m_next; 3332 } 3333 if (m != NULL) { 3334 /* enough soopt buffer should be given from user-land */ 3335 m_freem(m0); 3336 return(EINVAL); 3337 } 3338 sopt->sopt_valsize = valsize; 3339 return (0); 3340 } 3341 3342 /* 3343 * sohasoutofband(): protocol notifies socket layer of the arrival of new 3344 * out-of-band data, which will then notify socket consumers. 3345 */ 3346 void 3347 sohasoutofband(struct socket *so) 3348 { 3349 3350 if (so->so_sigio != NULL) 3351 pgsigio(&so->so_sigio, SIGURG, 0); 3352 selwakeuppri(&so->so_rdsel, PSOCK); 3353 } 3354 3355 int 3356 sopoll(struct socket *so, int events, struct ucred *active_cred, 3357 struct thread *td) 3358 { 3359 3360 /* 3361 * We do not need to set or assert curvnet as long as everyone uses 3362 * sopoll_generic(). 
3363 */ 3364 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, 3365 td)); 3366 } 3367 3368 int 3369 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 3370 struct thread *td) 3371 { 3372 int revents; 3373 3374 SOCK_LOCK(so); 3375 if (SOLISTENING(so)) { 3376 if (!(events & (POLLIN | POLLRDNORM))) 3377 revents = 0; 3378 else if (!TAILQ_EMPTY(&so->sol_comp)) 3379 revents = events & (POLLIN | POLLRDNORM); 3380 else if ((events & POLLINIGNEOF) == 0 && so->so_error) 3381 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; 3382 else { 3383 selrecord(td, &so->so_rdsel); 3384 revents = 0; 3385 } 3386 } else { 3387 revents = 0; 3388 SOCKBUF_LOCK(&so->so_snd); 3389 SOCKBUF_LOCK(&so->so_rcv); 3390 if (events & (POLLIN | POLLRDNORM)) 3391 if (soreadabledata(so)) 3392 revents |= events & (POLLIN | POLLRDNORM); 3393 if (events & (POLLOUT | POLLWRNORM)) 3394 if (sowriteable(so)) 3395 revents |= events & (POLLOUT | POLLWRNORM); 3396 if (events & (POLLPRI | POLLRDBAND)) 3397 if (so->so_oobmark || 3398 (so->so_rcv.sb_state & SBS_RCVATMARK)) 3399 revents |= events & (POLLPRI | POLLRDBAND); 3400 if ((events & POLLINIGNEOF) == 0) { 3401 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3402 revents |= events & (POLLIN | POLLRDNORM); 3403 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 3404 revents |= POLLHUP; 3405 } 3406 } 3407 if (revents == 0) { 3408 if (events & 3409 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 3410 selrecord(td, &so->so_rdsel); 3411 so->so_rcv.sb_flags |= SB_SEL; 3412 } 3413 if (events & (POLLOUT | POLLWRNORM)) { 3414 selrecord(td, &so->so_wrsel); 3415 so->so_snd.sb_flags |= SB_SEL; 3416 } 3417 } 3418 SOCKBUF_UNLOCK(&so->so_rcv); 3419 SOCKBUF_UNLOCK(&so->so_snd); 3420 } 3421 SOCK_UNLOCK(so); 3422 return (revents); 3423 } 3424 3425 int 3426 soo_kqfilter(struct file *fp, struct knote *kn) 3427 { 3428 struct socket *so = kn->kn_fp->f_data; 3429 struct sockbuf *sb; 3430 struct knlist *knl; 3431 3432 switch (kn->kn_filter) { 3433 case EVFILT_READ: 3434 kn->kn_fop = &soread_filtops; 3435 knl = &so->so_rdsel.si_note; 3436 sb = &so->so_rcv; 3437 break; 3438 case EVFILT_WRITE: 3439 kn->kn_fop = &sowrite_filtops; 3440 knl = &so->so_wrsel.si_note; 3441 sb = &so->so_snd; 3442 break; 3443 case EVFILT_EMPTY: 3444 kn->kn_fop = &soempty_filtops; 3445 knl = &so->so_wrsel.si_note; 3446 sb = &so->so_snd; 3447 break; 3448 default: 3449 return (EINVAL); 3450 } 3451 3452 SOCK_LOCK(so); 3453 if (SOLISTENING(so)) { 3454 knlist_add(knl, kn, 1); 3455 } else { 3456 SOCKBUF_LOCK(sb); 3457 knlist_add(knl, kn, 1); 3458 sb->sb_flags |= SB_KNOTE; 3459 SOCKBUF_UNLOCK(sb); 3460 } 3461 SOCK_UNLOCK(so); 3462 return (0); 3463 } 3464 3465 /* 3466 * Some routines that return EOPNOTSUPP for entry points that are not 3467 * supported by a protocol. Fill in as needed. 
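 *
 * Usage sketch (hypothetical protocol, not from this file): a protocol
 * that implements neither accept(2) nor out-of-band receive would simply
 * point the corresponding entries of its user-request table at the stubs,
 *
 *	static struct pr_usrreqs hypothetical_usrreqs = {
 *		.pru_accept =	pru_accept_notsupp,
 *		.pru_rcvoob =	pru_rcvoob_notsupp,
 *	};
 *
 * so that consumers receive EOPNOTSUPP instead of dereferencing a NULL
 * function pointer.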
3468 */ 3469 int 3470 pru_accept_notsupp(struct socket *so, struct sockaddr **nam) 3471 { 3472 3473 return EOPNOTSUPP; 3474 } 3475 3476 int 3477 pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) 3478 { 3479 3480 return EOPNOTSUPP; 3481 } 3482 3483 int 3484 pru_attach_notsupp(struct socket *so, int proto, struct thread *td) 3485 { 3486 3487 return EOPNOTSUPP; 3488 } 3489 3490 int 3491 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3492 { 3493 3494 return EOPNOTSUPP; 3495 } 3496 3497 int 3498 pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3499 struct thread *td) 3500 { 3501 3502 return EOPNOTSUPP; 3503 } 3504 3505 int 3506 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3507 { 3508 3509 return EOPNOTSUPP; 3510 } 3511 3512 int 3513 pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3514 struct thread *td) 3515 { 3516 3517 return EOPNOTSUPP; 3518 } 3519 3520 int 3521 pru_connect2_notsupp(struct socket *so1, struct socket *so2) 3522 { 3523 3524 return EOPNOTSUPP; 3525 } 3526 3527 int 3528 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, 3529 struct ifnet *ifp, struct thread *td) 3530 { 3531 3532 return EOPNOTSUPP; 3533 } 3534 3535 int 3536 pru_disconnect_notsupp(struct socket *so) 3537 { 3538 3539 return EOPNOTSUPP; 3540 } 3541 3542 int 3543 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) 3544 { 3545 3546 return EOPNOTSUPP; 3547 } 3548 3549 int 3550 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) 3551 { 3552 3553 return EOPNOTSUPP; 3554 } 3555 3556 int 3557 pru_rcvd_notsupp(struct socket *so, int flags) 3558 { 3559 3560 return EOPNOTSUPP; 3561 } 3562 3563 int 3564 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) 3565 { 3566 3567 return EOPNOTSUPP; 3568 } 3569 3570 int 3571 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, 3572 struct sockaddr *addr, struct mbuf *control, struct thread *td) 3573 { 3574 3575 return EOPNOTSUPP; 3576 } 3577 3578 int 3579 pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) 3580 { 3581 3582 return (EOPNOTSUPP); 3583 } 3584 3585 /* 3586 * This isn't really a ``null'' operation, but it's the default one and 3587 * doesn't do anything destructive. 
3588 */ 3589 int 3590 pru_sense_null(struct socket *so, struct stat *sb) 3591 { 3592 3593 sb->st_blksize = so->so_snd.sb_hiwat; 3594 return 0; 3595 } 3596 3597 int 3598 pru_shutdown_notsupp(struct socket *so) 3599 { 3600 3601 return EOPNOTSUPP; 3602 } 3603 3604 int 3605 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) 3606 { 3607 3608 return EOPNOTSUPP; 3609 } 3610 3611 int 3612 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, 3613 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 3614 { 3615 3616 return EOPNOTSUPP; 3617 } 3618 3619 int 3620 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, 3621 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3622 { 3623 3624 return EOPNOTSUPP; 3625 } 3626 3627 int 3628 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, 3629 struct thread *td) 3630 { 3631 3632 return EOPNOTSUPP; 3633 } 3634 3635 static void 3636 filt_sordetach(struct knote *kn) 3637 { 3638 struct socket *so = kn->kn_fp->f_data; 3639 3640 so_rdknl_lock(so); 3641 knlist_remove(&so->so_rdsel.si_note, kn, 1); 3642 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 3643 so->so_rcv.sb_flags &= ~SB_KNOTE; 3644 so_rdknl_unlock(so); 3645 } 3646 3647 /*ARGSUSED*/ 3648 static int 3649 filt_soread(struct knote *kn, long hint) 3650 { 3651 struct socket *so; 3652 3653 so = kn->kn_fp->f_data; 3654 3655 if (SOLISTENING(so)) { 3656 SOCK_LOCK_ASSERT(so); 3657 kn->kn_data = so->sol_qlen; 3658 if (so->so_error) { 3659 kn->kn_flags |= EV_EOF; 3660 kn->kn_fflags = so->so_error; 3661 return (1); 3662 } 3663 return (!TAILQ_EMPTY(&so->sol_comp)); 3664 } 3665 3666 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3667 3668 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 3669 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3670 kn->kn_flags |= EV_EOF; 3671 kn->kn_fflags = so->so_error; 3672 return (1); 3673 } else if (so->so_error) /* temporary udp error */ 3674 return (1); 3675 3676 if (kn->kn_sfflags & NOTE_LOWAT) { 3677 if (kn->kn_data >= kn->kn_sdata) 3678 return (1); 3679 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 3680 return (1); 3681 3682 /* This hook returning non-zero indicates an event, not error */ 3683 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 3684 } 3685 3686 static void 3687 filt_sowdetach(struct knote *kn) 3688 { 3689 struct socket *so = kn->kn_fp->f_data; 3690 3691 so_wrknl_lock(so); 3692 knlist_remove(&so->so_wrsel.si_note, kn, 1); 3693 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 3694 so->so_snd.sb_flags &= ~SB_KNOTE; 3695 so_wrknl_unlock(so); 3696 } 3697 3698 /*ARGSUSED*/ 3699 static int 3700 filt_sowrite(struct knote *kn, long hint) 3701 { 3702 struct socket *so; 3703 3704 so = kn->kn_fp->f_data; 3705 3706 if (SOLISTENING(so)) 3707 return (0); 3708 3709 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3710 kn->kn_data = sbspace(&so->so_snd); 3711 3712 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 3713 3714 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3715 kn->kn_flags |= EV_EOF; 3716 kn->kn_fflags = so->so_error; 3717 return (1); 3718 } else if (so->so_error) /* temporary udp error */ 3719 return (1); 3720 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3721 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3722 return (0); 3723 else if (kn->kn_sfflags & NOTE_LOWAT) 3724 return (kn->kn_data >= kn->kn_sdata); 3725 else 3726 return (kn->kn_data >= so->so_snd.sb_lowat); 3727 } 3728 3729 static int 3730 filt_soempty(struct knote *kn, long hint) 3731 { 
3732 struct socket *so; 3733 3734 so = kn->kn_fp->f_data; 3735 3736 if (SOLISTENING(so)) 3737 return (1); 3738 3739 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3740 kn->kn_data = sbused(&so->so_snd); 3741 3742 if (kn->kn_data == 0) 3743 return (1); 3744 else 3745 return (0); 3746 } 3747 3748 int 3749 socheckuid(struct socket *so, uid_t uid) 3750 { 3751 3752 if (so == NULL) 3753 return (EPERM); 3754 if (so->so_cred->cr_uid != uid) 3755 return (EPERM); 3756 return (0); 3757 } 3758 3759 /* 3760 * These functions are used by protocols to notify the socket layer (and its 3761 * consumers) of state changes in the sockets driven by protocol-side events. 3762 */ 3763 3764 /* 3765 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3766 * 3767 * Normal sequence from the active (originating) side is that 3768 * soisconnecting() is called during processing of connect() call, resulting 3769 * in an eventual call to soisconnected() if/when the connection is 3770 * established. When the connection is torn down soisdisconnecting() is 3771 * called during processing of disconnect() call, and soisdisconnected() is 3772 * called when the connection to the peer is totally severed. The semantics 3773 * of these routines are such that connectionless protocols can call 3774 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3775 * calls when setting up a ``connection'' takes no time. 3776 * 3777 * From the passive side, a socket is created with two queues of sockets: 3778 * so_incomp for connections in progress and so_comp for connections already 3779 * made and awaiting user acceptance. As a protocol is preparing incoming 3780 * connections, it creates a socket structure queued on so_incomp by calling 3781 * sonewconn(). When the connection is established, soisconnected() is 3782 * called, and transfers the socket structure to so_comp, making it available 3783 * to accept(). 3784 * 3785 * If a socket is closed with sockets on either so_incomp or so_comp, these 3786 * sockets are dropped. 3787 * 3788 * If higher-level protocols are implemented in the kernel, the wakeups done 3789 * here will sometimes cause software-interrupt process scheduling. 3790 */ 3791 void 3792 soisconnecting(struct socket *so) 3793 { 3794 3795 SOCK_LOCK(so); 3796 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3797 so->so_state |= SS_ISCONNECTING; 3798 SOCK_UNLOCK(so); 3799 } 3800 3801 void 3802 soisconnected(struct socket *so) 3803 { 3804 3805 SOCK_LOCK(so); 3806 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3807 so->so_state |= SS_ISCONNECTED; 3808 3809 if (so->so_qstate == SQ_INCOMP) { 3810 struct socket *head = so->so_listen; 3811 int ret; 3812 3813 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 3814 /* 3815 * Promoting a socket from incomplete queue to complete, we 3816 * need to go through reverse order of locking. We first do 3817 * trylock, and if that doesn't succeed, we go the hard way 3818 * leaving a reference and rechecking consistency after proper 3819 * locking. 3820 */ 3821 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 3822 soref(head); 3823 SOCK_UNLOCK(so); 3824 SOLISTEN_LOCK(head); 3825 SOCK_LOCK(so); 3826 if (__predict_false(head != so->so_listen)) { 3827 /* 3828 * The socket went off the listen queue, 3829 * should be lost race to close(2) of sol. 3830 * The socket is about to soabort(). 3831 */ 3832 SOCK_UNLOCK(so); 3833 sorele(head); 3834 return; 3835 } 3836 /* Not the last one, as so holds a ref. 
*/ 3837 refcount_release(&head->so_count); 3838 } 3839 again: 3840 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3841 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 3842 head->sol_incqlen--; 3843 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 3844 head->sol_qlen++; 3845 so->so_qstate = SQ_COMP; 3846 SOCK_UNLOCK(so); 3847 solisten_wakeup(head); /* unlocks */ 3848 } else { 3849 SOCKBUF_LOCK(&so->so_rcv); 3850 soupcall_set(so, SO_RCV, 3851 head->sol_accept_filter->accf_callback, 3852 head->sol_accept_filter_arg); 3853 so->so_options &= ~SO_ACCEPTFILTER; 3854 ret = head->sol_accept_filter->accf_callback(so, 3855 head->sol_accept_filter_arg, M_NOWAIT); 3856 if (ret == SU_ISCONNECTED) { 3857 soupcall_clear(so, SO_RCV); 3858 SOCKBUF_UNLOCK(&so->so_rcv); 3859 goto again; 3860 } 3861 SOCKBUF_UNLOCK(&so->so_rcv); 3862 SOCK_UNLOCK(so); 3863 SOLISTEN_UNLOCK(head); 3864 } 3865 return; 3866 } 3867 SOCK_UNLOCK(so); 3868 wakeup(&so->so_timeo); 3869 sorwakeup(so); 3870 sowwakeup(so); 3871 } 3872 3873 void 3874 soisdisconnecting(struct socket *so) 3875 { 3876 3877 SOCK_LOCK(so); 3878 so->so_state &= ~SS_ISCONNECTING; 3879 so->so_state |= SS_ISDISCONNECTING; 3880 3881 if (!SOLISTENING(so)) { 3882 SOCKBUF_LOCK(&so->so_rcv); 3883 socantrcvmore_locked(so); 3884 SOCKBUF_LOCK(&so->so_snd); 3885 socantsendmore_locked(so); 3886 } 3887 SOCK_UNLOCK(so); 3888 wakeup(&so->so_timeo); 3889 } 3890 3891 void 3892 soisdisconnected(struct socket *so) 3893 { 3894 3895 SOCK_LOCK(so); 3896 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3897 so->so_state |= SS_ISDISCONNECTED; 3898 3899 if (!SOLISTENING(so)) { 3900 SOCK_UNLOCK(so); 3901 SOCKBUF_LOCK(&so->so_rcv); 3902 socantrcvmore_locked(so); 3903 SOCKBUF_LOCK(&so->so_snd); 3904 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 3905 socantsendmore_locked(so); 3906 } else 3907 SOCK_UNLOCK(so); 3908 wakeup(&so->so_timeo); 3909 } 3910 3911 /* 3912 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3913 */ 3914 struct sockaddr * 3915 sodupsockaddr(const struct sockaddr *sa, int mflags) 3916 { 3917 struct sockaddr *sa2; 3918 3919 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3920 if (sa2) 3921 bcopy(sa, sa2, sa->sa_len); 3922 return sa2; 3923 } 3924 3925 /* 3926 * Register per-socket destructor. 3927 */ 3928 void 3929 sodtor_set(struct socket *so, so_dtor_t *func) 3930 { 3931 3932 SOCK_LOCK_ASSERT(so); 3933 so->so_dtor = func; 3934 } 3935 3936 /* 3937 * Register per-socket buffer upcalls. 
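 *
 * Brief sketch (the callback and argument names are hypothetical, not from
 * this file): a consumer registers a receive-buffer upcall while holding
 * the corresponding buffer lock, and clears it the same way when done:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	soupcall_set(so, SO_RCV, my_rcv_upcall, my_arg);
 *	SOCKBUF_UNLOCK(&so->so_rcv);
 *
 * soupcall_set() and soupcall_clear() both assert that the buffer lock is
 * held, so calling them unlocked is a bug.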
/*
 * Register per-socket destructor.
 */
void
sodtor_set(struct socket *so, so_dtor_t *func)
{

        SOCK_LOCK_ASSERT(so);
        so->so_dtor = func;
}

/*
 * Register per-socket buffer upcalls.
 */
void
soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
{
        struct sockbuf *sb;

        KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

        switch (which) {
        case SO_RCV:
                sb = &so->so_rcv;
                break;
        case SO_SND:
                sb = &so->so_snd;
                break;
        default:
                panic("soupcall_set: bad which");
        }
        SOCKBUF_LOCK_ASSERT(sb);
        sb->sb_upcall = func;
        sb->sb_upcallarg = arg;
        sb->sb_flags |= SB_UPCALL;
}

void
soupcall_clear(struct socket *so, int which)
{
        struct sockbuf *sb;

        KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));

        switch (which) {
        case SO_RCV:
                sb = &so->so_rcv;
                break;
        case SO_SND:
                sb = &so->so_snd;
                break;
        default:
                panic("soupcall_clear: bad which");
        }
        SOCKBUF_LOCK_ASSERT(sb);
        KASSERT(sb->sb_upcall != NULL,
            ("%s: so %p no upcall to clear", __func__, so));
        sb->sb_upcall = NULL;
        sb->sb_upcallarg = NULL;
        sb->sb_flags &= ~SB_UPCALL;
}
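/*
 * Illustrative sketch, not part of the implementation: an in-kernel consumer
 * registers and clears socket buffer upcalls with the corresponding socket
 * buffer lock held, much as the accept filter path in soisconnected() does:
 *
 *        SOCKBUF_LOCK(&so->so_rcv);
 *        soupcall_set(so, SO_RCV, example_rcv_upcall, arg);
 *        SOCKBUF_UNLOCK(&so->so_rcv);
 *        ...
 *        SOCKBUF_LOCK(&so->so_rcv);
 *        soupcall_clear(so, SO_RCV);
 *        SOCKBUF_UNLOCK(&so->so_rcv);
 *
 * where example_rcv_upcall is a hypothetical so_upcall_t callback that
 * returns SU_OK, or SU_ISCONNECTED when, like an accept filter, it decides
 * the socket is ready to be promoted to the complete queue.
 */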
void
solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
{

        SOLISTEN_LOCK_ASSERT(so);
        so->sol_upcall = func;
        so->sol_upcallarg = arg;
}

/*
 * knlist lock, unlock, and assertion callbacks for the read- and write-side
 * knotes.  A listening socket keeps its knotes under the socket lock; a data
 * socket keeps them under the corresponding socket buffer lock.
 */
static void
so_rdknl_lock(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so))
                SOCK_LOCK(so);
        else
                SOCKBUF_LOCK(&so->so_rcv);
}

static void
so_rdknl_unlock(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so))
                SOCK_UNLOCK(so);
        else
                SOCKBUF_UNLOCK(&so->so_rcv);
}

static void
so_rdknl_assert_locked(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so))
                SOCK_LOCK_ASSERT(so);
        else
                SOCKBUF_LOCK_ASSERT(&so->so_rcv);
}

static void
so_rdknl_assert_unlocked(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so))
                SOCK_UNLOCK_ASSERT(so);
        else
                SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
}

static void
so_wrknl_lock(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so))
                SOCK_LOCK(so);
        else
                SOCKBUF_LOCK(&so->so_snd);
}

static void
so_wrknl_unlock(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so))
                SOCK_UNLOCK(so);
        else
                SOCKBUF_UNLOCK(&so->so_snd);
}

static void
so_wrknl_assert_locked(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so))
                SOCK_LOCK_ASSERT(so);
        else
                SOCKBUF_LOCK_ASSERT(&so->so_snd);
}

static void
so_wrknl_assert_unlocked(void *arg)
{
        struct socket *so = arg;

        if (SOLISTENING(so))
                SOCK_UNLOCK_ASSERT(so);
        else
                SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
}

/*
 * Create an external-format (``xsocket'') structure using the information in
 * the kernel-format socket structure pointed to by so.  This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{

        bzero(xso, sizeof(*xso));
        xso->xso_len = sizeof(*xso);
        xso->xso_so = (uintptr_t)so;
        xso->so_type = so->so_type;
        xso->so_options = so->so_options;
        xso->so_linger = so->so_linger;
        xso->so_state = so->so_state;
        xso->so_pcb = (uintptr_t)so->so_pcb;
        xso->xso_protocol = so->so_proto->pr_protocol;
        xso->xso_family = so->so_proto->pr_domain->dom_family;
        xso->so_timeo = so->so_timeo;
        xso->so_error = so->so_error;
        xso->so_uid = so->so_cred->cr_uid;
        xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
        if (SOLISTENING(so)) {
                xso->so_qlen = so->sol_qlen;
                xso->so_incqlen = so->sol_incqlen;
                xso->so_qlimit = so->sol_qlimit;
                xso->so_oobmark = 0;
        } else {
                xso->so_state |= so->so_qstate;
                xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
                xso->so_oobmark = so->so_oobmark;
                sbtoxsockbuf(&so->so_snd, &xso->so_snd);
                sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
        }
}

/*
 * Functional accessors and wrappers for fields of struct socket, for use by
 * code that does not dereference the structure directly.
 */
struct sockbuf *
so_sockbuf_rcv(struct socket *so)
{

        return (&so->so_rcv);
}

struct sockbuf *
so_sockbuf_snd(struct socket *so)
{

        return (&so->so_snd);
}

int
so_state_get(const struct socket *so)
{

        return (so->so_state);
}

void
so_state_set(struct socket *so, int val)
{

        so->so_state = val;
}

int
so_options_get(const struct socket *so)
{

        return (so->so_options);
}

void
so_options_set(struct socket *so, int val)
{

        so->so_options = val;
}

int
so_error_get(const struct socket *so)
{

        return (so->so_error);
}

void
so_error_set(struct socket *so, int val)
{

        so->so_error = val;
}

int
so_linger_get(const struct socket *so)
{

        return (so->so_linger);
}

void
so_linger_set(struct socket *so, int val)
{

        KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
            ("%s: val %d out of range", __func__, val));

        so->so_linger = val;
}

struct protosw *
so_protosw_get(const struct socket *so)
{

        return (so->so_proto);
}

void
so_protosw_set(struct socket *so, struct protosw *val)
{

        so->so_proto = val;
}

void
so_sorwakeup(struct socket *so)
{

        sorwakeup(so);
}

void
so_sowwakeup(struct socket *so)
{

        sowwakeup(so);
}

void
so_sorwakeup_locked(struct socket *so)
{

        sorwakeup_locked(so);
}

void
so_sowwakeup_locked(struct socket *so)
{

        sowwakeup_locked(so);
}

void
so_lock(struct socket *so)
{

        SOCK_LOCK(so);
}

void
so_unlock(struct socket *so)
{

        SOCK_UNLOCK(so);
}
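/*
 * Illustrative sketch, not part of the implementation: the functional
 * accessors and wrappers above allow a consumer that does not (or cannot)
 * dereference struct socket directly, such as a protocol or offload module
 * built against a reduced view of the socket structure, to manipulate a
 * socket.  A hypothetical error path in such a consumer might do:
 *
 *        so_lock(so);
 *        so_error_set(so, ECONNRESET);
 *        so_unlock(so);
 *        so_sorwakeup(so);
 *
 * rather than writing so->so_error and taking the socket locks directly.
 */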