/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used to attempt to
 * free a socket when a reference is removed.  This is a socket layer private
 * interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref() and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs, the general rule is that callers do not set
 * curvnet.  Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
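 *
 * As an illustrative sketch only (not a verbatim copy of any one call
 * site), the explicit reference pattern described in the NOTE above looks
 * roughly like:
 *
 *	SOCK_LOCK(so);
 *	soref(so);		// take a reference so the socket survives
 *	SOCK_UNLOCK(so);
 *	... hand the socket off, e.g. to a new file descriptor ...
 *	SOCK_LOCK(so);
 *	sorele(so);		// drops the lock; may end up in sofree()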
103 */ 104 105 #include <sys/cdefs.h> 106 __FBSDID("$FreeBSD$"); 107 108 #include "opt_inet.h" 109 #include "opt_inet6.h" 110 #include "opt_compat.h" 111 #include "opt_sctp.h" 112 113 #include <sys/param.h> 114 #include <sys/systm.h> 115 #include <sys/fcntl.h> 116 #include <sys/limits.h> 117 #include <sys/lock.h> 118 #include <sys/mac.h> 119 #include <sys/malloc.h> 120 #include <sys/mbuf.h> 121 #include <sys/mutex.h> 122 #include <sys/domain.h> 123 #include <sys/file.h> /* for struct knote */ 124 #include <sys/hhook.h> 125 #include <sys/kernel.h> 126 #include <sys/khelp.h> 127 #include <sys/event.h> 128 #include <sys/eventhandler.h> 129 #include <sys/poll.h> 130 #include <sys/proc.h> 131 #include <sys/protosw.h> 132 #include <sys/socket.h> 133 #include <sys/socketvar.h> 134 #include <sys/resourcevar.h> 135 #include <net/route.h> 136 #include <sys/signalvar.h> 137 #include <sys/stat.h> 138 #include <sys/sx.h> 139 #include <sys/sysctl.h> 140 #include <sys/taskqueue.h> 141 #include <sys/uio.h> 142 #include <sys/jail.h> 143 #include <sys/syslog.h> 144 #include <netinet/in.h> 145 146 #include <net/vnet.h> 147 148 #include <security/mac/mac_framework.h> 149 150 #include <vm/uma.h> 151 152 #ifdef COMPAT_FREEBSD32 153 #include <sys/mount.h> 154 #include <sys/sysent.h> 155 #include <compat/freebsd32/freebsd32.h> 156 #endif 157 158 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 159 int flags); 160 static void so_rdknl_lock(void *); 161 static void so_rdknl_unlock(void *); 162 static void so_rdknl_assert_locked(void *); 163 static void so_rdknl_assert_unlocked(void *); 164 static void so_wrknl_lock(void *); 165 static void so_wrknl_unlock(void *); 166 static void so_wrknl_assert_locked(void *); 167 static void so_wrknl_assert_unlocked(void *); 168 169 static void filt_sordetach(struct knote *kn); 170 static int filt_soread(struct knote *kn, long hint); 171 static void filt_sowdetach(struct knote *kn); 172 static int filt_sowrite(struct knote *kn, long hint); 173 static int filt_soempty(struct knote *kn, long hint); 174 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); 175 fo_kqfilter_t soo_kqfilter; 176 177 static struct filterops soread_filtops = { 178 .f_isfd = 1, 179 .f_detach = filt_sordetach, 180 .f_event = filt_soread, 181 }; 182 static struct filterops sowrite_filtops = { 183 .f_isfd = 1, 184 .f_detach = filt_sowdetach, 185 .f_event = filt_sowrite, 186 }; 187 static struct filterops soempty_filtops = { 188 .f_isfd = 1, 189 .f_detach = filt_sowdetach, 190 .f_event = filt_soempty, 191 }; 192 193 so_gen_t so_gencnt; /* generation count for sockets */ 194 195 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 196 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 197 198 #define VNET_SO_ASSERT(so) \ 199 VNET_ASSERT(curvnet != NULL, \ 200 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 201 202 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); 203 #define V_socket_hhh VNET(socket_hhh) 204 205 /* 206 * Limit on the number of connections in the listen queue waiting 207 * for accept(2). 208 * NB: The original sysctl somaxconn is still available but hidden 209 * to prevent confusion about the actual purpose of this number. 
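 *
 * Illustrative usage (not part of the original comment): an administrator
 * can raise this limit at run time through the sysctl defined below, e.g.
 *
 *	# sysctl kern.ipc.soacceptqueue=1024
 *
 * after which solisten_proto() clamps any larger listen(2) backlog request
 * to the new value.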
210 */ 211 static u_int somaxconn = SOMAXCONN; 212 213 static int 214 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 215 { 216 int error; 217 int val; 218 219 val = somaxconn; 220 error = sysctl_handle_int(oidp, &val, 0, req); 221 if (error || !req->newptr ) 222 return (error); 223 224 /* 225 * The purpose of the UINT_MAX / 3 limit, is so that the formula 226 * 3 * so_qlimit / 2 227 * below, will not overflow. 228 */ 229 230 if (val < 1 || val > UINT_MAX / 3) 231 return (EINVAL); 232 233 somaxconn = val; 234 return (0); 235 } 236 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW, 237 0, sizeof(int), sysctl_somaxconn, "I", 238 "Maximum listen socket pending connection accept queue size"); 239 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 240 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP, 241 0, sizeof(int), sysctl_somaxconn, "I", 242 "Maximum listen socket pending connection accept queue size (compat)"); 243 244 static int numopensockets; 245 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 246 &numopensockets, 0, "Number of open sockets"); 247 248 /* 249 * accept_mtx locks down per-socket fields relating to accept queues. See 250 * socketvar.h for an annotation of the protected fields of struct socket. 251 */ 252 struct mtx accept_mtx; 253 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 254 255 /* 256 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 257 * so_gencnt field. 258 */ 259 static struct mtx so_global_mtx; 260 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 261 262 /* 263 * General IPC sysctl name space, used by sockets and a variety of other IPC 264 * types. 265 */ 266 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); 267 268 /* 269 * Initialize the socket subsystem and set up the socket 270 * memory allocator. 
271 */ 272 static uma_zone_t socket_zone; 273 int maxsockets; 274 275 static void 276 socket_zone_change(void *tag) 277 { 278 279 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 280 } 281 282 static void 283 socket_hhook_register(int subtype) 284 { 285 286 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, 287 &V_socket_hhh[subtype], 288 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 289 printf("%s: WARNING: unable to register hook\n", __func__); 290 } 291 292 static void 293 socket_hhook_deregister(int subtype) 294 { 295 296 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) 297 printf("%s: WARNING: unable to deregister hook\n", __func__); 298 } 299 300 static void 301 socket_init(void *tag) 302 { 303 304 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 305 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 306 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 307 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 308 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 309 EVENTHANDLER_PRI_FIRST); 310 } 311 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 312 313 static void 314 socket_vnet_init(const void *unused __unused) 315 { 316 int i; 317 318 /* We expect a contiguous range */ 319 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 320 socket_hhook_register(i); 321 } 322 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 323 socket_vnet_init, NULL); 324 325 static void 326 socket_vnet_uninit(const void *unused __unused) 327 { 328 int i; 329 330 for (i = 0; i <= HHOOK_SOCKET_LAST; i++) 331 socket_hhook_deregister(i); 332 } 333 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, 334 socket_vnet_uninit, NULL); 335 336 /* 337 * Initialise maxsockets. This SYSINIT must be run after 338 * tunable_mbinit(). 339 */ 340 static void 341 init_maxsockets(void *ignored) 342 { 343 344 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 345 maxsockets = imax(maxsockets, maxfiles); 346 } 347 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 348 349 /* 350 * Sysctl to get and set the maximum global sockets limit. Notify protocols 351 * of the change so that they can update their dependent limits as required. 352 */ 353 static int 354 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 355 { 356 int error, newmaxsockets; 357 358 newmaxsockets = maxsockets; 359 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 360 if (error == 0 && req->newptr) { 361 if (newmaxsockets > maxsockets && 362 newmaxsockets <= maxfiles) { 363 maxsockets = newmaxsockets; 364 EVENTHANDLER_INVOKE(maxsockets_change); 365 } else 366 error = EINVAL; 367 } 368 return (error); 369 } 370 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, 371 &maxsockets, 0, sysctl_maxsockets, "IU", 372 "Maximum number of sockets available"); 373 374 /* 375 * Socket operation routines. These routines are called by the routines in 376 * sys_socket.c or from a system process, and implement the semantics of 377 * socket operations by switching out to the protocol specific routines. 378 */ 379 380 /* 381 * Get a socket structure from our zone, and initialize it. Note that it 382 * would probably be better to allocate socket and PCB at the same time, but 383 * I'm not convinced that all the protocols can be easily modified to do 384 * this. 385 * 386 * soalloc() returns a socket with a ref count of 0. 
387 */ 388 static struct socket * 389 soalloc(struct vnet *vnet) 390 { 391 struct socket *so; 392 393 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 394 if (so == NULL) 395 return (NULL); 396 #ifdef MAC 397 if (mac_socket_init(so, M_NOWAIT) != 0) { 398 uma_zfree(socket_zone, so); 399 return (NULL); 400 } 401 #endif 402 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { 403 uma_zfree(socket_zone, so); 404 return (NULL); 405 } 406 407 /* 408 * The socket locking protocol allows to lock 2 sockets at a time, 409 * however, the first one must be a listening socket. WITNESS lacks 410 * a feature to change class of an existing lock, so we use DUPOK. 411 */ 412 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); 413 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); 414 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); 415 so->so_rcv.sb_sel = &so->so_rdsel; 416 so->so_snd.sb_sel = &so->so_wrsel; 417 sx_init(&so->so_snd.sb_sx, "so_snd_sx"); 418 sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); 419 TAILQ_INIT(&so->so_snd.sb_aiojobq); 420 TAILQ_INIT(&so->so_rcv.sb_aiojobq); 421 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); 422 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); 423 #ifdef VIMAGE 424 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 425 __func__, __LINE__, so)); 426 so->so_vnet = vnet; 427 #endif 428 /* We shouldn't need the so_global_mtx */ 429 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { 430 /* Do we need more comprehensive error returns? */ 431 uma_zfree(socket_zone, so); 432 return (NULL); 433 } 434 mtx_lock(&so_global_mtx); 435 so->so_gencnt = ++so_gencnt; 436 ++numopensockets; 437 #ifdef VIMAGE 438 vnet->vnet_sockcnt++; 439 #endif 440 mtx_unlock(&so_global_mtx); 441 442 return (so); 443 } 444 445 /* 446 * Free the storage associated with a socket at the socket layer, tear down 447 * locks, labels, etc. All protocol state is assumed already to have been 448 * torn down (and possibly never set up) by the caller. 449 */ 450 static void 451 sodealloc(struct socket *so) 452 { 453 454 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 455 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 456 457 mtx_lock(&so_global_mtx); 458 so->so_gencnt = ++so_gencnt; 459 --numopensockets; /* Could be below, but faster here. */ 460 #ifdef VIMAGE 461 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 462 __func__, __LINE__, so)); 463 so->so_vnet->vnet_sockcnt--; 464 #endif 465 mtx_unlock(&so_global_mtx); 466 #ifdef MAC 467 mac_socket_destroy(so); 468 #endif 469 hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); 470 471 crfree(so->so_cred); 472 khelp_destroy_osd(&so->osd); 473 if (SOLISTENING(so)) { 474 if (so->sol_accept_filter != NULL) 475 accept_filt_setopt(so, NULL); 476 } else { 477 if (so->so_rcv.sb_hiwat) 478 (void)chgsbsize(so->so_cred->cr_uidinfo, 479 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 480 if (so->so_snd.sb_hiwat) 481 (void)chgsbsize(so->so_cred->cr_uidinfo, 482 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 483 sx_destroy(&so->so_snd.sb_sx); 484 sx_destroy(&so->so_rcv.sb_sx); 485 SOCKBUF_LOCK_DESTROY(&so->so_snd); 486 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 487 } 488 mtx_destroy(&so->so_lock); 489 uma_zfree(socket_zone, so); 490 } 491 492 /* 493 * socreate returns a socket with a ref count of 1. The socket should be 494 * closed with soclose(). 
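 *
 * A minimal in-kernel consumer sketch (hypothetical; error handling
 * abbreviated, td assumed to be the calling thread) pairing the two calls:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	... use so via sobind()/soconnect()/sosend() ...
 *	soclose(so);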
495 */ 496 int 497 socreate(int dom, struct socket **aso, int type, int proto, 498 struct ucred *cred, struct thread *td) 499 { 500 struct protosw *prp; 501 struct socket *so; 502 int error; 503 504 if (proto) 505 prp = pffindproto(dom, proto, type); 506 else 507 prp = pffindtype(dom, type); 508 509 if (prp == NULL) { 510 /* No support for domain. */ 511 if (pffinddomain(dom) == NULL) 512 return (EAFNOSUPPORT); 513 /* No support for socket type. */ 514 if (proto == 0 && type != 0) 515 return (EPROTOTYPE); 516 return (EPROTONOSUPPORT); 517 } 518 if (prp->pr_usrreqs->pru_attach == NULL || 519 prp->pr_usrreqs->pru_attach == pru_attach_notsupp) 520 return (EPROTONOSUPPORT); 521 522 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) 523 return (EPROTONOSUPPORT); 524 525 if (prp->pr_type != type) 526 return (EPROTOTYPE); 527 so = soalloc(CRED_TO_VNET(cred)); 528 if (so == NULL) 529 return (ENOBUFS); 530 531 so->so_type = type; 532 so->so_cred = crhold(cred); 533 if ((prp->pr_domain->dom_family == PF_INET) || 534 (prp->pr_domain->dom_family == PF_INET6) || 535 (prp->pr_domain->dom_family == PF_ROUTE)) 536 so->so_fibnum = td->td_proc->p_fibnum; 537 else 538 so->so_fibnum = 0; 539 so->so_proto = prp; 540 #ifdef MAC 541 mac_socket_create(cred, so); 542 #endif 543 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 544 so_rdknl_assert_locked, so_rdknl_assert_unlocked); 545 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 546 so_wrknl_assert_locked, so_wrknl_assert_unlocked); 547 /* 548 * Auto-sizing of socket buffers is managed by the protocols and 549 * the appropriate flags must be set in the pru_attach function. 550 */ 551 CURVNET_SET(so->so_vnet); 552 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); 553 CURVNET_RESTORE(); 554 if (error) { 555 sodealloc(so); 556 return (error); 557 } 558 soref(so); 559 *aso = so; 560 return (0); 561 } 562 563 #ifdef REGRESSION 564 static int regression_sonewconn_earlytest = 1; 565 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 566 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 567 #endif 568 569 /* 570 * When an attempt at a new connection is noted on a socket which accepts 571 * connections, sonewconn is called. If the connection is possible (subject 572 * to space constraints, etc.) then we allocate a new structure, properly 573 * linked into the data structure of the original socket, and return this. 574 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. 575 * 576 * Note: the ref count on the socket is 0 on return. 
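 *
 * A protocol's connection-establishment path typically uses it roughly as
 * follows (sketch only; details vary by protocol):
 *
 *	so = sonewconn(head, SS_ISCONNECTED);
 *	if (so == NULL) {
 *		... listen queue full or allocation failed;
 *		    drop the new connection ...
 *	}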
577 */ 578 struct socket * 579 sonewconn(struct socket *head, int connstatus) 580 { 581 static struct timeval lastover; 582 static struct timeval overinterval = { 60, 0 }; 583 static int overcount; 584 585 struct socket *so; 586 u_int over; 587 588 SOLISTEN_LOCK(head); 589 over = (head->sol_qlen > 3 * head->sol_qlimit / 2); 590 SOLISTEN_UNLOCK(head); 591 #ifdef REGRESSION 592 if (regression_sonewconn_earlytest && over) { 593 #else 594 if (over) { 595 #endif 596 overcount++; 597 598 if (ratecheck(&lastover, &overinterval)) { 599 log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " 600 "%i already in queue awaiting acceptance " 601 "(%d occurrences)\n", 602 __func__, head->so_pcb, head->sol_qlen, overcount); 603 604 overcount = 0; 605 } 606 607 return (NULL); 608 } 609 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", 610 __func__, head)); 611 so = soalloc(head->so_vnet); 612 if (so == NULL) { 613 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 614 "limit reached or out of memory\n", 615 __func__, head->so_pcb); 616 return (NULL); 617 } 618 so->so_listen = head; 619 so->so_type = head->so_type; 620 so->so_linger = head->so_linger; 621 so->so_state = head->so_state | SS_NOFDREF; 622 so->so_fibnum = head->so_fibnum; 623 so->so_proto = head->so_proto; 624 so->so_cred = crhold(head->so_cred); 625 #ifdef MAC 626 mac_socket_newconn(head, so); 627 #endif 628 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 629 so_rdknl_assert_locked, so_rdknl_assert_unlocked); 630 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 631 so_wrknl_assert_locked, so_wrknl_assert_unlocked); 632 VNET_SO_ASSERT(head); 633 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { 634 sodealloc(so); 635 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 636 __func__, head->so_pcb); 637 return (NULL); 638 } 639 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 640 sodealloc(so); 641 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 642 __func__, head->so_pcb); 643 return (NULL); 644 } 645 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; 646 so->so_snd.sb_lowat = head->sol_sbsnd_lowat; 647 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; 648 so->so_snd.sb_timeo = head->sol_sbsnd_timeo; 649 so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; 650 so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; 651 652 SOLISTEN_LOCK(head); 653 if (head->sol_accept_filter != NULL) 654 connstatus = 0; 655 so->so_state |= connstatus; 656 so->so_options = head->so_options & ~SO_ACCEPTCONN; 657 soref(head); /* A socket on (in)complete queue refs head. */ 658 if (connstatus) { 659 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 660 so->so_qstate = SQ_COMP; 661 head->sol_qlen++; 662 solisten_wakeup(head); /* unlocks */ 663 } else { 664 /* 665 * Keep removing sockets from the head until there's room for 666 * us to insert on the tail. In pre-locking revisions, this 667 * was a simple if(), but as we could be racing with other 668 * threads and soabort() requires dropping locks, we must 669 * loop waiting for the condition to be true. 
670 */ 671 while (head->sol_incqlen > head->sol_qlimit) { 672 struct socket *sp; 673 674 sp = TAILQ_FIRST(&head->sol_incomp); 675 TAILQ_REMOVE(&head->sol_incomp, sp, so_list); 676 head->sol_incqlen--; 677 SOCK_LOCK(sp); 678 sp->so_qstate = SQ_NONE; 679 sp->so_listen = NULL; 680 SOCK_UNLOCK(sp); 681 sorele(head); /* does SOLISTEN_UNLOCK, head stays */ 682 soabort(sp); 683 SOLISTEN_LOCK(head); 684 } 685 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); 686 so->so_qstate = SQ_INCOMP; 687 head->sol_incqlen++; 688 SOLISTEN_UNLOCK(head); 689 } 690 return (so); 691 } 692 693 #ifdef SCTP 694 /* 695 * Socket part of sctp_peeloff(). Detach a new socket from an 696 * association. The new socket is returned with a reference. 697 */ 698 struct socket * 699 sopeeloff(struct socket *head) 700 { 701 struct socket *so; 702 703 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 704 __func__, __LINE__, head)); 705 so = soalloc(head->so_vnet); 706 if (so == NULL) { 707 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 708 "limit reached or out of memory\n", 709 __func__, head->so_pcb); 710 return (NULL); 711 } 712 so->so_type = head->so_type; 713 so->so_options = head->so_options; 714 so->so_linger = head->so_linger; 715 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; 716 so->so_fibnum = head->so_fibnum; 717 so->so_proto = head->so_proto; 718 so->so_cred = crhold(head->so_cred); 719 #ifdef MAC 720 mac_socket_newconn(head, so); 721 #endif 722 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, 723 so_rdknl_assert_locked, so_rdknl_assert_unlocked); 724 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, 725 so_wrknl_assert_locked, so_wrknl_assert_unlocked); 726 VNET_SO_ASSERT(head); 727 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 728 sodealloc(so); 729 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 730 __func__, head->so_pcb); 731 return (NULL); 732 } 733 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 734 sodealloc(so); 735 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 736 __func__, head->so_pcb); 737 return (NULL); 738 } 739 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 740 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 741 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 742 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 743 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 744 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 745 746 soref(so); 747 748 return (so); 749 } 750 #endif /* SCTP */ 751 752 int 753 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 754 { 755 int error; 756 757 CURVNET_SET(so->so_vnet); 758 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); 759 CURVNET_RESTORE(); 760 return (error); 761 } 762 763 int 764 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 765 { 766 int error; 767 768 CURVNET_SET(so->so_vnet); 769 error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); 770 CURVNET_RESTORE(); 771 return (error); 772 } 773 774 /* 775 * solisten() transitions a socket from a non-listening state to a listening 776 * state, but can also be used to update the listen queue depth on an 777 * existing listen socket. The protocol will call back into the sockets 778 * layer using solisten_proto_check() and solisten_proto() to check and set 779 * socket-layer listen state. 
Call backs are used so that the protocol can 780 * acquire both protocol and socket layer locks in whatever order is required 781 * by the protocol. 782 * 783 * Protocol implementors are advised to hold the socket lock across the 784 * socket-layer test and set to avoid races at the socket layer. 785 */ 786 int 787 solisten(struct socket *so, int backlog, struct thread *td) 788 { 789 int error; 790 791 CURVNET_SET(so->so_vnet); 792 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); 793 CURVNET_RESTORE(); 794 return (error); 795 } 796 797 int 798 solisten_proto_check(struct socket *so) 799 { 800 801 SOCK_LOCK_ASSERT(so); 802 803 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 804 SS_ISDISCONNECTING)) 805 return (EINVAL); 806 return (0); 807 } 808 809 void 810 solisten_proto(struct socket *so, int backlog) 811 { 812 int sbrcv_lowat, sbsnd_lowat; 813 u_int sbrcv_hiwat, sbsnd_hiwat; 814 short sbrcv_flags, sbsnd_flags; 815 sbintime_t sbrcv_timeo, sbsnd_timeo; 816 817 SOCK_LOCK_ASSERT(so); 818 819 if (SOLISTENING(so)) 820 goto listening; 821 822 /* 823 * Change this socket to listening state. 824 */ 825 sbrcv_lowat = so->so_rcv.sb_lowat; 826 sbsnd_lowat = so->so_snd.sb_lowat; 827 sbrcv_hiwat = so->so_rcv.sb_hiwat; 828 sbsnd_hiwat = so->so_snd.sb_hiwat; 829 sbrcv_flags = so->so_rcv.sb_flags; 830 sbsnd_flags = so->so_snd.sb_flags; 831 sbrcv_timeo = so->so_rcv.sb_timeo; 832 sbsnd_timeo = so->so_snd.sb_timeo; 833 834 sbdestroy(&so->so_snd, so); 835 sbdestroy(&so->so_rcv, so); 836 sx_destroy(&so->so_snd.sb_sx); 837 sx_destroy(&so->so_rcv.sb_sx); 838 SOCKBUF_LOCK_DESTROY(&so->so_snd); 839 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 840 841 #ifdef INVARIANTS 842 bzero(&so->so_rcv, 843 sizeof(struct socket) - offsetof(struct socket, so_rcv)); 844 #endif 845 846 so->sol_sbrcv_lowat = sbrcv_lowat; 847 so->sol_sbsnd_lowat = sbsnd_lowat; 848 so->sol_sbrcv_hiwat = sbrcv_hiwat; 849 so->sol_sbsnd_hiwat = sbsnd_hiwat; 850 so->sol_sbrcv_flags = sbrcv_flags; 851 so->sol_sbsnd_flags = sbsnd_flags; 852 so->sol_sbrcv_timeo = sbrcv_timeo; 853 so->sol_sbsnd_timeo = sbsnd_timeo; 854 855 so->sol_qlen = so->sol_incqlen = 0; 856 TAILQ_INIT(&so->sol_incomp); 857 TAILQ_INIT(&so->sol_comp); 858 859 so->sol_accept_filter = NULL; 860 so->sol_accept_filter_arg = NULL; 861 so->sol_accept_filter_str = NULL; 862 863 so->sol_upcall = NULL; 864 so->sol_upcallarg = NULL; 865 866 so->so_options |= SO_ACCEPTCONN; 867 868 listening: 869 if (backlog < 0 || backlog > somaxconn) 870 backlog = somaxconn; 871 so->sol_qlimit = backlog; 872 } 873 874 /* 875 * Wakeup listeners/subsystems once we have a complete connection. 876 * Enters with lock, returns unlocked. 877 */ 878 void 879 solisten_wakeup(struct socket *sol) 880 { 881 882 if (sol->sol_upcall != NULL) 883 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); 884 else { 885 selwakeuppri(&sol->so_rdsel, PSOCK); 886 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); 887 } 888 SOLISTEN_UNLOCK(sol); 889 wakeup_one(&sol->sol_comp); 890 } 891 892 /* 893 * Return single connection off a listening socket queue. Main consumer of 894 * the function is kern_accept4(). Some modules, that do their own accept 895 * management also use the function. 896 * 897 * Listening socket must be locked on entry and is returned unlocked on 898 * return. 899 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 
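 *
 * An accept-path caller might use it roughly like this (sketch; real
 * consumers add file descriptor setup and error handling, and sa is an
 * assumed struct sockaddr pointer):
 *
 *	SOLISTEN_LOCK(head);
 *	error = solisten_dequeue(head, &so, flags);	// returns unlocked
 *	if (error != 0)
 *		return (error);
 *	error = soaccept(so, &sa);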
 */
int
solisten_dequeue(struct socket *head, struct socket **ret, int flags)
{
	struct socket *so;
	int error;

	SOLISTEN_LOCK_ASSERT(head);

	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
	    head->so_error == 0) {
		error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			SOLISTEN_UNLOCK(head);
			return (error);
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		SOLISTEN_UNLOCK(head);
		return (error);
	}
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) {
		SOLISTEN_UNLOCK(head);
		return (EWOULDBLOCK);
	}
	so = TAILQ_FIRST(&head->sol_comp);
	SOCK_LOCK(so);
	KASSERT(so->so_qstate == SQ_COMP,
	    ("%s: so %p not SQ_COMP", __func__, so));
	soref(so);
	head->sol_qlen--;
	so->so_qstate = SQ_NONE;
	so->so_listen = NULL;
	TAILQ_REMOVE(&head->sol_comp, so, so_list);
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	SOCK_UNLOCK(so);
	sorele(head);

	*ret = so;
	return (0);
}

/*
 * Evaluate the reference count and named references on a socket; if no
 * references remain, free it.  This should be called whenever a reference is
 * released, such as in sorele(), but also when named reference flags are
 * cleared in socket or protocol code.
 *
 * sofree() will free the socket if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, where a process may
 *   have been notified that it is present.  If it were removed, the user
 *   process could block in accept() despite select() saying the socket was
 *   ready.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;

	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
		SOCK_UNLOCK(so);
		return;
	}

	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
		struct socket *sol;

		sol = so->so_listen;
		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));

		/*
		 * To solve the race between close of a listening socket and
		 * a socket on its incomplete queue, we need to lock both.
		 * The order is first listening socket, then regular.
		 * Since we have neither SS_NOFDREF nor SS_PROTOREF, this
		 * function and the listening socket are the only pointers
		 * to so.  To preserve so and sol, we reference both and then
		 * relock.
		 * After relocking, the socket may not move to so_comp since
		 * it no longer has a PCB, but it may be removed from
		 * so_incomp.  If that happens, we share responsibility for
		 * freeing the socket, but soclose() has already removed
		 * it from the queue.
		 */
		soref(sol);
		soref(so);
		SOCK_UNLOCK(so);
		SOLISTEN_LOCK(sol);
		SOCK_LOCK(so);
		if (so->so_qstate == SQ_INCOMP) {
			KASSERT(so->so_listen == sol,
			    ("%s: so %p migrated out of sol %p",
			    __func__, so, sol));
			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
			sol->sol_incqlen--;
			/* This is guaranteed not to be the last. */
			refcount_release(&sol->so_count);
			so->so_qstate = SQ_NONE;
			so->so_listen = NULL;
		} else
			KASSERT(so->so_listen == NULL,
			    ("%s: so %p not on (in)comp with so_listen",
			    __func__, so));
		sorele(sol);
		KASSERT(so->so_count == 1,
		    ("%s: so %p count %u", __func__, so, so->so_count));
		so->so_count = 0;
	}
	if (SOLISTENING(so))
		so->so_error = ECONNABORTED;
	SOCK_UNLOCK(so);

	VNET_SO_ASSERT(so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct calls to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	if (!SOLISTENING(so)) {
		sbdestroy(&so->so_snd, so);
		sbdestroy(&so->so_rcv, so);
	}
	seldrain(&so->so_rdsel);
	seldrain(&so->so_wrsel);
	knlist_destroy(&so->so_rdsel.si_note);
	knlist_destroy(&so->so_wrsel.si_note);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
1067 */ 1068 int 1069 soclose(struct socket *so) 1070 { 1071 struct accept_queue lqueue; 1072 bool listening; 1073 int error = 0; 1074 1075 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); 1076 1077 CURVNET_SET(so->so_vnet); 1078 funsetown(&so->so_sigio); 1079 if (so->so_state & SS_ISCONNECTED) { 1080 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 1081 error = sodisconnect(so); 1082 if (error) { 1083 if (error == ENOTCONN) 1084 error = 0; 1085 goto drop; 1086 } 1087 } 1088 if (so->so_options & SO_LINGER) { 1089 if ((so->so_state & SS_ISDISCONNECTING) && 1090 (so->so_state & SS_NBIO)) 1091 goto drop; 1092 while (so->so_state & SS_ISCONNECTED) { 1093 error = tsleep(&so->so_timeo, 1094 PSOCK | PCATCH, "soclos", 1095 so->so_linger * hz); 1096 if (error) 1097 break; 1098 } 1099 } 1100 } 1101 1102 drop: 1103 if (so->so_proto->pr_usrreqs->pru_close != NULL) 1104 (*so->so_proto->pr_usrreqs->pru_close)(so); 1105 1106 SOCK_LOCK(so); 1107 if ((listening = (so->so_options & SO_ACCEPTCONN))) { 1108 struct socket *sp; 1109 1110 TAILQ_INIT(&lqueue); 1111 TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); 1112 TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); 1113 1114 so->sol_qlen = so->sol_incqlen = 0; 1115 1116 TAILQ_FOREACH(sp, &lqueue, so_list) { 1117 SOCK_LOCK(sp); 1118 sp->so_qstate = SQ_NONE; 1119 sp->so_listen = NULL; 1120 SOCK_UNLOCK(sp); 1121 /* Guaranteed not to be the last. */ 1122 refcount_release(&so->so_count); 1123 } 1124 } 1125 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 1126 so->so_state |= SS_NOFDREF; 1127 sorele(so); 1128 if (listening) { 1129 struct socket *sp; 1130 1131 TAILQ_FOREACH(sp, &lqueue, so_list) { 1132 SOCK_LOCK(sp); 1133 if (sp->so_count == 0) { 1134 SOCK_UNLOCK(sp); 1135 soabort(sp); 1136 } else 1137 /* sp is now in sofree() */ 1138 SOCK_UNLOCK(sp); 1139 } 1140 } 1141 CURVNET_RESTORE(); 1142 return (error); 1143 } 1144 1145 /* 1146 * soabort() is used to abruptly tear down a connection, such as when a 1147 * resource limit is reached (listen queue depth exceeded), or if a listen 1148 * socket is closed while there are sockets waiting to be accepted. 1149 * 1150 * This interface is tricky, because it is called on an unreferenced socket, 1151 * and must be called only by a thread that has actually removed the socket 1152 * from the listen queue it was on, or races with other threads are risked. 1153 * 1154 * This interface will call into the protocol code, so must not be called 1155 * with any socket locks held. Protocols do call it while holding their own 1156 * recursible protocol mutexes, but this is something that should be subject 1157 * to review in the future. 1158 */ 1159 void 1160 soabort(struct socket *so) 1161 { 1162 1163 /* 1164 * In as much as is possible, assert that no references to this 1165 * socket are held. This is not quite the same as asserting that the 1166 * current thread is responsible for arranging for no references, but 1167 * is as close as we can get for now. 
1168 */ 1169 KASSERT(so->so_count == 0, ("soabort: so_count")); 1170 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 1171 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 1172 KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); 1173 VNET_SO_ASSERT(so); 1174 1175 if (so->so_proto->pr_usrreqs->pru_abort != NULL) 1176 (*so->so_proto->pr_usrreqs->pru_abort)(so); 1177 SOCK_LOCK(so); 1178 sofree(so); 1179 } 1180 1181 int 1182 soaccept(struct socket *so, struct sockaddr **nam) 1183 { 1184 int error; 1185 1186 SOCK_LOCK(so); 1187 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 1188 so->so_state &= ~SS_NOFDREF; 1189 SOCK_UNLOCK(so); 1190 1191 CURVNET_SET(so->so_vnet); 1192 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 1193 CURVNET_RESTORE(); 1194 return (error); 1195 } 1196 1197 int 1198 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 1199 { 1200 1201 return (soconnectat(AT_FDCWD, so, nam, td)); 1202 } 1203 1204 int 1205 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 1206 { 1207 int error; 1208 1209 if (so->so_options & SO_ACCEPTCONN) 1210 return (EOPNOTSUPP); 1211 1212 CURVNET_SET(so->so_vnet); 1213 /* 1214 * If protocol is connection-based, can only connect once. 1215 * Otherwise, if connected, try to disconnect first. This allows 1216 * user to disconnect by connecting to, e.g., a null address. 1217 */ 1218 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 1219 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 1220 (error = sodisconnect(so)))) { 1221 error = EISCONN; 1222 } else { 1223 /* 1224 * Prevent accumulated error from previous connection from 1225 * biting us. 1226 */ 1227 so->so_error = 0; 1228 if (fd == AT_FDCWD) { 1229 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, 1230 nam, td); 1231 } else { 1232 error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, 1233 so, nam, td); 1234 } 1235 } 1236 CURVNET_RESTORE(); 1237 1238 return (error); 1239 } 1240 1241 int 1242 soconnect2(struct socket *so1, struct socket *so2) 1243 { 1244 int error; 1245 1246 CURVNET_SET(so1->so_vnet); 1247 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); 1248 CURVNET_RESTORE(); 1249 return (error); 1250 } 1251 1252 int 1253 sodisconnect(struct socket *so) 1254 { 1255 int error; 1256 1257 if ((so->so_state & SS_ISCONNECTED) == 0) 1258 return (ENOTCONN); 1259 if (so->so_state & SS_ISDISCONNECTING) 1260 return (EALREADY); 1261 VNET_SO_ASSERT(so); 1262 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 1263 return (error); 1264 } 1265 1266 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 1267 1268 int 1269 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 1270 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1271 { 1272 long space; 1273 ssize_t resid; 1274 int clen = 0, error, dontroute; 1275 1276 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 1277 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 1278 ("sosend_dgram: !PR_ATOMIC")); 1279 1280 if (uio != NULL) 1281 resid = uio->uio_resid; 1282 else 1283 resid = top->m_pkthdr.len; 1284 /* 1285 * In theory resid should be unsigned. However, space must be 1286 * signed, as it might be less than 0 if we over-committed, and we 1287 * must use a signed comparison of space and resid. On the other 1288 * hand, a negative resid causes us to loop sending 0-length 1289 * segments to the protocol. 
1290 */ 1291 if (resid < 0) { 1292 error = EINVAL; 1293 goto out; 1294 } 1295 1296 dontroute = 1297 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 1298 if (td != NULL) 1299 td->td_ru.ru_msgsnd++; 1300 if (control != NULL) 1301 clen = control->m_len; 1302 1303 SOCKBUF_LOCK(&so->so_snd); 1304 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1305 SOCKBUF_UNLOCK(&so->so_snd); 1306 error = EPIPE; 1307 goto out; 1308 } 1309 if (so->so_error) { 1310 error = so->so_error; 1311 so->so_error = 0; 1312 SOCKBUF_UNLOCK(&so->so_snd); 1313 goto out; 1314 } 1315 if ((so->so_state & SS_ISCONNECTED) == 0) { 1316 /* 1317 * `sendto' and `sendmsg' is allowed on a connection-based 1318 * socket if it supports implied connect. Return ENOTCONN if 1319 * not connected and no address is supplied. 1320 */ 1321 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1322 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1323 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1324 !(resid == 0 && clen != 0)) { 1325 SOCKBUF_UNLOCK(&so->so_snd); 1326 error = ENOTCONN; 1327 goto out; 1328 } 1329 } else if (addr == NULL) { 1330 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1331 error = ENOTCONN; 1332 else 1333 error = EDESTADDRREQ; 1334 SOCKBUF_UNLOCK(&so->so_snd); 1335 goto out; 1336 } 1337 } 1338 1339 /* 1340 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1341 * problem and need fixing. 1342 */ 1343 space = sbspace(&so->so_snd); 1344 if (flags & MSG_OOB) 1345 space += 1024; 1346 space -= clen; 1347 SOCKBUF_UNLOCK(&so->so_snd); 1348 if (resid > space) { 1349 error = EMSGSIZE; 1350 goto out; 1351 } 1352 if (uio == NULL) { 1353 resid = 0; 1354 if (flags & MSG_EOR) 1355 top->m_flags |= M_EOR; 1356 } else { 1357 /* 1358 * Copy the data from userland into a mbuf chain. 1359 * If no data is to be copied in, a single empty mbuf 1360 * is returned. 1361 */ 1362 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 1363 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); 1364 if (top == NULL) { 1365 error = EFAULT; /* only possible error */ 1366 goto out; 1367 } 1368 space -= resid - uio->uio_resid; 1369 resid = uio->uio_resid; 1370 } 1371 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1372 /* 1373 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1374 * than with. 1375 */ 1376 if (dontroute) { 1377 SOCK_LOCK(so); 1378 so->so_options |= SO_DONTROUTE; 1379 SOCK_UNLOCK(so); 1380 } 1381 /* 1382 * XXX all the SBS_CANTSENDMORE checks previously done could be out 1383 * of date. We could have received a reset packet in an interrupt or 1384 * maybe we slept while doing page faults in uiomove() etc. We could 1385 * probably recheck again inside the locking protection here, but 1386 * there are probably other places that this also happens. We must 1387 * rethink this. 1388 */ 1389 VNET_SO_ASSERT(so); 1390 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1391 (flags & MSG_OOB) ? PRUS_OOB : 1392 /* 1393 * If the user set MSG_EOF, the protocol understands this flag and 1394 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 1395 */ 1396 ((flags & MSG_EOF) && 1397 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1398 (resid <= 0)) ? 1399 PRUS_EOF : 1400 /* If there is more to send set PRUS_MORETOCOME */ 1401 (flags & MSG_MORETOCOME) || 1402 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 1403 top, addr, control, td); 1404 if (dontroute) { 1405 SOCK_LOCK(so); 1406 so->so_options &= ~SO_DONTROUTE; 1407 SOCK_UNLOCK(so); 1408 } 1409 clen = 0; 1410 control = NULL; 1411 top = NULL; 1412 out: 1413 if (top != NULL) 1414 m_freem(top); 1415 if (control != NULL) 1416 m_freem(control); 1417 return (error); 1418 } 1419 1420 /* 1421 * Send on a socket. If send must go all at once and message is larger than 1422 * send buffering, then hard error. Lock against other senders. If must go 1423 * all at once and not enough room now, then inform user that this would 1424 * block and do nothing. Otherwise, if nonblocking, send as much as 1425 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1426 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1427 * in mbuf chain must be small enough to send all at once. 1428 * 1429 * Returns nonzero on error, timeout or signal; callers must check for short 1430 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1431 * on return. 1432 */ 1433 int 1434 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 1435 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1436 { 1437 long space; 1438 ssize_t resid; 1439 int clen = 0, error, dontroute; 1440 int atomic = sosendallatonce(so) || top; 1441 1442 if (uio != NULL) 1443 resid = uio->uio_resid; 1444 else 1445 resid = top->m_pkthdr.len; 1446 /* 1447 * In theory resid should be unsigned. However, space must be 1448 * signed, as it might be less than 0 if we over-committed, and we 1449 * must use a signed comparison of space and resid. On the other 1450 * hand, a negative resid causes us to loop sending 0-length 1451 * segments to the protocol. 1452 * 1453 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1454 * type sockets since that's an error. 1455 */ 1456 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1457 error = EINVAL; 1458 goto out; 1459 } 1460 1461 dontroute = 1462 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 1463 (so->so_proto->pr_flags & PR_ATOMIC); 1464 if (td != NULL) 1465 td->td_ru.ru_msgsnd++; 1466 if (control != NULL) 1467 clen = control->m_len; 1468 1469 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 1470 if (error) 1471 goto out; 1472 1473 restart: 1474 do { 1475 SOCKBUF_LOCK(&so->so_snd); 1476 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1477 SOCKBUF_UNLOCK(&so->so_snd); 1478 error = EPIPE; 1479 goto release; 1480 } 1481 if (so->so_error) { 1482 error = so->so_error; 1483 so->so_error = 0; 1484 SOCKBUF_UNLOCK(&so->so_snd); 1485 goto release; 1486 } 1487 if ((so->so_state & SS_ISCONNECTED) == 0) { 1488 /* 1489 * `sendto' and `sendmsg' is allowed on a connection- 1490 * based socket if it supports implied connect. 1491 * Return ENOTCONN if not connected and no address is 1492 * supplied. 
1493 */ 1494 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1495 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1496 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1497 !(resid == 0 && clen != 0)) { 1498 SOCKBUF_UNLOCK(&so->so_snd); 1499 error = ENOTCONN; 1500 goto release; 1501 } 1502 } else if (addr == NULL) { 1503 SOCKBUF_UNLOCK(&so->so_snd); 1504 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1505 error = ENOTCONN; 1506 else 1507 error = EDESTADDRREQ; 1508 goto release; 1509 } 1510 } 1511 space = sbspace(&so->so_snd); 1512 if (flags & MSG_OOB) 1513 space += 1024; 1514 if ((atomic && resid > so->so_snd.sb_hiwat) || 1515 clen > so->so_snd.sb_hiwat) { 1516 SOCKBUF_UNLOCK(&so->so_snd); 1517 error = EMSGSIZE; 1518 goto release; 1519 } 1520 if (space < resid + clen && 1521 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 1522 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { 1523 SOCKBUF_UNLOCK(&so->so_snd); 1524 error = EWOULDBLOCK; 1525 goto release; 1526 } 1527 error = sbwait(&so->so_snd); 1528 SOCKBUF_UNLOCK(&so->so_snd); 1529 if (error) 1530 goto release; 1531 goto restart; 1532 } 1533 SOCKBUF_UNLOCK(&so->so_snd); 1534 space -= clen; 1535 do { 1536 if (uio == NULL) { 1537 resid = 0; 1538 if (flags & MSG_EOR) 1539 top->m_flags |= M_EOR; 1540 } else { 1541 /* 1542 * Copy the data from userland into a mbuf 1543 * chain. If resid is 0, which can happen 1544 * only if we have control to send, then 1545 * a single empty mbuf is returned. This 1546 * is a workaround to prevent protocol send 1547 * methods to panic. 1548 */ 1549 top = m_uiotombuf(uio, M_WAITOK, space, 1550 (atomic ? max_hdr : 0), 1551 (atomic ? M_PKTHDR : 0) | 1552 ((flags & MSG_EOR) ? M_EOR : 0)); 1553 if (top == NULL) { 1554 error = EFAULT; /* only possible error */ 1555 goto release; 1556 } 1557 space -= resid - uio->uio_resid; 1558 resid = uio->uio_resid; 1559 } 1560 if (dontroute) { 1561 SOCK_LOCK(so); 1562 so->so_options |= SO_DONTROUTE; 1563 SOCK_UNLOCK(so); 1564 } 1565 /* 1566 * XXX all the SBS_CANTSENDMORE checks previously 1567 * done could be out of date. We could have received 1568 * a reset packet in an interrupt or maybe we slept 1569 * while doing page faults in uiomove() etc. We 1570 * could probably recheck again inside the locking 1571 * protection here, but there are probably other 1572 * places that this also happens. We must rethink 1573 * this. 1574 */ 1575 VNET_SO_ASSERT(so); 1576 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1577 (flags & MSG_OOB) ? PRUS_OOB : 1578 /* 1579 * If the user set MSG_EOF, the protocol understands 1580 * this flag and nothing left to send then use 1581 * PRU_SEND_EOF instead of PRU_SEND. 1582 */ 1583 ((flags & MSG_EOF) && 1584 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1585 (resid <= 0)) ? 1586 PRUS_EOF : 1587 /* If there is more to send set PRUS_MORETOCOME. */ 1588 (flags & MSG_MORETOCOME) || 1589 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 1590 top, addr, control, td); 1591 if (dontroute) { 1592 SOCK_LOCK(so); 1593 so->so_options &= ~SO_DONTROUTE; 1594 SOCK_UNLOCK(so); 1595 } 1596 clen = 0; 1597 control = NULL; 1598 top = NULL; 1599 if (error) 1600 goto release; 1601 } while (resid && space > 0); 1602 } while (resid); 1603 1604 release: 1605 sbunlock(&so->so_snd); 1606 out: 1607 if (top != NULL) 1608 m_freem(top); 1609 if (control != NULL) 1610 m_freem(control); 1611 return (error); 1612 } 1613 1614 int 1615 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1616 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1617 { 1618 int error; 1619 1620 CURVNET_SET(so->so_vnet); 1621 if (!SOLISTENING(so)) 1622 error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, 1623 top, control, flags, td); 1624 else { 1625 m_freem(top); 1626 m_freem(control); 1627 error = ENOTCONN; 1628 } 1629 CURVNET_RESTORE(); 1630 return (error); 1631 } 1632 1633 /* 1634 * The part of soreceive() that implements reading non-inline out-of-band 1635 * data from a socket. For more complete comments, see soreceive(), from 1636 * which this code originated. 1637 * 1638 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1639 * unable to return an mbuf chain to the caller. 1640 */ 1641 static int 1642 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1643 { 1644 struct protosw *pr = so->so_proto; 1645 struct mbuf *m; 1646 int error; 1647 1648 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1649 VNET_SO_ASSERT(so); 1650 1651 m = m_get(M_WAITOK, MT_DATA); 1652 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 1653 if (error) 1654 goto bad; 1655 do { 1656 error = uiomove(mtod(m, void *), 1657 (int) min(uio->uio_resid, m->m_len), uio); 1658 m = m_free(m); 1659 } while (uio->uio_resid && error == 0 && m); 1660 bad: 1661 if (m != NULL) 1662 m_freem(m); 1663 return (error); 1664 } 1665 1666 /* 1667 * Following replacement or removal of the first mbuf on the first mbuf chain 1668 * of a socket buffer, push necessary state changes back into the socket 1669 * buffer so that other consumers see the values consistently. 'nextrecord' 1670 * is the callers locally stored value of the original value of 1671 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1672 * NOTE: 'nextrecord' may be NULL. 1673 */ 1674 static __inline void 1675 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 1676 { 1677 1678 SOCKBUF_LOCK_ASSERT(sb); 1679 /* 1680 * First, update for the new value of nextrecord. If necessary, make 1681 * it the first record. 1682 */ 1683 if (sb->sb_mb != NULL) 1684 sb->sb_mb->m_nextpkt = nextrecord; 1685 else 1686 sb->sb_mb = nextrecord; 1687 1688 /* 1689 * Now update any dependent socket buffer fields to reflect the new 1690 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 1691 * addition of a second clause that takes care of the case where 1692 * sb_mb has been updated, but remains the last record. 1693 */ 1694 if (sb->sb_mb == NULL) { 1695 sb->sb_mbtail = NULL; 1696 sb->sb_lastrecord = NULL; 1697 } else if (sb->sb_mb->m_nextpkt == NULL) 1698 sb->sb_lastrecord = sb->sb_mb; 1699 } 1700 1701 /* 1702 * Implement receive operations on a socket. We depend on the way that 1703 * records are added to the sockbuf by sbappend. 
In particular, each record 1704 * (mbufs linked through m_next) must begin with an address if the protocol 1705 * so specifies, followed by an optional mbuf or mbufs containing ancillary 1706 * data, and then zero or more mbufs of data. In order to allow parallelism 1707 * between network receive and copying to user space, as well as avoid 1708 * sleeping with a mutex held, we release the socket buffer mutex during the 1709 * user space copy. Although the sockbuf is locked, new data may still be 1710 * appended, and thus we must maintain consistency of the sockbuf during that 1711 * time. 1712 * 1713 * The caller may receive the data as a single mbuf chain by supplying an 1714 * mbuf **mp0 for use in returning the chain. The uio is then used only for 1715 * the count in uio_resid. 1716 */ 1717 int 1718 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 1719 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1720 { 1721 struct mbuf *m, **mp; 1722 int flags, error, offset; 1723 ssize_t len; 1724 struct protosw *pr = so->so_proto; 1725 struct mbuf *nextrecord; 1726 int moff, type = 0; 1727 ssize_t orig_resid = uio->uio_resid; 1728 1729 mp = mp0; 1730 if (psa != NULL) 1731 *psa = NULL; 1732 if (controlp != NULL) 1733 *controlp = NULL; 1734 if (flagsp != NULL) 1735 flags = *flagsp &~ MSG_EOR; 1736 else 1737 flags = 0; 1738 if (flags & MSG_OOB) 1739 return (soreceive_rcvoob(so, uio, flags)); 1740 if (mp != NULL) 1741 *mp = NULL; 1742 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1743 && uio->uio_resid) { 1744 VNET_SO_ASSERT(so); 1745 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 1746 } 1747 1748 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 1749 if (error) 1750 return (error); 1751 1752 restart: 1753 SOCKBUF_LOCK(&so->so_rcv); 1754 m = so->so_rcv.sb_mb; 1755 /* 1756 * If we have less data than requested, block awaiting more (subject 1757 * to any timeout) if: 1758 * 1. the current count is less than the low water mark, or 1759 * 2. 
MSG_DONTWAIT is not set 1760 */ 1761 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1762 sbavail(&so->so_rcv) < uio->uio_resid) && 1763 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && 1764 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1765 KASSERT(m != NULL || !sbavail(&so->so_rcv), 1766 ("receive: m == %p sbavail == %u", 1767 m, sbavail(&so->so_rcv))); 1768 if (so->so_error) { 1769 if (m != NULL) 1770 goto dontblock; 1771 error = so->so_error; 1772 if ((flags & MSG_PEEK) == 0) 1773 so->so_error = 0; 1774 SOCKBUF_UNLOCK(&so->so_rcv); 1775 goto release; 1776 } 1777 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1778 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1779 if (m == NULL) { 1780 SOCKBUF_UNLOCK(&so->so_rcv); 1781 goto release; 1782 } else 1783 goto dontblock; 1784 } 1785 for (; m != NULL; m = m->m_next) 1786 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1787 m = so->so_rcv.sb_mb; 1788 goto dontblock; 1789 } 1790 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1791 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1792 SOCKBUF_UNLOCK(&so->so_rcv); 1793 error = ENOTCONN; 1794 goto release; 1795 } 1796 if (uio->uio_resid == 0) { 1797 SOCKBUF_UNLOCK(&so->so_rcv); 1798 goto release; 1799 } 1800 if ((so->so_state & SS_NBIO) || 1801 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1802 SOCKBUF_UNLOCK(&so->so_rcv); 1803 error = EWOULDBLOCK; 1804 goto release; 1805 } 1806 SBLASTRECORDCHK(&so->so_rcv); 1807 SBLASTMBUFCHK(&so->so_rcv); 1808 error = sbwait(&so->so_rcv); 1809 SOCKBUF_UNLOCK(&so->so_rcv); 1810 if (error) 1811 goto release; 1812 goto restart; 1813 } 1814 dontblock: 1815 /* 1816 * From this point onward, we maintain 'nextrecord' as a cache of the 1817 * pointer to the next record in the socket buffer. We must keep the 1818 * various socket buffer pointers and local stack versions of the 1819 * pointers in sync, pushing out modifications before dropping the 1820 * socket buffer mutex, and re-reading them when picking it up. 1821 * 1822 * Otherwise, we will race with the network stack appending new data 1823 * or records onto the socket buffer by using inconsistent/stale 1824 * versions of the field, possibly resulting in socket buffer 1825 * corruption. 1826 * 1827 * By holding the high-level sblock(), we prevent simultaneous 1828 * readers from pulling off the front of the socket buffer. 1829 */ 1830 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1831 if (uio->uio_td) 1832 uio->uio_td->td_ru.ru_msgrcv++; 1833 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1834 SBLASTRECORDCHK(&so->so_rcv); 1835 SBLASTMBUFCHK(&so->so_rcv); 1836 nextrecord = m->m_nextpkt; 1837 if (pr->pr_flags & PR_ADDR) { 1838 KASSERT(m->m_type == MT_SONAME, 1839 ("m->m_type == %d", m->m_type)); 1840 orig_resid = 0; 1841 if (psa != NULL) 1842 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1843 M_NOWAIT); 1844 if (flags & MSG_PEEK) { 1845 m = m->m_next; 1846 } else { 1847 sbfree(&so->so_rcv, m); 1848 so->so_rcv.sb_mb = m_free(m); 1849 m = so->so_rcv.sb_mb; 1850 sockbuf_pushsync(&so->so_rcv, nextrecord); 1851 } 1852 } 1853 1854 /* 1855 * Process one or more MT_CONTROL mbufs present before any data mbufs 1856 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1857 * just copy the data; if !MSG_PEEK, we call into the protocol to 1858 * perform externalization (or freeing if controlp == NULL). 
1859 */ 1860 if (m != NULL && m->m_type == MT_CONTROL) { 1861 struct mbuf *cm = NULL, *cmn; 1862 struct mbuf **cme = &cm; 1863 1864 do { 1865 if (flags & MSG_PEEK) { 1866 if (controlp != NULL) { 1867 *controlp = m_copym(m, 0, m->m_len, 1868 M_NOWAIT); 1869 controlp = &(*controlp)->m_next; 1870 } 1871 m = m->m_next; 1872 } else { 1873 sbfree(&so->so_rcv, m); 1874 so->so_rcv.sb_mb = m->m_next; 1875 m->m_next = NULL; 1876 *cme = m; 1877 cme = &(*cme)->m_next; 1878 m = so->so_rcv.sb_mb; 1879 } 1880 } while (m != NULL && m->m_type == MT_CONTROL); 1881 if ((flags & MSG_PEEK) == 0) 1882 sockbuf_pushsync(&so->so_rcv, nextrecord); 1883 while (cm != NULL) { 1884 cmn = cm->m_next; 1885 cm->m_next = NULL; 1886 if (pr->pr_domain->dom_externalize != NULL) { 1887 SOCKBUF_UNLOCK(&so->so_rcv); 1888 VNET_SO_ASSERT(so); 1889 error = (*pr->pr_domain->dom_externalize) 1890 (cm, controlp, flags); 1891 SOCKBUF_LOCK(&so->so_rcv); 1892 } else if (controlp != NULL) 1893 *controlp = cm; 1894 else 1895 m_freem(cm); 1896 if (controlp != NULL) { 1897 orig_resid = 0; 1898 while (*controlp != NULL) 1899 controlp = &(*controlp)->m_next; 1900 } 1901 cm = cmn; 1902 } 1903 if (m != NULL) 1904 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1905 else 1906 nextrecord = so->so_rcv.sb_mb; 1907 orig_resid = 0; 1908 } 1909 if (m != NULL) { 1910 if ((flags & MSG_PEEK) == 0) { 1911 KASSERT(m->m_nextpkt == nextrecord, 1912 ("soreceive: post-control, nextrecord !sync")); 1913 if (nextrecord == NULL) { 1914 KASSERT(so->so_rcv.sb_mb == m, 1915 ("soreceive: post-control, sb_mb!=m")); 1916 KASSERT(so->so_rcv.sb_lastrecord == m, 1917 ("soreceive: post-control, lastrecord!=m")); 1918 } 1919 } 1920 type = m->m_type; 1921 if (type == MT_OOBDATA) 1922 flags |= MSG_OOB; 1923 } else { 1924 if ((flags & MSG_PEEK) == 0) { 1925 KASSERT(so->so_rcv.sb_mb == nextrecord, 1926 ("soreceive: sb_mb != nextrecord")); 1927 if (so->so_rcv.sb_mb == NULL) { 1928 KASSERT(so->so_rcv.sb_lastrecord == NULL, 1929 ("soreceive: sb_lastercord != NULL")); 1930 } 1931 } 1932 } 1933 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1934 SBLASTRECORDCHK(&so->so_rcv); 1935 SBLASTMBUFCHK(&so->so_rcv); 1936 1937 /* 1938 * Now continue to read any data mbufs off of the head of the socket 1939 * buffer until the read request is satisfied. Note that 'type' is 1940 * used to store the type of any mbuf reads that have happened so far 1941 * such that soreceive() can stop reading if the type changes, which 1942 * causes soreceive() to return only one of regular data and inline 1943 * out-of-band data in a single socket receive operation. 1944 */ 1945 moff = 0; 1946 offset = 0; 1947 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 1948 && error == 0) { 1949 /* 1950 * If the type of mbuf has changed since the last mbuf 1951 * examined ('type'), end the receive operation. 1952 */ 1953 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1954 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 1955 if (type != m->m_type) 1956 break; 1957 } else if (type == MT_OOBDATA) 1958 break; 1959 else 1960 KASSERT(m->m_type == MT_DATA, 1961 ("m->m_type == %d", m->m_type)); 1962 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1963 len = uio->uio_resid; 1964 if (so->so_oobmark && len > so->so_oobmark - offset) 1965 len = so->so_oobmark - offset; 1966 if (len > m->m_len - moff) 1967 len = m->m_len - moff; 1968 /* 1969 * If mp is set, just pass back the mbufs. Otherwise copy 1970 * them out via the uio, then free. 
Sockbuf must be 1971 * consistent here (points to current mbuf, it points to next 1972 * record) when we drop priority; we must note any additions 1973 * to the sockbuf when we block interrupts again. 1974 */ 1975 if (mp == NULL) { 1976 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1977 SBLASTRECORDCHK(&so->so_rcv); 1978 SBLASTMBUFCHK(&so->so_rcv); 1979 SOCKBUF_UNLOCK(&so->so_rcv); 1980 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1981 SOCKBUF_LOCK(&so->so_rcv); 1982 if (error) { 1983 /* 1984 * The MT_SONAME mbuf has already been removed 1985 * from the record, so it is necessary to 1986 * remove the data mbufs, if any, to preserve 1987 * the invariant in the case of PR_ADDR that 1988 * requires MT_SONAME mbufs at the head of 1989 * each record. 1990 */ 1991 if (pr->pr_flags & PR_ATOMIC && 1992 ((flags & MSG_PEEK) == 0)) 1993 (void)sbdroprecord_locked(&so->so_rcv); 1994 SOCKBUF_UNLOCK(&so->so_rcv); 1995 goto release; 1996 } 1997 } else 1998 uio->uio_resid -= len; 1999 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2000 if (len == m->m_len - moff) { 2001 if (m->m_flags & M_EOR) 2002 flags |= MSG_EOR; 2003 if (flags & MSG_PEEK) { 2004 m = m->m_next; 2005 moff = 0; 2006 } else { 2007 nextrecord = m->m_nextpkt; 2008 sbfree(&so->so_rcv, m); 2009 if (mp != NULL) { 2010 m->m_nextpkt = NULL; 2011 *mp = m; 2012 mp = &m->m_next; 2013 so->so_rcv.sb_mb = m = m->m_next; 2014 *mp = NULL; 2015 } else { 2016 so->so_rcv.sb_mb = m_free(m); 2017 m = so->so_rcv.sb_mb; 2018 } 2019 sockbuf_pushsync(&so->so_rcv, nextrecord); 2020 SBLASTRECORDCHK(&so->so_rcv); 2021 SBLASTMBUFCHK(&so->so_rcv); 2022 } 2023 } else { 2024 if (flags & MSG_PEEK) 2025 moff += len; 2026 else { 2027 if (mp != NULL) { 2028 if (flags & MSG_DONTWAIT) { 2029 *mp = m_copym(m, 0, len, 2030 M_NOWAIT); 2031 if (*mp == NULL) { 2032 /* 2033 * m_copym() couldn't 2034 * allocate an mbuf. 2035 * Adjust uio_resid back 2036 * (it was adjusted 2037 * down by len bytes, 2038 * which we didn't end 2039 * up "copying" over). 2040 */ 2041 uio->uio_resid += len; 2042 break; 2043 } 2044 } else { 2045 SOCKBUF_UNLOCK(&so->so_rcv); 2046 *mp = m_copym(m, 0, len, 2047 M_WAITOK); 2048 SOCKBUF_LOCK(&so->so_rcv); 2049 } 2050 } 2051 sbcut_locked(&so->so_rcv, len); 2052 } 2053 } 2054 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2055 if (so->so_oobmark) { 2056 if ((flags & MSG_PEEK) == 0) { 2057 so->so_oobmark -= len; 2058 if (so->so_oobmark == 0) { 2059 so->so_rcv.sb_state |= SBS_RCVATMARK; 2060 break; 2061 } 2062 } else { 2063 offset += len; 2064 if (offset == so->so_oobmark) 2065 break; 2066 } 2067 } 2068 if (flags & MSG_EOR) 2069 break; 2070 /* 2071 * If the MSG_WAITALL flag is set (for non-atomic socket), we 2072 * must not quit until "uio->uio_resid == 0" or an error 2073 * termination. If a signal/timeout occurs, return with a 2074 * short count but without error. Keep sockbuf locked 2075 * against other readers. 2076 */ 2077 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 2078 !sosendallatonce(so) && nextrecord == NULL) { 2079 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2080 if (so->so_error || 2081 so->so_rcv.sb_state & SBS_CANTRCVMORE) 2082 break; 2083 /* 2084 * Notify the protocol that some data has been 2085 * drained before blocking. 2086 */ 2087 if (pr->pr_flags & PR_WANTRCVD) { 2088 SOCKBUF_UNLOCK(&so->so_rcv); 2089 VNET_SO_ASSERT(so); 2090 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 2091 SOCKBUF_LOCK(&so->so_rcv); 2092 } 2093 SBLASTRECORDCHK(&so->so_rcv); 2094 SBLASTMBUFCHK(&so->so_rcv); 2095 /* 2096 * We could receive some data while was notifying 2097 * the protocol. 
Skip blocking in this case. 2098 */ 2099 if (so->so_rcv.sb_mb == NULL) { 2100 error = sbwait(&so->so_rcv); 2101 if (error) { 2102 SOCKBUF_UNLOCK(&so->so_rcv); 2103 goto release; 2104 } 2105 } 2106 m = so->so_rcv.sb_mb; 2107 if (m != NULL) 2108 nextrecord = m->m_nextpkt; 2109 } 2110 } 2111 2112 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2113 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 2114 flags |= MSG_TRUNC; 2115 if ((flags & MSG_PEEK) == 0) 2116 (void) sbdroprecord_locked(&so->so_rcv); 2117 } 2118 if ((flags & MSG_PEEK) == 0) { 2119 if (m == NULL) { 2120 /* 2121 * First part is an inline SB_EMPTY_FIXUP(). Second 2122 * part makes sure sb_lastrecord is up-to-date if 2123 * there is still data in the socket buffer. 2124 */ 2125 so->so_rcv.sb_mb = nextrecord; 2126 if (so->so_rcv.sb_mb == NULL) { 2127 so->so_rcv.sb_mbtail = NULL; 2128 so->so_rcv.sb_lastrecord = NULL; 2129 } else if (nextrecord->m_nextpkt == NULL) 2130 so->so_rcv.sb_lastrecord = nextrecord; 2131 } 2132 SBLASTRECORDCHK(&so->so_rcv); 2133 SBLASTMBUFCHK(&so->so_rcv); 2134 /* 2135 * If soreceive() is being done from the socket callback, 2136 * then don't need to generate ACK to peer to update window, 2137 * since ACK will be generated on return to TCP. 2138 */ 2139 if (!(flags & MSG_SOCALLBCK) && 2140 (pr->pr_flags & PR_WANTRCVD)) { 2141 SOCKBUF_UNLOCK(&so->so_rcv); 2142 VNET_SO_ASSERT(so); 2143 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 2144 SOCKBUF_LOCK(&so->so_rcv); 2145 } 2146 } 2147 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2148 if (orig_resid == uio->uio_resid && orig_resid && 2149 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 2150 SOCKBUF_UNLOCK(&so->so_rcv); 2151 goto restart; 2152 } 2153 SOCKBUF_UNLOCK(&so->so_rcv); 2154 2155 if (flagsp != NULL) 2156 *flagsp |= flags; 2157 release: 2158 sbunlock(&so->so_rcv); 2159 return (error); 2160 } 2161 2162 /* 2163 * Optimized version of soreceive() for stream (TCP) sockets. 2164 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled. 2165 */ 2166 int 2167 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 2168 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2169 { 2170 int len = 0, error = 0, flags, oresid; 2171 struct sockbuf *sb; 2172 struct mbuf *m, *n = NULL; 2173 2174 /* We only do stream sockets. */ 2175 if (so->so_type != SOCK_STREAM) 2176 return (EINVAL); 2177 if (psa != NULL) 2178 *psa = NULL; 2179 if (controlp != NULL) 2180 return (EINVAL); 2181 if (flagsp != NULL) 2182 flags = *flagsp &~ MSG_EOR; 2183 else 2184 flags = 0; 2185 if (flags & MSG_OOB) 2186 return (soreceive_rcvoob(so, uio, flags)); 2187 if (mp0 != NULL) 2188 *mp0 = NULL; 2189 2190 sb = &so->so_rcv; 2191 2192 /* Prevent other readers from entering the socket. */ 2193 error = sblock(sb, SBLOCKWAIT(flags)); 2194 if (error) 2195 goto out; 2196 SOCKBUF_LOCK(sb); 2197 2198 /* Easy one, no space to copyout anything. */ 2199 if (uio->uio_resid == 0) { 2200 error = EINVAL; 2201 goto out; 2202 } 2203 oresid = uio->uio_resid; 2204 2205 /* We will never ever get anything unless we are or were connected. */ 2206 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 2207 error = ENOTCONN; 2208 goto out; 2209 } 2210 2211 restart: 2212 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2213 2214 /* Abort if socket has reported problems. */ 2215 if (so->so_error) { 2216 if (sbavail(sb) > 0) 2217 goto deliver; 2218 if (oresid > uio->uio_resid) 2219 goto out; 2220 error = so->so_error; 2221 if (!(flags & MSG_PEEK)) 2222 so->so_error = 0; 2223 goto out; 2224 } 2225 2226 /* Door is closed. 
Deliver what is left, if any. */ 2227 if (sb->sb_state & SBS_CANTRCVMORE) { 2228 if (sbavail(sb) > 0) 2229 goto deliver; 2230 else 2231 goto out; 2232 } 2233 2234 /* Socket buffer is empty and we shall not block. */ 2235 if (sbavail(sb) == 0 && 2236 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 2237 error = EAGAIN; 2238 goto out; 2239 } 2240 2241 /* Socket buffer got some data that we shall deliver now. */ 2242 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 2243 ((so->so_state & SS_NBIO) || 2244 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 2245 sbavail(sb) >= sb->sb_lowat || 2246 sbavail(sb) >= uio->uio_resid || 2247 sbavail(sb) >= sb->sb_hiwat) ) { 2248 goto deliver; 2249 } 2250 2251 /* On MSG_WAITALL we must wait until all data or error arrives. */ 2252 if ((flags & MSG_WAITALL) && 2253 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) 2254 goto deliver; 2255 2256 /* 2257 * Wait and block until (more) data comes in. 2258 * NB: Drops the sockbuf lock during wait. 2259 */ 2260 error = sbwait(sb); 2261 if (error) 2262 goto out; 2263 goto restart; 2264 2265 deliver: 2266 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2267 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 2268 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 2269 2270 /* Statistics. */ 2271 if (uio->uio_td) 2272 uio->uio_td->td_ru.ru_msgrcv++; 2273 2274 /* Fill uio until full or current end of socket buffer is reached. */ 2275 len = min(uio->uio_resid, sbavail(sb)); 2276 if (mp0 != NULL) { 2277 /* Dequeue as many mbufs as possible. */ 2278 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 2279 if (*mp0 == NULL) 2280 *mp0 = sb->sb_mb; 2281 else 2282 m_cat(*mp0, sb->sb_mb); 2283 for (m = sb->sb_mb; 2284 m != NULL && m->m_len <= len; 2285 m = m->m_next) { 2286 KASSERT(!(m->m_flags & M_NOTAVAIL), 2287 ("%s: m %p not available", __func__, m)); 2288 len -= m->m_len; 2289 uio->uio_resid -= m->m_len; 2290 sbfree(sb, m); 2291 n = m; 2292 } 2293 n->m_next = NULL; 2294 sb->sb_mb = m; 2295 sb->sb_lastrecord = sb->sb_mb; 2296 if (sb->sb_mb == NULL) 2297 SB_EMPTY_FIXUP(sb); 2298 } 2299 /* Copy the remainder. */ 2300 if (len > 0) { 2301 KASSERT(sb->sb_mb != NULL, 2302 ("%s: len > 0 && sb->sb_mb empty", __func__)); 2303 2304 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 2305 if (m == NULL) 2306 len = 0; /* Don't flush data from sockbuf. */ 2307 else 2308 uio->uio_resid -= len; 2309 if (*mp0 != NULL) 2310 m_cat(*mp0, m); 2311 else 2312 *mp0 = m; 2313 if (*mp0 == NULL) { 2314 error = ENOBUFS; 2315 goto out; 2316 } 2317 } 2318 } else { 2319 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2320 SOCKBUF_UNLOCK(sb); 2321 error = m_mbuftouio(uio, sb->sb_mb, len); 2322 SOCKBUF_LOCK(sb); 2323 if (error) 2324 goto out; 2325 } 2326 SBLASTRECORDCHK(sb); 2327 SBLASTMBUFCHK(sb); 2328 2329 /* 2330 * Remove the delivered data from the socket buffer unless we 2331 * were only peeking. 2332 */ 2333 if (!(flags & MSG_PEEK)) { 2334 if (len > 0) 2335 sbdrop_locked(sb, len); 2336 2337 /* Notify protocol that we drained some data. */ 2338 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2339 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2340 !(flags & MSG_SOCALLBCK))) { 2341 SOCKBUF_UNLOCK(sb); 2342 VNET_SO_ASSERT(so); 2343 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 2344 SOCKBUF_LOCK(sb); 2345 } 2346 } 2347 2348 /* 2349 * For MSG_WAITALL we may have to loop again and wait for 2350 * more data to come in. 
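 * If an error or EOF arrives part way through, the partial count is
 * returned and any pending error is left for a later call.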
2351 */ 2352 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2353 goto restart; 2354 out: 2355 SOCKBUF_LOCK_ASSERT(sb); 2356 SBLASTRECORDCHK(sb); 2357 SBLASTMBUFCHK(sb); 2358 SOCKBUF_UNLOCK(sb); 2359 sbunlock(sb); 2360 return (error); 2361 } 2362 2363 /* 2364 * Optimized version of soreceive() for simple datagram cases from userspace. 2365 * Unlike in the stream case, we're able to drop a datagram if copyout() 2366 * fails, and because we handle datagrams atomically, we don't need to use a 2367 * sleep lock to prevent I/O interlacing. 2368 */ 2369 int 2370 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 2371 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2372 { 2373 struct mbuf *m, *m2; 2374 int flags, error; 2375 ssize_t len; 2376 struct protosw *pr = so->so_proto; 2377 struct mbuf *nextrecord; 2378 2379 if (psa != NULL) 2380 *psa = NULL; 2381 if (controlp != NULL) 2382 *controlp = NULL; 2383 if (flagsp != NULL) 2384 flags = *flagsp &~ MSG_EOR; 2385 else 2386 flags = 0; 2387 2388 /* 2389 * For any complicated cases, fall back to the full 2390 * soreceive_generic(). 2391 */ 2392 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) 2393 return (soreceive_generic(so, psa, uio, mp0, controlp, 2394 flagsp)); 2395 2396 /* 2397 * Enforce restrictions on use. 2398 */ 2399 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 2400 ("soreceive_dgram: wantrcvd")); 2401 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 2402 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 2403 ("soreceive_dgram: SBS_RCVATMARK")); 2404 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 2405 ("soreceive_dgram: P_CONNREQUIRED")); 2406 2407 /* 2408 * Loop blocking while waiting for a datagram. 2409 */ 2410 SOCKBUF_LOCK(&so->so_rcv); 2411 while ((m = so->so_rcv.sb_mb) == NULL) { 2412 KASSERT(sbavail(&so->so_rcv) == 0, 2413 ("soreceive_dgram: sb_mb NULL but sbavail %u", 2414 sbavail(&so->so_rcv))); 2415 if (so->so_error) { 2416 error = so->so_error; 2417 so->so_error = 0; 2418 SOCKBUF_UNLOCK(&so->so_rcv); 2419 return (error); 2420 } 2421 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 2422 uio->uio_resid == 0) { 2423 SOCKBUF_UNLOCK(&so->so_rcv); 2424 return (0); 2425 } 2426 if ((so->so_state & SS_NBIO) || 2427 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2428 SOCKBUF_UNLOCK(&so->so_rcv); 2429 return (EWOULDBLOCK); 2430 } 2431 SBLASTRECORDCHK(&so->so_rcv); 2432 SBLASTMBUFCHK(&so->so_rcv); 2433 error = sbwait(&so->so_rcv); 2434 if (error) { 2435 SOCKBUF_UNLOCK(&so->so_rcv); 2436 return (error); 2437 } 2438 } 2439 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2440 2441 if (uio->uio_td) 2442 uio->uio_td->td_ru.ru_msgrcv++; 2443 SBLASTRECORDCHK(&so->so_rcv); 2444 SBLASTMBUFCHK(&so->so_rcv); 2445 nextrecord = m->m_nextpkt; 2446 if (nextrecord == NULL) { 2447 KASSERT(so->so_rcv.sb_lastrecord == m, 2448 ("soreceive_dgram: lastrecord != m")); 2449 } 2450 2451 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 2452 ("soreceive_dgram: m_nextpkt != nextrecord")); 2453 2454 /* 2455 * Pull 'm' and its chain off the front of the packet queue. 2456 */ 2457 so->so_rcv.sb_mb = NULL; 2458 sockbuf_pushsync(&so->so_rcv, nextrecord); 2459 2460 /* 2461 * Walk 'm's chain and free that many bytes from the socket buffer. 2462 */ 2463 for (m2 = m; m2 != NULL; m2 = m2->m_next) 2464 sbfree(&so->so_rcv, m2); 2465 2466 /* 2467 * Do a few last checks before we let go of the lock. 
2468 */ 2469 SBLASTRECORDCHK(&so->so_rcv); 2470 SBLASTMBUFCHK(&so->so_rcv); 2471 SOCKBUF_UNLOCK(&so->so_rcv); 2472 2473 if (pr->pr_flags & PR_ADDR) { 2474 KASSERT(m->m_type == MT_SONAME, 2475 ("m->m_type == %d", m->m_type)); 2476 if (psa != NULL) 2477 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2478 M_NOWAIT); 2479 m = m_free(m); 2480 } 2481 if (m == NULL) { 2482 /* XXXRW: Can this happen? */ 2483 return (0); 2484 } 2485 2486 /* 2487 * Packet to copyout() is now in 'm' and it is disconnected from the 2488 * queue. 2489 * 2490 * Process one or more MT_CONTROL mbufs present before any data mbufs 2491 * in the first mbuf chain on the socket buffer. We call into the 2492 * protocol to perform externalization (or freeing if controlp == 2493 * NULL). In some cases there can be only MT_CONTROL mbufs without 2494 * MT_DATA mbufs. 2495 */ 2496 if (m->m_type == MT_CONTROL) { 2497 struct mbuf *cm = NULL, *cmn; 2498 struct mbuf **cme = &cm; 2499 2500 do { 2501 m2 = m->m_next; 2502 m->m_next = NULL; 2503 *cme = m; 2504 cme = &(*cme)->m_next; 2505 m = m2; 2506 } while (m != NULL && m->m_type == MT_CONTROL); 2507 while (cm != NULL) { 2508 cmn = cm->m_next; 2509 cm->m_next = NULL; 2510 if (pr->pr_domain->dom_externalize != NULL) { 2511 error = (*pr->pr_domain->dom_externalize) 2512 (cm, controlp, flags); 2513 } else if (controlp != NULL) 2514 *controlp = cm; 2515 else 2516 m_freem(cm); 2517 if (controlp != NULL) { 2518 while (*controlp != NULL) 2519 controlp = &(*controlp)->m_next; 2520 } 2521 cm = cmn; 2522 } 2523 } 2524 KASSERT(m == NULL || m->m_type == MT_DATA, 2525 ("soreceive_dgram: !data")); 2526 while (m != NULL && uio->uio_resid > 0) { 2527 len = uio->uio_resid; 2528 if (len > m->m_len) 2529 len = m->m_len; 2530 error = uiomove(mtod(m, char *), (int)len, uio); 2531 if (error) { 2532 m_freem(m); 2533 return (error); 2534 } 2535 if (len == m->m_len) 2536 m = m_free(m); 2537 else { 2538 m->m_data += len; 2539 m->m_len -= len; 2540 } 2541 } 2542 if (m != NULL) { 2543 flags |= MSG_TRUNC; 2544 m_freem(m); 2545 } 2546 if (flagsp != NULL) 2547 *flagsp |= flags; 2548 return (0); 2549 } 2550 2551 int 2552 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2553 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2554 { 2555 int error; 2556 2557 CURVNET_SET(so->so_vnet); 2558 if (!SOLISTENING(so)) 2559 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, 2560 mp0, controlp, flagsp)); 2561 else 2562 error = ENOTCONN; 2563 CURVNET_RESTORE(); 2564 return (error); 2565 } 2566 2567 int 2568 soshutdown(struct socket *so, int how) 2569 { 2570 struct protosw *pr = so->so_proto; 2571 int error, soerror_enotconn; 2572 2573 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 2574 return (EINVAL); 2575 2576 soerror_enotconn = 0; 2577 if ((so->so_state & 2578 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { 2579 /* 2580 * POSIX mandates us to return ENOTCONN when shutdown(2) is 2581 * invoked on a datagram sockets, however historically we would 2582 * actually tear socket down. This is known to be leveraged by 2583 * some applications to unblock process waiting in recvXXX(2) 2584 * by other process that it shares that socket with. Try to meet 2585 * both backward-compatibility and POSIX requirements by forcing 2586 * ENOTCONN but still asking protocol to perform pru_shutdown(). 
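 *
 * Illustrative userland sketch (an assumption, not part of the original
 * comment): a process sharing a bound UDP socket can unblock a sibling
 * sleeping in recvfrom(2) by calling
 *
 *	(void)shutdown(fd, SHUT_RDWR);
 *
 * which fails with ENOTCONN here but still flushes the receive side and
 * wakes the blocked receiver.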
2587 */ 2588 if (so->so_type != SOCK_DGRAM) 2589 return (ENOTCONN); 2590 soerror_enotconn = 1; 2591 } 2592 2593 CURVNET_SET(so->so_vnet); 2594 if (pr->pr_usrreqs->pru_flush != NULL) 2595 (*pr->pr_usrreqs->pru_flush)(so, how); 2596 if (how != SHUT_WR) 2597 sorflush(so); 2598 if (how != SHUT_RD) { 2599 error = (*pr->pr_usrreqs->pru_shutdown)(so); 2600 wakeup(&so->so_timeo); 2601 CURVNET_RESTORE(); 2602 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); 2603 } 2604 wakeup(&so->so_timeo); 2605 CURVNET_RESTORE(); 2606 2607 return (soerror_enotconn ? ENOTCONN : 0); 2608 } 2609 2610 void 2611 sorflush(struct socket *so) 2612 { 2613 struct sockbuf *sb = &so->so_rcv; 2614 struct protosw *pr = so->so_proto; 2615 struct socket aso; 2616 2617 VNET_SO_ASSERT(so); 2618 2619 /* 2620 * In order to avoid calling dom_dispose with the socket buffer mutex 2621 * held, and in order to generally avoid holding the lock for a long 2622 * time, we make a copy of the socket buffer and clear the original 2623 * (except locks, state). The new socket buffer copy won't have 2624 * initialized locks so we can only call routines that won't use or 2625 * assert those locks. 2626 * 2627 * Dislodge threads currently blocked in receive and wait to acquire 2628 * a lock against other simultaneous readers before clearing the 2629 * socket buffer. Don't let our acquire be interrupted by a signal 2630 * despite any existing socket disposition on interruptable waiting. 2631 */ 2632 socantrcvmore(so); 2633 (void) sblock(sb, SBL_WAIT | SBL_NOINTR); 2634 2635 /* 2636 * Invalidate/clear most of the sockbuf structure, but leave selinfo 2637 * and mutex data unchanged. 2638 */ 2639 SOCKBUF_LOCK(sb); 2640 bzero(&aso, sizeof(aso)); 2641 aso.so_pcb = so->so_pcb; 2642 bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, 2643 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2644 bzero(&sb->sb_startzero, 2645 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2646 SOCKBUF_UNLOCK(sb); 2647 sbunlock(sb); 2648 2649 /* 2650 * Dispose of special rights and flush the copied socket. Don't call 2651 * any unsafe routines (that rely on locks being initialized) on aso. 2652 */ 2653 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 2654 (*pr->pr_domain->dom_dispose)(&aso); 2655 sbrelease_internal(&aso.so_rcv, so); 2656 } 2657 2658 /* 2659 * Wrapper for Socket established helper hook. 2660 * Parameters: socket, context of the hook point, hook id. 2661 */ 2662 static int inline 2663 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) 2664 { 2665 struct socket_hhook_data hhook_data = { 2666 .so = so, 2667 .hctx = hctx, 2668 .m = NULL, 2669 .status = 0 2670 }; 2671 2672 CURVNET_SET(so->so_vnet); 2673 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); 2674 CURVNET_RESTORE(); 2675 2676 /* Ugly but needed, since hhooks return void for now */ 2677 return (hhook_data.status); 2678 } 2679 2680 /* 2681 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 2682 * additional variant to handle the case where the option value needs to be 2683 * some kind of integer, but not a specific size. In addition to their use 2684 * here, these functions are also called by the protocol-level pr_ctloutput() 2685 * routines. 2686 */ 2687 int 2688 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 2689 { 2690 size_t valsize; 2691 2692 /* 2693 * If the user gives us more than we wanted, we ignore it, but if we 2694 * don't get the minimum length the caller wants, we return EINVAL. 
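 * A typical call from a protocol's pr_ctloutput() handler looks roughly
 * like the following sketch (illustrative only):
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 *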
2695 * On success, sopt->sopt_valsize is set to however much we actually 2696 * retrieved. 2697 */ 2698 if ((valsize = sopt->sopt_valsize) < minlen) 2699 return EINVAL; 2700 if (valsize > len) 2701 sopt->sopt_valsize = valsize = len; 2702 2703 if (sopt->sopt_td != NULL) 2704 return (copyin(sopt->sopt_val, buf, valsize)); 2705 2706 bcopy(sopt->sopt_val, buf, valsize); 2707 return (0); 2708 } 2709 2710 /* 2711 * Kernel version of setsockopt(2). 2712 * 2713 * XXX: optlen is size_t, not socklen_t 2714 */ 2715 int 2716 so_setsockopt(struct socket *so, int level, int optname, void *optval, 2717 size_t optlen) 2718 { 2719 struct sockopt sopt; 2720 2721 sopt.sopt_level = level; 2722 sopt.sopt_name = optname; 2723 sopt.sopt_dir = SOPT_SET; 2724 sopt.sopt_val = optval; 2725 sopt.sopt_valsize = optlen; 2726 sopt.sopt_td = NULL; 2727 return (sosetopt(so, &sopt)); 2728 } 2729 2730 int 2731 sosetopt(struct socket *so, struct sockopt *sopt) 2732 { 2733 int error, optval; 2734 struct linger l; 2735 struct timeval tv; 2736 sbintime_t val; 2737 uint32_t val32; 2738 #ifdef MAC 2739 struct mac extmac; 2740 #endif 2741 2742 CURVNET_SET(so->so_vnet); 2743 error = 0; 2744 if (sopt->sopt_level != SOL_SOCKET) { 2745 if (so->so_proto->pr_ctloutput != NULL) { 2746 error = (*so->so_proto->pr_ctloutput)(so, sopt); 2747 CURVNET_RESTORE(); 2748 return (error); 2749 } 2750 error = ENOPROTOOPT; 2751 } else { 2752 switch (sopt->sopt_name) { 2753 case SO_ACCEPTFILTER: 2754 error = accept_filt_setopt(so, sopt); 2755 if (error) 2756 goto bad; 2757 break; 2758 2759 case SO_LINGER: 2760 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 2761 if (error) 2762 goto bad; 2763 2764 SOCK_LOCK(so); 2765 so->so_linger = l.l_linger; 2766 if (l.l_onoff) 2767 so->so_options |= SO_LINGER; 2768 else 2769 so->so_options &= ~SO_LINGER; 2770 SOCK_UNLOCK(so); 2771 break; 2772 2773 case SO_DEBUG: 2774 case SO_KEEPALIVE: 2775 case SO_DONTROUTE: 2776 case SO_USELOOPBACK: 2777 case SO_BROADCAST: 2778 case SO_REUSEADDR: 2779 case SO_REUSEPORT: 2780 case SO_OOBINLINE: 2781 case SO_TIMESTAMP: 2782 case SO_BINTIME: 2783 case SO_NOSIGPIPE: 2784 case SO_NO_DDP: 2785 case SO_NO_OFFLOAD: 2786 error = sooptcopyin(sopt, &optval, sizeof optval, 2787 sizeof optval); 2788 if (error) 2789 goto bad; 2790 SOCK_LOCK(so); 2791 if (optval) 2792 so->so_options |= sopt->sopt_name; 2793 else 2794 so->so_options &= ~sopt->sopt_name; 2795 SOCK_UNLOCK(so); 2796 break; 2797 2798 case SO_SETFIB: 2799 error = sooptcopyin(sopt, &optval, sizeof optval, 2800 sizeof optval); 2801 if (error) 2802 goto bad; 2803 2804 if (optval < 0 || optval >= rt_numfibs) { 2805 error = EINVAL; 2806 goto bad; 2807 } 2808 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 2809 (so->so_proto->pr_domain->dom_family == PF_INET6) || 2810 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 2811 so->so_fibnum = optval; 2812 else 2813 so->so_fibnum = 0; 2814 break; 2815 2816 case SO_USER_COOKIE: 2817 error = sooptcopyin(sopt, &val32, sizeof val32, 2818 sizeof val32); 2819 if (error) 2820 goto bad; 2821 so->so_user_cookie = val32; 2822 break; 2823 2824 case SO_SNDBUF: 2825 case SO_RCVBUF: 2826 case SO_SNDLOWAT: 2827 case SO_RCVLOWAT: 2828 error = sooptcopyin(sopt, &optval, sizeof optval, 2829 sizeof optval); 2830 if (error) 2831 goto bad; 2832 2833 /* 2834 * Values < 1 make no sense for any of these options, 2835 * so disallow them. 
2836 */ 2837 if (optval < 1) { 2838 error = EINVAL; 2839 goto bad; 2840 } 2841 2842 error = sbsetopt(so, sopt->sopt_name, optval); 2843 break; 2844 2845 case SO_SNDTIMEO: 2846 case SO_RCVTIMEO: 2847 #ifdef COMPAT_FREEBSD32 2848 if (SV_CURPROC_FLAG(SV_ILP32)) { 2849 struct timeval32 tv32; 2850 2851 error = sooptcopyin(sopt, &tv32, sizeof tv32, 2852 sizeof tv32); 2853 CP(tv32, tv, tv_sec); 2854 CP(tv32, tv, tv_usec); 2855 } else 2856 #endif 2857 error = sooptcopyin(sopt, &tv, sizeof tv, 2858 sizeof tv); 2859 if (error) 2860 goto bad; 2861 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 2862 tv.tv_usec >= 1000000) { 2863 error = EDOM; 2864 goto bad; 2865 } 2866 if (tv.tv_sec > INT32_MAX) 2867 val = SBT_MAX; 2868 else 2869 val = tvtosbt(tv); 2870 switch (sopt->sopt_name) { 2871 case SO_SNDTIMEO: 2872 so->so_snd.sb_timeo = val; 2873 break; 2874 case SO_RCVTIMEO: 2875 so->so_rcv.sb_timeo = val; 2876 break; 2877 } 2878 break; 2879 2880 case SO_LABEL: 2881 #ifdef MAC 2882 error = sooptcopyin(sopt, &extmac, sizeof extmac, 2883 sizeof extmac); 2884 if (error) 2885 goto bad; 2886 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 2887 so, &extmac); 2888 #else 2889 error = EOPNOTSUPP; 2890 #endif 2891 break; 2892 2893 case SO_TS_CLOCK: 2894 error = sooptcopyin(sopt, &optval, sizeof optval, 2895 sizeof optval); 2896 if (error) 2897 goto bad; 2898 if (optval < 0 || optval > SO_TS_CLOCK_MAX) { 2899 error = EINVAL; 2900 goto bad; 2901 } 2902 so->so_ts_clock = optval; 2903 break; 2904 2905 case SO_MAX_PACING_RATE: 2906 error = sooptcopyin(sopt, &val32, sizeof(val32), 2907 sizeof(val32)); 2908 if (error) 2909 goto bad; 2910 so->so_max_pacing_rate = val32; 2911 break; 2912 2913 default: 2914 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 2915 error = hhook_run_socket(so, sopt, 2916 HHOOK_SOCKET_OPT); 2917 else 2918 error = ENOPROTOOPT; 2919 break; 2920 } 2921 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 2922 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 2923 } 2924 bad: 2925 CURVNET_RESTORE(); 2926 return (error); 2927 } 2928 2929 /* 2930 * Helper routine for getsockopt. 2931 */ 2932 int 2933 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 2934 { 2935 int error; 2936 size_t valsize; 2937 2938 error = 0; 2939 2940 /* 2941 * Documented get behavior is that we always return a value, possibly 2942 * truncated to fit in the user's buffer. Traditional behavior is 2943 * that we always tell the user precisely how much we copied, rather 2944 * than something useful like the total amount we had available for 2945 * her. Note that this interface is not idempotent; the entire 2946 * answer must be generated ahead of time. 
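 *
 * A minimal usage sketch (illustrative; it mirrors how the integer
 * options below are returned):
 *
 *	optval = so->so_type;
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));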
2947 */ 2948 valsize = min(len, sopt->sopt_valsize); 2949 sopt->sopt_valsize = valsize; 2950 if (sopt->sopt_val != NULL) { 2951 if (sopt->sopt_td != NULL) 2952 error = copyout(buf, sopt->sopt_val, valsize); 2953 else 2954 bcopy(buf, sopt->sopt_val, valsize); 2955 } 2956 return (error); 2957 } 2958 2959 int 2960 sogetopt(struct socket *so, struct sockopt *sopt) 2961 { 2962 int error, optval; 2963 struct linger l; 2964 struct timeval tv; 2965 #ifdef MAC 2966 struct mac extmac; 2967 #endif 2968 2969 CURVNET_SET(so->so_vnet); 2970 error = 0; 2971 if (sopt->sopt_level != SOL_SOCKET) { 2972 if (so->so_proto->pr_ctloutput != NULL) 2973 error = (*so->so_proto->pr_ctloutput)(so, sopt); 2974 else 2975 error = ENOPROTOOPT; 2976 CURVNET_RESTORE(); 2977 return (error); 2978 } else { 2979 switch (sopt->sopt_name) { 2980 case SO_ACCEPTFILTER: 2981 error = accept_filt_getopt(so, sopt); 2982 break; 2983 2984 case SO_LINGER: 2985 SOCK_LOCK(so); 2986 l.l_onoff = so->so_options & SO_LINGER; 2987 l.l_linger = so->so_linger; 2988 SOCK_UNLOCK(so); 2989 error = sooptcopyout(sopt, &l, sizeof l); 2990 break; 2991 2992 case SO_USELOOPBACK: 2993 case SO_DONTROUTE: 2994 case SO_DEBUG: 2995 case SO_KEEPALIVE: 2996 case SO_REUSEADDR: 2997 case SO_REUSEPORT: 2998 case SO_BROADCAST: 2999 case SO_OOBINLINE: 3000 case SO_ACCEPTCONN: 3001 case SO_TIMESTAMP: 3002 case SO_BINTIME: 3003 case SO_NOSIGPIPE: 3004 optval = so->so_options & sopt->sopt_name; 3005 integer: 3006 error = sooptcopyout(sopt, &optval, sizeof optval); 3007 break; 3008 3009 case SO_TYPE: 3010 optval = so->so_type; 3011 goto integer; 3012 3013 case SO_PROTOCOL: 3014 optval = so->so_proto->pr_protocol; 3015 goto integer; 3016 3017 case SO_ERROR: 3018 SOCK_LOCK(so); 3019 optval = so->so_error; 3020 so->so_error = 0; 3021 SOCK_UNLOCK(so); 3022 goto integer; 3023 3024 case SO_SNDBUF: 3025 optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : 3026 so->so_snd.sb_hiwat; 3027 goto integer; 3028 3029 case SO_RCVBUF: 3030 optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : 3031 so->so_rcv.sb_hiwat; 3032 goto integer; 3033 3034 case SO_SNDLOWAT: 3035 optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : 3036 so->so_snd.sb_lowat; 3037 goto integer; 3038 3039 case SO_RCVLOWAT: 3040 optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : 3041 so->so_rcv.sb_lowat; 3042 goto integer; 3043 3044 case SO_SNDTIMEO: 3045 case SO_RCVTIMEO: 3046 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 
3047 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 3048 #ifdef COMPAT_FREEBSD32 3049 if (SV_CURPROC_FLAG(SV_ILP32)) { 3050 struct timeval32 tv32; 3051 3052 CP(tv, tv32, tv_sec); 3053 CP(tv, tv32, tv_usec); 3054 error = sooptcopyout(sopt, &tv32, sizeof tv32); 3055 } else 3056 #endif 3057 error = sooptcopyout(sopt, &tv, sizeof tv); 3058 break; 3059 3060 case SO_LABEL: 3061 #ifdef MAC 3062 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3063 sizeof(extmac)); 3064 if (error) 3065 goto bad; 3066 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 3067 so, &extmac); 3068 if (error) 3069 goto bad; 3070 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3071 #else 3072 error = EOPNOTSUPP; 3073 #endif 3074 break; 3075 3076 case SO_PEERLABEL: 3077 #ifdef MAC 3078 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 3079 sizeof(extmac)); 3080 if (error) 3081 goto bad; 3082 error = mac_getsockopt_peerlabel( 3083 sopt->sopt_td->td_ucred, so, &extmac); 3084 if (error) 3085 goto bad; 3086 error = sooptcopyout(sopt, &extmac, sizeof extmac); 3087 #else 3088 error = EOPNOTSUPP; 3089 #endif 3090 break; 3091 3092 case SO_LISTENQLIMIT: 3093 optval = SOLISTENING(so) ? so->sol_qlimit : 0; 3094 goto integer; 3095 3096 case SO_LISTENQLEN: 3097 optval = SOLISTENING(so) ? so->sol_qlen : 0; 3098 goto integer; 3099 3100 case SO_LISTENINCQLEN: 3101 optval = SOLISTENING(so) ? so->sol_incqlen : 0; 3102 goto integer; 3103 3104 case SO_TS_CLOCK: 3105 optval = so->so_ts_clock; 3106 goto integer; 3107 3108 case SO_MAX_PACING_RATE: 3109 optval = so->so_max_pacing_rate; 3110 goto integer; 3111 3112 default: 3113 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) 3114 error = hhook_run_socket(so, sopt, 3115 HHOOK_SOCKET_OPT); 3116 else 3117 error = ENOPROTOOPT; 3118 break; 3119 } 3120 } 3121 #ifdef MAC 3122 bad: 3123 #endif 3124 CURVNET_RESTORE(); 3125 return (error); 3126 } 3127 3128 int 3129 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 3130 { 3131 struct mbuf *m, *m_prev; 3132 int sopt_size = sopt->sopt_valsize; 3133 3134 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3135 if (m == NULL) 3136 return ENOBUFS; 3137 if (sopt_size > MLEN) { 3138 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 3139 if ((m->m_flags & M_EXT) == 0) { 3140 m_free(m); 3141 return ENOBUFS; 3142 } 3143 m->m_len = min(MCLBYTES, sopt_size); 3144 } else { 3145 m->m_len = min(MLEN, sopt_size); 3146 } 3147 sopt_size -= m->m_len; 3148 *mp = m; 3149 m_prev = m; 3150 3151 while (sopt_size) { 3152 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 3153 if (m == NULL) { 3154 m_freem(*mp); 3155 return ENOBUFS; 3156 } 3157 if (sopt_size > MLEN) { 3158 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 3159 M_NOWAIT); 3160 if ((m->m_flags & M_EXT) == 0) { 3161 m_freem(m); 3162 m_freem(*mp); 3163 return ENOBUFS; 3164 } 3165 m->m_len = min(MCLBYTES, sopt_size); 3166 } else { 3167 m->m_len = min(MLEN, sopt_size); 3168 } 3169 sopt_size -= m->m_len; 3170 m_prev->m_next = m; 3171 m_prev = m; 3172 } 3173 return (0); 3174 } 3175 3176 int 3177 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 3178 { 3179 struct mbuf *m0 = m; 3180 3181 if (sopt->sopt_val == NULL) 3182 return (0); 3183 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3184 if (sopt->sopt_td != NULL) { 3185 int error; 3186 3187 error = copyin(sopt->sopt_val, mtod(m, char *), 3188 m->m_len); 3189 if (error != 0) { 3190 m_freem(m0); 3191 return(error); 3192 } 3193 } else 3194 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 3195 sopt->sopt_valsize -= m->m_len; 3196 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3197 m = m->m_next; 3198 } 3199 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 3200 panic("ip6_sooptmcopyin"); 3201 return (0); 3202 } 3203 3204 int 3205 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 3206 { 3207 struct mbuf *m0 = m; 3208 size_t valsize = 0; 3209 3210 if (sopt->sopt_val == NULL) 3211 return (0); 3212 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 3213 if (sopt->sopt_td != NULL) { 3214 int error; 3215 3216 error = copyout(mtod(m, char *), sopt->sopt_val, 3217 m->m_len); 3218 if (error != 0) { 3219 m_freem(m0); 3220 return(error); 3221 } 3222 } else 3223 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 3224 sopt->sopt_valsize -= m->m_len; 3225 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 3226 valsize += m->m_len; 3227 m = m->m_next; 3228 } 3229 if (m != NULL) { 3230 /* enough soopt buffer should be given from user-land */ 3231 m_freem(m0); 3232 return(EINVAL); 3233 } 3234 sopt->sopt_valsize = valsize; 3235 return (0); 3236 } 3237 3238 /* 3239 * sohasoutofband(): protocol notifies socket layer of the arrival of new 3240 * out-of-band data, which will then notify socket consumers. 3241 */ 3242 void 3243 sohasoutofband(struct socket *so) 3244 { 3245 3246 if (so->so_sigio != NULL) 3247 pgsigio(&so->so_sigio, SIGURG, 0); 3248 selwakeuppri(&so->so_rdsel, PSOCK); 3249 } 3250 3251 int 3252 sopoll(struct socket *so, int events, struct ucred *active_cred, 3253 struct thread *td) 3254 { 3255 3256 /* 3257 * We do not need to set or assert curvnet as long as everyone uses 3258 * sopoll_generic(). 
3259 */ 3260 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, 3261 td)); 3262 } 3263 3264 int 3265 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 3266 struct thread *td) 3267 { 3268 int revents; 3269 3270 SOCK_LOCK(so); 3271 if (SOLISTENING(so)) { 3272 if (!(events & (POLLIN | POLLRDNORM))) 3273 revents = 0; 3274 else if (!TAILQ_EMPTY(&so->sol_comp)) 3275 revents = events & (POLLIN | POLLRDNORM); 3276 else { 3277 selrecord(td, &so->so_rdsel); 3278 revents = 0; 3279 } 3280 } else { 3281 revents = 0; 3282 SOCKBUF_LOCK(&so->so_snd); 3283 SOCKBUF_LOCK(&so->so_rcv); 3284 if (events & (POLLIN | POLLRDNORM)) 3285 if (soreadabledata(so)) 3286 revents |= events & (POLLIN | POLLRDNORM); 3287 if (events & (POLLOUT | POLLWRNORM)) 3288 if (sowriteable(so)) 3289 revents |= events & (POLLOUT | POLLWRNORM); 3290 if (events & (POLLPRI | POLLRDBAND)) 3291 if (so->so_oobmark || 3292 (so->so_rcv.sb_state & SBS_RCVATMARK)) 3293 revents |= events & (POLLPRI | POLLRDBAND); 3294 if ((events & POLLINIGNEOF) == 0) { 3295 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3296 revents |= events & (POLLIN | POLLRDNORM); 3297 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 3298 revents |= POLLHUP; 3299 } 3300 } 3301 if (revents == 0) { 3302 if (events & 3303 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 3304 selrecord(td, &so->so_rdsel); 3305 so->so_rcv.sb_flags |= SB_SEL; 3306 } 3307 if (events & (POLLOUT | POLLWRNORM)) { 3308 selrecord(td, &so->so_wrsel); 3309 so->so_snd.sb_flags |= SB_SEL; 3310 } 3311 } 3312 SOCKBUF_UNLOCK(&so->so_rcv); 3313 SOCKBUF_UNLOCK(&so->so_snd); 3314 } 3315 SOCK_UNLOCK(so); 3316 return (revents); 3317 } 3318 3319 int 3320 soo_kqfilter(struct file *fp, struct knote *kn) 3321 { 3322 struct socket *so = kn->kn_fp->f_data; 3323 struct sockbuf *sb; 3324 struct knlist *knl; 3325 3326 switch (kn->kn_filter) { 3327 case EVFILT_READ: 3328 kn->kn_fop = &soread_filtops; 3329 knl = &so->so_rdsel.si_note; 3330 sb = &so->so_rcv; 3331 break; 3332 case EVFILT_WRITE: 3333 kn->kn_fop = &sowrite_filtops; 3334 knl = &so->so_wrsel.si_note; 3335 sb = &so->so_snd; 3336 break; 3337 case EVFILT_EMPTY: 3338 kn->kn_fop = &soempty_filtops; 3339 knl = &so->so_wrsel.si_note; 3340 sb = &so->so_snd; 3341 break; 3342 default: 3343 return (EINVAL); 3344 } 3345 3346 SOCK_LOCK(so); 3347 if (SOLISTENING(so)) { 3348 knlist_add(knl, kn, 1); 3349 } else { 3350 SOCKBUF_LOCK(sb); 3351 knlist_add(knl, kn, 1); 3352 sb->sb_flags |= SB_KNOTE; 3353 SOCKBUF_UNLOCK(sb); 3354 } 3355 SOCK_UNLOCK(so); 3356 return (0); 3357 } 3358 3359 /* 3360 * Some routines that return EOPNOTSUPP for entry points that are not 3361 * supported by a protocol. Fill in as needed. 
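 *
 * For example (an illustrative sketch, not taken from this file), a
 * protocol with no listen/accept support might initialize part of its
 * struct pr_usrreqs as:
 *
 *	.pru_listen =	pru_listen_notsupp,
 *	.pru_accept =	pru_accept_notsupp,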
3362 */ 3363 int 3364 pru_accept_notsupp(struct socket *so, struct sockaddr **nam) 3365 { 3366 3367 return EOPNOTSUPP; 3368 } 3369 3370 int 3371 pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) 3372 { 3373 3374 return EOPNOTSUPP; 3375 } 3376 3377 int 3378 pru_attach_notsupp(struct socket *so, int proto, struct thread *td) 3379 { 3380 3381 return EOPNOTSUPP; 3382 } 3383 3384 int 3385 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3386 { 3387 3388 return EOPNOTSUPP; 3389 } 3390 3391 int 3392 pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3393 struct thread *td) 3394 { 3395 3396 return EOPNOTSUPP; 3397 } 3398 3399 int 3400 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3401 { 3402 3403 return EOPNOTSUPP; 3404 } 3405 3406 int 3407 pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3408 struct thread *td) 3409 { 3410 3411 return EOPNOTSUPP; 3412 } 3413 3414 int 3415 pru_connect2_notsupp(struct socket *so1, struct socket *so2) 3416 { 3417 3418 return EOPNOTSUPP; 3419 } 3420 3421 int 3422 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, 3423 struct ifnet *ifp, struct thread *td) 3424 { 3425 3426 return EOPNOTSUPP; 3427 } 3428 3429 int 3430 pru_disconnect_notsupp(struct socket *so) 3431 { 3432 3433 return EOPNOTSUPP; 3434 } 3435 3436 int 3437 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) 3438 { 3439 3440 return EOPNOTSUPP; 3441 } 3442 3443 int 3444 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) 3445 { 3446 3447 return EOPNOTSUPP; 3448 } 3449 3450 int 3451 pru_rcvd_notsupp(struct socket *so, int flags) 3452 { 3453 3454 return EOPNOTSUPP; 3455 } 3456 3457 int 3458 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) 3459 { 3460 3461 return EOPNOTSUPP; 3462 } 3463 3464 int 3465 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, 3466 struct sockaddr *addr, struct mbuf *control, struct thread *td) 3467 { 3468 3469 return EOPNOTSUPP; 3470 } 3471 3472 int 3473 pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) 3474 { 3475 3476 return (EOPNOTSUPP); 3477 } 3478 3479 /* 3480 * This isn't really a ``null'' operation, but it's the default one and 3481 * doesn't do anything destructive. 
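 *
 * (Protocols with nothing better to report can simply point their
 * pru_sense entry at this routine, e.g. ".pru_sense = pru_sense_null".)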
3482 */ 3483 int 3484 pru_sense_null(struct socket *so, struct stat *sb) 3485 { 3486 3487 sb->st_blksize = so->so_snd.sb_hiwat; 3488 return 0; 3489 } 3490 3491 int 3492 pru_shutdown_notsupp(struct socket *so) 3493 { 3494 3495 return EOPNOTSUPP; 3496 } 3497 3498 int 3499 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) 3500 { 3501 3502 return EOPNOTSUPP; 3503 } 3504 3505 int 3506 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, 3507 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 3508 { 3509 3510 return EOPNOTSUPP; 3511 } 3512 3513 int 3514 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, 3515 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3516 { 3517 3518 return EOPNOTSUPP; 3519 } 3520 3521 int 3522 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, 3523 struct thread *td) 3524 { 3525 3526 return EOPNOTSUPP; 3527 } 3528 3529 static void 3530 filt_sordetach(struct knote *kn) 3531 { 3532 struct socket *so = kn->kn_fp->f_data; 3533 3534 so_rdknl_lock(so); 3535 knlist_remove(&so->so_rdsel.si_note, kn, 1); 3536 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) 3537 so->so_rcv.sb_flags &= ~SB_KNOTE; 3538 so_rdknl_unlock(so); 3539 } 3540 3541 /*ARGSUSED*/ 3542 static int 3543 filt_soread(struct knote *kn, long hint) 3544 { 3545 struct socket *so; 3546 3547 so = kn->kn_fp->f_data; 3548 3549 if (SOLISTENING(so)) { 3550 SOCK_LOCK_ASSERT(so); 3551 kn->kn_data = so->sol_qlen; 3552 return (!TAILQ_EMPTY(&so->sol_comp)); 3553 } 3554 3555 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3556 3557 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; 3558 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3559 kn->kn_flags |= EV_EOF; 3560 kn->kn_fflags = so->so_error; 3561 return (1); 3562 } else if (so->so_error) /* temporary udp error */ 3563 return (1); 3564 3565 if (kn->kn_sfflags & NOTE_LOWAT) { 3566 if (kn->kn_data >= kn->kn_sdata) 3567 return (1); 3568 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) 3569 return (1); 3570 3571 /* This hook returning non-zero indicates an event, not error */ 3572 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); 3573 } 3574 3575 static void 3576 filt_sowdetach(struct knote *kn) 3577 { 3578 struct socket *so = kn->kn_fp->f_data; 3579 3580 so_wrknl_lock(so); 3581 knlist_remove(&so->so_wrsel.si_note, kn, 1); 3582 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) 3583 so->so_snd.sb_flags &= ~SB_KNOTE; 3584 so_wrknl_unlock(so); 3585 } 3586 3587 /*ARGSUSED*/ 3588 static int 3589 filt_sowrite(struct knote *kn, long hint) 3590 { 3591 struct socket *so; 3592 3593 so = kn->kn_fp->f_data; 3594 3595 if (SOLISTENING(so)) 3596 return (0); 3597 3598 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3599 kn->kn_data = sbspace(&so->so_snd); 3600 3601 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); 3602 3603 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3604 kn->kn_flags |= EV_EOF; 3605 kn->kn_fflags = so->so_error; 3606 return (1); 3607 } else if (so->so_error) /* temporary udp error */ 3608 return (1); 3609 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3610 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3611 return (0); 3612 else if (kn->kn_sfflags & NOTE_LOWAT) 3613 return (kn->kn_data >= kn->kn_sdata); 3614 else 3615 return (kn->kn_data >= so->so_snd.sb_lowat); 3616 } 3617 3618 static int 3619 filt_soempty(struct knote *kn, long hint) 3620 { 3621 struct socket *so; 3622 3623 so = kn->kn_fp->f_data; 3624 3625 if (SOLISTENING(so)) 3626 return (1); 3627 3628 
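/*
 * Data socket: report "empty" only once sbused() shows the send buffer
 * has completely drained.
 */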
SOCKBUF_LOCK_ASSERT(&so->so_snd); 3629 kn->kn_data = sbused(&so->so_snd); 3630 3631 if (kn->kn_data == 0) 3632 return (1); 3633 else 3634 return (0); 3635 } 3636 3637 int 3638 socheckuid(struct socket *so, uid_t uid) 3639 { 3640 3641 if (so == NULL) 3642 return (EPERM); 3643 if (so->so_cred->cr_uid != uid) 3644 return (EPERM); 3645 return (0); 3646 } 3647 3648 /* 3649 * These functions are used by protocols to notify the socket layer (and its 3650 * consumers) of state changes in the sockets driven by protocol-side events. 3651 */ 3652 3653 /* 3654 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3655 * 3656 * Normal sequence from the active (originating) side is that 3657 * soisconnecting() is called during processing of connect() call, resulting 3658 * in an eventual call to soisconnected() if/when the connection is 3659 * established. When the connection is torn down soisdisconnecting() is 3660 * called during processing of disconnect() call, and soisdisconnected() is 3661 * called when the connection to the peer is totally severed. The semantics 3662 * of these routines are such that connectionless protocols can call 3663 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3664 * calls when setting up a ``connection'' takes no time. 3665 * 3666 * From the passive side, a socket is created with two queues of sockets: 3667 * so_incomp for connections in progress and so_comp for connections already 3668 * made and awaiting user acceptance. As a protocol is preparing incoming 3669 * connections, it creates a socket structure queued on so_incomp by calling 3670 * sonewconn(). When the connection is established, soisconnected() is 3671 * called, and transfers the socket structure to so_comp, making it available 3672 * to accept(). 3673 * 3674 * If a socket is closed with sockets on either so_incomp or so_comp, these 3675 * sockets are dropped. 3676 * 3677 * If higher-level protocols are implemented in the kernel, the wakeups done 3678 * here will sometimes cause software-interrupt process scheduling. 3679 */ 3680 void 3681 soisconnecting(struct socket *so) 3682 { 3683 3684 SOCK_LOCK(so); 3685 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3686 so->so_state |= SS_ISCONNECTING; 3687 SOCK_UNLOCK(so); 3688 } 3689 3690 void 3691 soisconnected(struct socket *so) 3692 { 3693 3694 SOCK_LOCK(so); 3695 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3696 so->so_state |= SS_ISCONNECTED; 3697 3698 if (so->so_qstate == SQ_INCOMP) { 3699 struct socket *head = so->so_listen; 3700 int ret; 3701 3702 KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); 3703 /* 3704 * Promoting a socket from incomplete queue to complete, we 3705 * need to go through reverse order of locking. We first do 3706 * trylock, and if that doesn't succeed, we go the hard way 3707 * leaving a reference and rechecking consistency after proper 3708 * locking. 3709 */ 3710 if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { 3711 soref(head); 3712 SOCK_UNLOCK(so); 3713 SOLISTEN_LOCK(head); 3714 SOCK_LOCK(so); 3715 if (__predict_false(head != so->so_listen)) { 3716 /* 3717 * The socket went off the listen queue, 3718 * should be lost race to close(2) of sol. 3719 * The socket is about to soabort(). 3720 */ 3721 SOCK_UNLOCK(so); 3722 sorele(head); 3723 return; 3724 } 3725 /* Not the last one, as so holds a ref. 
*/ 3726 refcount_release(&head->so_count); 3727 } 3728 again: 3729 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3730 TAILQ_REMOVE(&head->sol_incomp, so, so_list); 3731 head->sol_incqlen--; 3732 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); 3733 head->sol_qlen++; 3734 so->so_qstate = SQ_COMP; 3735 SOCK_UNLOCK(so); 3736 solisten_wakeup(head); /* unlocks */ 3737 } else { 3738 SOCKBUF_LOCK(&so->so_rcv); 3739 soupcall_set(so, SO_RCV, 3740 head->sol_accept_filter->accf_callback, 3741 head->sol_accept_filter_arg); 3742 so->so_options &= ~SO_ACCEPTFILTER; 3743 ret = head->sol_accept_filter->accf_callback(so, 3744 head->sol_accept_filter_arg, M_NOWAIT); 3745 if (ret == SU_ISCONNECTED) { 3746 soupcall_clear(so, SO_RCV); 3747 SOCKBUF_UNLOCK(&so->so_rcv); 3748 goto again; 3749 } 3750 SOCKBUF_UNLOCK(&so->so_rcv); 3751 SOCK_UNLOCK(so); 3752 SOLISTEN_UNLOCK(head); 3753 } 3754 return; 3755 } 3756 SOCK_UNLOCK(so); 3757 wakeup(&so->so_timeo); 3758 sorwakeup(so); 3759 sowwakeup(so); 3760 } 3761 3762 void 3763 soisdisconnecting(struct socket *so) 3764 { 3765 3766 SOCK_LOCK(so); 3767 so->so_state &= ~SS_ISCONNECTING; 3768 so->so_state |= SS_ISDISCONNECTING; 3769 3770 if (!SOLISTENING(so)) { 3771 SOCKBUF_LOCK(&so->so_rcv); 3772 socantrcvmore_locked(so); 3773 SOCKBUF_LOCK(&so->so_snd); 3774 socantsendmore_locked(so); 3775 } 3776 SOCK_UNLOCK(so); 3777 wakeup(&so->so_timeo); 3778 } 3779 3780 void 3781 soisdisconnected(struct socket *so) 3782 { 3783 3784 SOCK_LOCK(so); 3785 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3786 so->so_state |= SS_ISDISCONNECTED; 3787 3788 if (!SOLISTENING(so)) { 3789 SOCK_UNLOCK(so); 3790 SOCKBUF_LOCK(&so->so_rcv); 3791 socantrcvmore_locked(so); 3792 SOCKBUF_LOCK(&so->so_snd); 3793 sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); 3794 socantsendmore_locked(so); 3795 } else 3796 SOCK_UNLOCK(so); 3797 wakeup(&so->so_timeo); 3798 } 3799 3800 /* 3801 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3802 */ 3803 struct sockaddr * 3804 sodupsockaddr(const struct sockaddr *sa, int mflags) 3805 { 3806 struct sockaddr *sa2; 3807 3808 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3809 if (sa2) 3810 bcopy(sa, sa2, sa->sa_len); 3811 return sa2; 3812 } 3813 3814 /* 3815 * Register per-socket buffer upcalls. 
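 *
 * A minimal usage sketch (assumed consumer code, not from this file),
 * where example_rcv_upcall is a hypothetical so_upcall_t callback:
 *
 *	SOCKBUF_LOCK(&so->so_rcv);
 *	soupcall_set(so, SO_RCV, example_rcv_upcall, arg);
 *	SOCKBUF_UNLOCK(&so->so_rcv);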
3816 */ 3817 void 3818 soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) 3819 { 3820 struct sockbuf *sb; 3821 3822 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 3823 3824 switch (which) { 3825 case SO_RCV: 3826 sb = &so->so_rcv; 3827 break; 3828 case SO_SND: 3829 sb = &so->so_snd; 3830 break; 3831 default: 3832 panic("soupcall_set: bad which"); 3833 } 3834 SOCKBUF_LOCK_ASSERT(sb); 3835 sb->sb_upcall = func; 3836 sb->sb_upcallarg = arg; 3837 sb->sb_flags |= SB_UPCALL; 3838 } 3839 3840 void 3841 soupcall_clear(struct socket *so, int which) 3842 { 3843 struct sockbuf *sb; 3844 3845 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); 3846 3847 switch (which) { 3848 case SO_RCV: 3849 sb = &so->so_rcv; 3850 break; 3851 case SO_SND: 3852 sb = &so->so_snd; 3853 break; 3854 default: 3855 panic("soupcall_clear: bad which"); 3856 } 3857 SOCKBUF_LOCK_ASSERT(sb); 3858 KASSERT(sb->sb_upcall != NULL, 3859 ("%s: so %p no upcall to clear", __func__, so)); 3860 sb->sb_upcall = NULL; 3861 sb->sb_upcallarg = NULL; 3862 sb->sb_flags &= ~SB_UPCALL; 3863 } 3864 3865 void 3866 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) 3867 { 3868 3869 SOLISTEN_LOCK_ASSERT(so); 3870 so->sol_upcall = func; 3871 so->sol_upcallarg = arg; 3872 } 3873 3874 static void 3875 so_rdknl_lock(void *arg) 3876 { 3877 struct socket *so = arg; 3878 3879 if (SOLISTENING(so)) 3880 SOCK_LOCK(so); 3881 else 3882 SOCKBUF_LOCK(&so->so_rcv); 3883 } 3884 3885 static void 3886 so_rdknl_unlock(void *arg) 3887 { 3888 struct socket *so = arg; 3889 3890 if (SOLISTENING(so)) 3891 SOCK_UNLOCK(so); 3892 else 3893 SOCKBUF_UNLOCK(&so->so_rcv); 3894 } 3895 3896 static void 3897 so_rdknl_assert_locked(void *arg) 3898 { 3899 struct socket *so = arg; 3900 3901 if (SOLISTENING(so)) 3902 SOCK_LOCK_ASSERT(so); 3903 else 3904 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3905 } 3906 3907 static void 3908 so_rdknl_assert_unlocked(void *arg) 3909 { 3910 struct socket *so = arg; 3911 3912 if (SOLISTENING(so)) 3913 SOCK_UNLOCK_ASSERT(so); 3914 else 3915 SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); 3916 } 3917 3918 static void 3919 so_wrknl_lock(void *arg) 3920 { 3921 struct socket *so = arg; 3922 3923 if (SOLISTENING(so)) 3924 SOCK_LOCK(so); 3925 else 3926 SOCKBUF_LOCK(&so->so_snd); 3927 } 3928 3929 static void 3930 so_wrknl_unlock(void *arg) 3931 { 3932 struct socket *so = arg; 3933 3934 if (SOLISTENING(so)) 3935 SOCK_UNLOCK(so); 3936 else 3937 SOCKBUF_UNLOCK(&so->so_snd); 3938 } 3939 3940 static void 3941 so_wrknl_assert_locked(void *arg) 3942 { 3943 struct socket *so = arg; 3944 3945 if (SOLISTENING(so)) 3946 SOCK_LOCK_ASSERT(so); 3947 else 3948 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3949 } 3950 3951 static void 3952 so_wrknl_assert_unlocked(void *arg) 3953 { 3954 struct socket *so = arg; 3955 3956 if (SOLISTENING(so)) 3957 SOCK_UNLOCK_ASSERT(so); 3958 else 3959 SOCKBUF_UNLOCK_ASSERT(&so->so_snd); 3960 } 3961 3962 /* 3963 * Create an external-format (``xsocket'') structure using the information in 3964 * the kernel-format socket structure pointed to by so. This is done to 3965 * reduce the spew of irrelevant information over this interface, to isolate 3966 * user code from changes in the kernel structure, and potentially to provide 3967 * information-hiding if we decide that some of this information should be 3968 * hidden from users. 
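 *
 * (As background, not from the original comment: this is the structure
 * handed back by the protocol pcblist sysctls that tools such as
 * netstat(1) and sockstat(1) consume.)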
3969 */ 3970 void 3971 sotoxsocket(struct socket *so, struct xsocket *xso) 3972 { 3973 3974 xso->xso_len = sizeof *xso; 3975 xso->xso_so = so; 3976 xso->so_type = so->so_type; 3977 xso->so_options = so->so_options; 3978 xso->so_linger = so->so_linger; 3979 xso->so_state = so->so_state; 3980 xso->so_pcb = so->so_pcb; 3981 xso->xso_protocol = so->so_proto->pr_protocol; 3982 xso->xso_family = so->so_proto->pr_domain->dom_family; 3983 xso->so_timeo = so->so_timeo; 3984 xso->so_error = so->so_error; 3985 xso->so_uid = so->so_cred->cr_uid; 3986 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 3987 if (SOLISTENING(so)) { 3988 xso->so_qlen = so->sol_qlen; 3989 xso->so_incqlen = so->sol_incqlen; 3990 xso->so_qlimit = so->sol_qlimit; 3991 xso->so_oobmark = 0; 3992 bzero(&xso->so_snd, sizeof(xso->so_snd)); 3993 bzero(&xso->so_rcv, sizeof(xso->so_rcv)); 3994 } else { 3995 xso->so_state |= so->so_qstate; 3996 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; 3997 xso->so_oobmark = so->so_oobmark; 3998 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 3999 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 4000 } 4001 } 4002 4003 struct sockbuf * 4004 so_sockbuf_rcv(struct socket *so) 4005 { 4006 4007 return (&so->so_rcv); 4008 } 4009 4010 struct sockbuf * 4011 so_sockbuf_snd(struct socket *so) 4012 { 4013 4014 return (&so->so_snd); 4015 } 4016 4017 int 4018 so_state_get(const struct socket *so) 4019 { 4020 4021 return (so->so_state); 4022 } 4023 4024 void 4025 so_state_set(struct socket *so, int val) 4026 { 4027 4028 so->so_state = val; 4029 } 4030 4031 int 4032 so_options_get(const struct socket *so) 4033 { 4034 4035 return (so->so_options); 4036 } 4037 4038 void 4039 so_options_set(struct socket *so, int val) 4040 { 4041 4042 so->so_options = val; 4043 } 4044 4045 int 4046 so_error_get(const struct socket *so) 4047 { 4048 4049 return (so->so_error); 4050 } 4051 4052 void 4053 so_error_set(struct socket *so, int val) 4054 { 4055 4056 so->so_error = val; 4057 } 4058 4059 int 4060 so_linger_get(const struct socket *so) 4061 { 4062 4063 return (so->so_linger); 4064 } 4065 4066 void 4067 so_linger_set(struct socket *so, int val) 4068 { 4069 4070 so->so_linger = val; 4071 } 4072 4073 struct protosw * 4074 so_protosw_get(const struct socket *so) 4075 { 4076 4077 return (so->so_proto); 4078 } 4079 4080 void 4081 so_protosw_set(struct socket *so, struct protosw *val) 4082 { 4083 4084 so->so_proto = val; 4085 } 4086 4087 void 4088 so_sorwakeup(struct socket *so) 4089 { 4090 4091 sorwakeup(so); 4092 } 4093 4094 void 4095 so_sowwakeup(struct socket *so) 4096 { 4097 4098 sowwakeup(so); 4099 } 4100 4101 void 4102 so_sorwakeup_locked(struct socket *so) 4103 { 4104 4105 sorwakeup_locked(so); 4106 } 4107 4108 void 4109 so_sowwakeup_locked(struct socket *so) 4110 { 4111 4112 sowwakeup_locked(so); 4113 } 4114 4115 void 4116 so_lock(struct socket *so) 4117 { 4118 4119 SOCK_LOCK(so); 4120 } 4121 4122 void 4123 so_unlock(struct socket *so) 4124 { 4125 4126 SOCK_UNLOCK(so); 4127 } 4128