1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. 4 * Copyright (c) 2004 The FreeBSD Foundation 5 * Copyright (c) 2004-2008 Robert N. M. Watson 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 33 */ 34 35 /* 36 * Comments on the socket life cycle: 37 * 38 * soalloc() sets of socket layer state for a socket, called only by 39 * socreate() and sonewconn(). Socket layer private. 40 * 41 * sodealloc() tears down socket layer state for a socket, called only by 42 * sofree() and sonewconn(). Socket layer private. 43 * 44 * pru_attach() associates protocol layer state with an allocated socket; 45 * called only once, may fail, aborting socket allocation. This is called 46 * from socreate() and sonewconn(). Socket layer private. 47 * 48 * pru_detach() disassociates protocol layer state from an attached socket, 49 * and will be called exactly once for sockets in which pru_attach() has 50 * been successfully called. If pru_attach() returned an error, 51 * pru_detach() will not be called. Socket layer private. 52 * 53 * pru_abort() and pru_close() notify the protocol layer that the last 54 * consumer of a socket is starting to tear down the socket, and that the 55 * protocol should terminate the connection. Historically, pru_abort() also 56 * detached protocol state from the socket state, but this is no longer the 57 * case. 58 * 59 * socreate() creates a socket and attaches protocol state. This is a public 60 * interface that may be used by socket layer consumers to create new 61 * sockets. 62 * 63 * sonewconn() creates a socket and attaches protocol state. This is a 64 * public interface that may be used by protocols to create new sockets when 65 * a new connection is received and will be available for accept() on a 66 * listen socket. 67 * 68 * soclose() destroys a socket after possibly waiting for it to disconnect. 69 * This is a public interface that socket consumers should use to close and 70 * release a socket when done with it. 
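 *
 * For illustration only, the create/use/close cycle above might look
 * roughly like this for a kernel consumer (a sketch, not taken from this
 * file: 'td' is the consumer's thread and 'sin' a previously filled-in
 * sockaddr_in; most error handling is abbreviated):
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *	    td->td_ucred, td);
 *	if (error != 0)
 *		return (error);
 *	error = sobind(so, (struct sockaddr *)&sin, td);
 *	...	exchange data with sosend() and soreceive() ...
 *	soclose(so);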
71 * 72 * soabort() destroys a socket without waiting for it to disconnect (used 73 * only for incoming connections that are already partially or fully 74 * connected). This is used internally by the socket layer when clearing 75 * listen socket queues (due to overflow or close on the listen socket), but 76 * is also a public interface protocols may use to abort connections in 77 * their incomplete listen queues should they no longer be required. Sockets 78 * placed in completed connection listen queues should not be aborted for 79 * reasons described in the comment above the soclose() implementation. This 80 * is not a general purpose close routine, and except in the specific 81 * circumstances described here, should not be used. 82 * 83 * sofree() will free a socket and its protocol state if all references on 84 * the socket have been released, and is the interface used to attempt to 85 * free a socket when a reference is removed. It is a socket layer private 86 * interface. 87 * 88 * NOTE: In addition to socreate() and soclose(), which provide a single 89 * socket reference to the consumer to be managed as required, there are two 90 * calls to explicitly manage socket references, soref(), and sorele(). 91 * Currently, these are generally required only when transitioning a socket 92 * from a listen queue to a file descriptor, in order to prevent garbage 93 * collection of the socket at an untimely moment. For a number of reasons, 94 * these interfaces are not preferred, and should be avoided. 95 * 96 * NOTE: With regard to VNETs the general rule is that callers do not set 97 * curvnet. Exceptions to this rule include soabort(), sodisconnect(), 98 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() 99 * and sorflush(), which are usually called from a pre-set VNET context. 100 * sopoll() currently does not need a VNET context to be set.
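 *
 * As an illustration of the curvnet rule above, the public entry points in
 * this file bracket their calls into the protocol in the following way
 * (this is the pattern used by sobind() and friends below):
 *
 *	CURVNET_SET(so->so_vnet);
 *	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 *	CURVNET_RESTORE();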
101 */ 102 103 #include <sys/cdefs.h> 104 __FBSDID("$FreeBSD$"); 105 106 #include "opt_inet.h" 107 #include "opt_inet6.h" 108 #include "opt_compat.h" 109 110 #include <sys/param.h> 111 #include <sys/systm.h> 112 #include <sys/fcntl.h> 113 #include <sys/limits.h> 114 #include <sys/lock.h> 115 #include <sys/mac.h> 116 #include <sys/malloc.h> 117 #include <sys/mbuf.h> 118 #include <sys/mutex.h> 119 #include <sys/domain.h> 120 #include <sys/file.h> /* for struct knote */ 121 #include <sys/kernel.h> 122 #include <sys/event.h> 123 #include <sys/eventhandler.h> 124 #include <sys/poll.h> 125 #include <sys/proc.h> 126 #include <sys/protosw.h> 127 #include <sys/socket.h> 128 #include <sys/socketvar.h> 129 #include <sys/resourcevar.h> 130 #include <net/route.h> 131 #include <sys/signalvar.h> 132 #include <sys/stat.h> 133 #include <sys/sx.h> 134 #include <sys/sysctl.h> 135 #include <sys/uio.h> 136 #include <sys/jail.h> 137 #include <sys/syslog.h> 138 #include <netinet/in.h> 139 140 #include <net/vnet.h> 141 142 #include <security/mac/mac_framework.h> 143 144 #include <vm/uma.h> 145 146 #ifdef COMPAT_FREEBSD32 147 #include <sys/mount.h> 148 #include <sys/sysent.h> 149 #include <compat/freebsd32/freebsd32.h> 150 #endif 151 152 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 153 int flags); 154 155 static void filt_sordetach(struct knote *kn); 156 static int filt_soread(struct knote *kn, long hint); 157 static void filt_sowdetach(struct knote *kn); 158 static int filt_sowrite(struct knote *kn, long hint); 159 static int filt_solisten(struct knote *kn, long hint); 160 161 static struct filterops solisten_filtops = { 162 .f_isfd = 1, 163 .f_detach = filt_sordetach, 164 .f_event = filt_solisten, 165 }; 166 static struct filterops soread_filtops = { 167 .f_isfd = 1, 168 .f_detach = filt_sordetach, 169 .f_event = filt_soread, 170 }; 171 static struct filterops sowrite_filtops = { 172 .f_isfd = 1, 173 .f_detach = filt_sowdetach, 174 .f_event = filt_sowrite, 175 }; 176 177 so_gen_t so_gencnt; /* generation count for sockets */ 178 179 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 180 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 181 182 #define VNET_SO_ASSERT(so) \ 183 VNET_ASSERT(curvnet != NULL, \ 184 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 185 186 /* 187 * Limit on the number of connections in the listen queue waiting 188 * for accept(2). 189 * NB: The orginal sysctl somaxconn is still available but hidden 190 * to prevent confusion about the actual purpose of this number. 
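 *
 * For illustration, the limit is normally tuned from userland through the
 * sysctl(3) interface; a sketch (the new value must fall in the range
 * enforced by the handler below):
 *
 *	int val = 256;
 *	sysctlbyname("kern.ipc.soacceptqueue", NULL, NULL, &val, sizeof(val));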
191 */ 192 static int somaxconn = SOMAXCONN; 193 194 static int 195 sysctl_somaxconn(SYSCTL_HANDLER_ARGS) 196 { 197 int error; 198 int val; 199 200 val = somaxconn; 201 error = sysctl_handle_int(oidp, &val, 0, req); 202 if (error || !req->newptr ) 203 return (error); 204 205 if (val < 1 || val > USHRT_MAX) 206 return (EINVAL); 207 208 somaxconn = val; 209 return (0); 210 } 211 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW, 212 0, sizeof(int), sysctl_somaxconn, "I", 213 "Maximum listen socket pending connection accept queue size"); 214 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, 215 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP, 216 0, sizeof(int), sysctl_somaxconn, "I", 217 "Maximum listen socket pending connection accept queue size (compat)"); 218 219 static int numopensockets; 220 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 221 &numopensockets, 0, "Number of open sockets"); 222 223 /* 224 * accept_mtx locks down per-socket fields relating to accept queues. See 225 * socketvar.h for an annotation of the protected fields of struct socket. 226 */ 227 struct mtx accept_mtx; 228 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 229 230 /* 231 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 232 * so_gencnt field. 233 */ 234 static struct mtx so_global_mtx; 235 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 236 237 /* 238 * General IPC sysctl name space, used by sockets and a variety of other IPC 239 * types. 240 */ 241 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); 242 243 /* 244 * Initialize the socket subsystem and set up the socket 245 * memory allocator. 246 */ 247 static uma_zone_t socket_zone; 248 int maxsockets; 249 250 static void 251 socket_zone_change(void *tag) 252 { 253 254 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 255 } 256 257 static void 258 socket_init(void *tag) 259 { 260 261 socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, 262 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 263 maxsockets = uma_zone_set_max(socket_zone, maxsockets); 264 uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); 265 EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, 266 EVENTHANDLER_PRI_FIRST); 267 } 268 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); 269 270 /* 271 * Initialise maxsockets. This SYSINIT must be run after 272 * tunable_mbinit(). 273 */ 274 static void 275 init_maxsockets(void *ignored) 276 { 277 278 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 279 maxsockets = imax(maxsockets, maxfiles); 280 } 281 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 282 283 /* 284 * Sysctl to get and set the maximum global sockets limit. Notify protocols 285 * of the change so that they can update their dependent limits as required. 286 */ 287 static int 288 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 289 { 290 int error, newmaxsockets; 291 292 newmaxsockets = maxsockets; 293 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); 294 if (error == 0 && req->newptr) { 295 if (newmaxsockets > maxsockets && 296 newmaxsockets <= maxfiles) { 297 maxsockets = newmaxsockets; 298 EVENTHANDLER_INVOKE(maxsockets_change); 299 } else 300 error = EINVAL; 301 } 302 return (error); 303 } 304 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, 305 &maxsockets, 0, sysctl_maxsockets, "IU", 306 "Maximum number of sockets avaliable"); 307 308 /* 309 * Socket operation routines. 
These routines are called by the routines in 310 * sys_socket.c or from a system process, and implement the semantics of 311 * socket operations by switching out to the protocol specific routines. 312 */ 313 314 /* 315 * Get a socket structure from our zone, and initialize it. Note that it 316 * would probably be better to allocate socket and PCB at the same time, but 317 * I'm not convinced that all the protocols can be easily modified to do 318 * this. 319 * 320 * soalloc() returns a socket with a ref count of 0. 321 */ 322 static struct socket * 323 soalloc(struct vnet *vnet) 324 { 325 struct socket *so; 326 327 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); 328 if (so == NULL) 329 return (NULL); 330 #ifdef MAC 331 if (mac_socket_init(so, M_NOWAIT) != 0) { 332 uma_zfree(socket_zone, so); 333 return (NULL); 334 } 335 #endif 336 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); 337 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); 338 sx_init(&so->so_snd.sb_sx, "so_snd_sx"); 339 sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); 340 TAILQ_INIT(&so->so_aiojobq); 341 mtx_lock(&so_global_mtx); 342 so->so_gencnt = ++so_gencnt; 343 ++numopensockets; 344 #ifdef VIMAGE 345 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", 346 __func__, __LINE__, so)); 347 vnet->vnet_sockcnt++; 348 so->so_vnet = vnet; 349 #endif 350 mtx_unlock(&so_global_mtx); 351 return (so); 352 } 353 354 /* 355 * Free the storage associated with a socket at the socket layer, tear down 356 * locks, labels, etc. All protocol state is assumed already to have been 357 * torn down (and possibly never set up) by the caller. 358 */ 359 static void 360 sodealloc(struct socket *so) 361 { 362 363 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 364 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 365 366 mtx_lock(&so_global_mtx); 367 so->so_gencnt = ++so_gencnt; 368 --numopensockets; /* Could be below, but faster here. */ 369 #ifdef VIMAGE 370 VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", 371 __func__, __LINE__, so)); 372 so->so_vnet->vnet_sockcnt--; 373 #endif 374 mtx_unlock(&so_global_mtx); 375 if (so->so_rcv.sb_hiwat) 376 (void)chgsbsize(so->so_cred->cr_uidinfo, 377 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 378 if (so->so_snd.sb_hiwat) 379 (void)chgsbsize(so->so_cred->cr_uidinfo, 380 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 381 /* remove acccept filter if one is present. */ 382 if (so->so_accf != NULL) 383 do_setopt_accept_filter(so, NULL); 384 #ifdef MAC 385 mac_socket_destroy(so); 386 #endif 387 crfree(so->so_cred); 388 sx_destroy(&so->so_snd.sb_sx); 389 sx_destroy(&so->so_rcv.sb_sx); 390 SOCKBUF_LOCK_DESTROY(&so->so_snd); 391 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 392 uma_zfree(socket_zone, so); 393 } 394 395 /* 396 * socreate returns a socket with a ref count of 1. The socket should be 397 * closed with soclose(). 398 */ 399 int 400 socreate(int dom, struct socket **aso, int type, int proto, 401 struct ucred *cred, struct thread *td) 402 { 403 struct protosw *prp; 404 struct socket *so; 405 int error; 406 407 if (proto) 408 prp = pffindproto(dom, proto, type); 409 else 410 prp = pffindtype(dom, type); 411 412 if (prp == NULL) { 413 /* No support for domain. */ 414 if (pffinddomain(dom) == NULL) 415 return (EAFNOSUPPORT); 416 /* No support for socket type. 
*/ 417 if (proto == 0 && type != 0) 418 return (EPROTOTYPE); 419 return (EPROTONOSUPPORT); 420 } 421 if (prp->pr_usrreqs->pru_attach == NULL || 422 prp->pr_usrreqs->pru_attach == pru_attach_notsupp) 423 return (EPROTONOSUPPORT); 424 425 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) 426 return (EPROTONOSUPPORT); 427 428 if (prp->pr_type != type) 429 return (EPROTOTYPE); 430 so = soalloc(CRED_TO_VNET(cred)); 431 if (so == NULL) 432 return (ENOBUFS); 433 434 TAILQ_INIT(&so->so_incomp); 435 TAILQ_INIT(&so->so_comp); 436 so->so_type = type; 437 so->so_cred = crhold(cred); 438 if ((prp->pr_domain->dom_family == PF_INET) || 439 (prp->pr_domain->dom_family == PF_INET6) || 440 (prp->pr_domain->dom_family == PF_ROUTE)) 441 so->so_fibnum = td->td_proc->p_fibnum; 442 else 443 so->so_fibnum = 0; 444 so->so_proto = prp; 445 #ifdef MAC 446 mac_socket_create(cred, so); 447 #endif 448 knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); 449 knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); 450 so->so_count = 1; 451 /* 452 * Auto-sizing of socket buffers is managed by the protocols and 453 * the appropriate flags must be set in the pru_attach function. 454 */ 455 CURVNET_SET(so->so_vnet); 456 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); 457 CURVNET_RESTORE(); 458 if (error) { 459 KASSERT(so->so_count == 1, ("socreate: so_count %d", 460 so->so_count)); 461 so->so_count = 0; 462 sodealloc(so); 463 return (error); 464 } 465 *aso = so; 466 return (0); 467 } 468 469 #ifdef REGRESSION 470 static int regression_sonewconn_earlytest = 1; 471 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 472 &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 473 #endif 474 475 /* 476 * When an attempt at a new connection is noted on a socket which accepts 477 * connections, sonewconn is called. If the connection is possible (subject 478 * to space constraints, etc.) then we allocate a new structure, properly 479 * linked into the data structure of the original socket, and return this. 480 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. 481 * 482 * Note: the ref count on the socket is 0 on return.
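 *
 * A rough sketch of the protocol-side usage described above (details vary
 * by protocol; 'head' is the listening socket on which the new connection
 * arrived):
 *
 *	so = sonewconn(head, 0);
 *	if (so == NULL)
 *		...	drop the incoming connection ...
 *	...	set up protocol state for 'so'; once the connection is
 *		established, soisconnected(so) moves it to the listening
 *		socket's completed queue and wakes up accept(2) ...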
483 */ 484 struct socket * 485 sonewconn(struct socket *head, int connstatus) 486 { 487 static struct timeval lastover; 488 static struct timeval overinterval = { 60, 0 }; 489 static int overcount; 490 491 struct socket *so; 492 int over; 493 494 ACCEPT_LOCK(); 495 over = (head->so_qlen > 3 * head->so_qlimit / 2); 496 ACCEPT_UNLOCK(); 497 #ifdef REGRESSION 498 if (regression_sonewconn_earlytest && over) { 499 #else 500 if (over) { 501 #endif 502 overcount++; 503 504 if (ratecheck(&lastover, &overinterval)) { 505 log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " 506 "%i already in queue awaiting acceptance " 507 "(%d occurrences)\n", 508 __func__, head->so_pcb, head->so_qlen, overcount); 509 510 overcount = 0; 511 } 512 513 return (NULL); 514 } 515 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", 516 __func__, __LINE__, head)); 517 so = soalloc(head->so_vnet); 518 if (so == NULL) { 519 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " 520 "limit reached or out of memory\n", 521 __func__, head->so_pcb); 522 return (NULL); 523 } 524 if ((head->so_options & SO_ACCEPTFILTER) != 0) 525 connstatus = 0; 526 so->so_head = head; 527 so->so_type = head->so_type; 528 so->so_options = head->so_options &~ SO_ACCEPTCONN; 529 so->so_linger = head->so_linger; 530 so->so_state = head->so_state | SS_NOFDREF; 531 so->so_fibnum = head->so_fibnum; 532 so->so_proto = head->so_proto; 533 so->so_cred = crhold(head->so_cred); 534 #ifdef MAC 535 mac_socket_newconn(head, so); 536 #endif 537 knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); 538 knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); 539 VNET_SO_ASSERT(head); 540 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { 541 sodealloc(so); 542 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", 543 __func__, head->so_pcb); 544 return (NULL); 545 } 546 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 547 sodealloc(so); 548 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", 549 __func__, head->so_pcb); 550 return (NULL); 551 } 552 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; 553 so->so_snd.sb_lowat = head->so_snd.sb_lowat; 554 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; 555 so->so_snd.sb_timeo = head->so_snd.sb_timeo; 556 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; 557 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; 558 so->so_state |= connstatus; 559 ACCEPT_LOCK(); 560 /* 561 * The accept socket may be tearing down but we just 562 * won a race on the ACCEPT_LOCK. 563 * However, if sctp_peeloff() is called on a 1-to-many 564 * style socket, the SO_ACCEPTCONN doesn't need to be set. 565 */ 566 if (!(head->so_options & SO_ACCEPTCONN) && 567 ((head->so_proto->pr_protocol != IPPROTO_SCTP) || 568 (head->so_type != SOCK_SEQPACKET))) { 569 SOCK_LOCK(so); 570 so->so_head = NULL; 571 sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ 572 return (NULL); 573 } 574 if (connstatus) { 575 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 576 so->so_qstate |= SQ_COMP; 577 head->so_qlen++; 578 } else { 579 /* 580 * Keep removing sockets from the head until there's room for 581 * us to insert on the tail. In pre-locking revisions, this 582 * was a simple if(), but as we could be racing with other 583 * threads and soabort() requires dropping locks, we must 584 * loop waiting for the condition to be true. 
585 */ 586 while (head->so_incqlen > head->so_qlimit) { 587 struct socket *sp; 588 sp = TAILQ_FIRST(&head->so_incomp); 589 TAILQ_REMOVE(&head->so_incomp, sp, so_list); 590 head->so_incqlen--; 591 sp->so_qstate &= ~SQ_INCOMP; 592 sp->so_head = NULL; 593 ACCEPT_UNLOCK(); 594 soabort(sp); 595 ACCEPT_LOCK(); 596 } 597 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); 598 so->so_qstate |= SQ_INCOMP; 599 head->so_incqlen++; 600 } 601 ACCEPT_UNLOCK(); 602 if (connstatus) { 603 sorwakeup(head); 604 wakeup_one(&head->so_timeo); 605 } 606 return (so); 607 } 608 609 int 610 sobind(struct socket *so, struct sockaddr *nam, struct thread *td) 611 { 612 int error; 613 614 CURVNET_SET(so->so_vnet); 615 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); 616 CURVNET_RESTORE(); 617 return (error); 618 } 619 620 int 621 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 622 { 623 int error; 624 625 CURVNET_SET(so->so_vnet); 626 error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); 627 CURVNET_RESTORE(); 628 return (error); 629 } 630 631 /* 632 * solisten() transitions a socket from a non-listening state to a listening 633 * state, but can also be used to update the listen queue depth on an 634 * existing listen socket. The protocol will call back into the sockets 635 * layer using solisten_proto_check() and solisten_proto() to check and set 636 * socket-layer listen state. Call backs are used so that the protocol can 637 * acquire both protocol and socket layer locks in whatever order is required 638 * by the protocol. 639 * 640 * Protocol implementors are advised to hold the socket lock across the 641 * socket-layer test and set to avoid races at the socket layer. 642 */ 643 int 644 solisten(struct socket *so, int backlog, struct thread *td) 645 { 646 int error; 647 648 CURVNET_SET(so->so_vnet); 649 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); 650 CURVNET_RESTORE(); 651 return (error); 652 } 653 654 int 655 solisten_proto_check(struct socket *so) 656 { 657 658 SOCK_LOCK_ASSERT(so); 659 660 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 661 SS_ISDISCONNECTING)) 662 return (EINVAL); 663 return (0); 664 } 665 666 void 667 solisten_proto(struct socket *so, int backlog) 668 { 669 670 SOCK_LOCK_ASSERT(so); 671 672 if (backlog < 0 || backlog > somaxconn) 673 backlog = somaxconn; 674 so->so_qlimit = backlog; 675 so->so_options |= SO_ACCEPTCONN; 676 } 677 678 /* 679 * Evaluate the reference count and named references on a socket; if no 680 * references remain, free it. This should be called whenever a reference is 681 * released, such as in sorele(), but also when named reference flags are 682 * cleared in socket or protocol code. 683 * 684 * sofree() will free the socket if: 685 * 686 * - There are no outstanding file descriptor references or related consumers 687 * (so_count == 0). 688 * 689 * - The socket has been closed by user space, if ever open (SS_NOFDREF). 690 * 691 * - The protocol does not have an outstanding strong reference on the socket 692 * (SS_PROTOREF). 693 * 694 * - The socket is not in a completed connection queue, so a process has been 695 * notified that it is present. If it is removed, the user process may 696 * block in accept() despite select() saying the socket was ready. 
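 *
 * The reference-drop pattern that reaches this function looks roughly like
 * the following (sorele() expects both locks held and either calls
 * sofree() or releases them itself; soclose() below is the canonical
 * caller):
 *
 *	ACCEPT_LOCK();
 *	SOCK_LOCK(so);
 *	sorele(so);	...	returns with both locks released ...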
697 */ 698 void 699 sofree(struct socket *so) 700 { 701 struct protosw *pr = so->so_proto; 702 struct socket *head; 703 704 ACCEPT_LOCK_ASSERT(); 705 SOCK_LOCK_ASSERT(so); 706 707 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || 708 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { 709 SOCK_UNLOCK(so); 710 ACCEPT_UNLOCK(); 711 return; 712 } 713 714 head = so->so_head; 715 if (head != NULL) { 716 KASSERT((so->so_qstate & SQ_COMP) != 0 || 717 (so->so_qstate & SQ_INCOMP) != 0, 718 ("sofree: so_head != NULL, but neither SQ_COMP nor " 719 "SQ_INCOMP")); 720 KASSERT((so->so_qstate & SQ_COMP) == 0 || 721 (so->so_qstate & SQ_INCOMP) == 0, 722 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); 723 TAILQ_REMOVE(&head->so_incomp, so, so_list); 724 head->so_incqlen--; 725 so->so_qstate &= ~SQ_INCOMP; 726 so->so_head = NULL; 727 } 728 KASSERT((so->so_qstate & SQ_COMP) == 0 && 729 (so->so_qstate & SQ_INCOMP) == 0, 730 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", 731 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); 732 if (so->so_options & SO_ACCEPTCONN) { 733 KASSERT((TAILQ_EMPTY(&so->so_comp)), 734 ("sofree: so_comp populated")); 735 KASSERT((TAILQ_EMPTY(&so->so_incomp)), 736 ("sofree: so_incomp populated")); 737 } 738 SOCK_UNLOCK(so); 739 ACCEPT_UNLOCK(); 740 741 VNET_SO_ASSERT(so); 742 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 743 (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb); 744 if (pr->pr_usrreqs->pru_detach != NULL) 745 (*pr->pr_usrreqs->pru_detach)(so); 746 747 /* 748 * From this point on, we assume that no other references to this 749 * socket exist anywhere else in the stack. Therefore, no locks need 750 * to be acquired or held. 751 * 752 * We used to do a lot of socket buffer and socket locking here, as 753 * well as invoke sorflush() and perform wakeups. The direct call to 754 * dom_dispose() and sbrelease_internal() are an inlining of what was 755 * necessary from sorflush(). 756 * 757 * Notice that the socket buffer and kqueue state are torn down 758 * before calling pru_detach. This means that protocols shold not 759 * assume they can perform socket wakeups, etc, in their detach code. 760 */ 761 sbdestroy(&so->so_snd, so); 762 sbdestroy(&so->so_rcv, so); 763 seldrain(&so->so_snd.sb_sel); 764 seldrain(&so->so_rcv.sb_sel); 765 knlist_destroy(&so->so_rcv.sb_sel.si_note); 766 knlist_destroy(&so->so_snd.sb_sel.si_note); 767 sodealloc(so); 768 } 769 770 /* 771 * Close a socket on last file table reference removal. Initiate disconnect 772 * if connected. Free socket when disconnect complete. 773 * 774 * This function will sorele() the socket. Note that soclose() may be called 775 * prior to the ref count reaching zero. The actual socket structure will 776 * not be freed until the ref count reaches zero. 
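 *
 * For illustration, the blocking disconnect handled below is what a
 * process opts into with SO_LINGER; roughly, from userland:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(s);	...	may now block for up to ~5 seconds while
 *				the disconnect completes, unless the
 *				socket is non-blocking ...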
777 */ 778 int 779 soclose(struct socket *so) 780 { 781 int error = 0; 782 783 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); 784 785 CURVNET_SET(so->so_vnet); 786 funsetown(&so->so_sigio); 787 if (so->so_state & SS_ISCONNECTED) { 788 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 789 error = sodisconnect(so); 790 if (error) { 791 if (error == ENOTCONN) 792 error = 0; 793 goto drop; 794 } 795 } 796 if (so->so_options & SO_LINGER) { 797 if ((so->so_state & SS_ISDISCONNECTING) && 798 (so->so_state & SS_NBIO)) 799 goto drop; 800 while (so->so_state & SS_ISCONNECTED) { 801 error = tsleep(&so->so_timeo, 802 PSOCK | PCATCH, "soclos", 803 so->so_linger * hz); 804 if (error) 805 break; 806 } 807 } 808 } 809 810 drop: 811 if (so->so_proto->pr_usrreqs->pru_close != NULL) 812 (*so->so_proto->pr_usrreqs->pru_close)(so); 813 ACCEPT_LOCK(); 814 if (so->so_options & SO_ACCEPTCONN) { 815 struct socket *sp; 816 /* 817 * Prevent new additions to the accept queues due 818 * to ACCEPT_LOCK races while we are draining them. 819 */ 820 so->so_options &= ~SO_ACCEPTCONN; 821 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 822 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 823 so->so_incqlen--; 824 sp->so_qstate &= ~SQ_INCOMP; 825 sp->so_head = NULL; 826 ACCEPT_UNLOCK(); 827 soabort(sp); 828 ACCEPT_LOCK(); 829 } 830 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 831 TAILQ_REMOVE(&so->so_comp, sp, so_list); 832 so->so_qlen--; 833 sp->so_qstate &= ~SQ_COMP; 834 sp->so_head = NULL; 835 ACCEPT_UNLOCK(); 836 soabort(sp); 837 ACCEPT_LOCK(); 838 } 839 KASSERT((TAILQ_EMPTY(&so->so_comp)), 840 ("%s: so_comp populated", __func__)); 841 KASSERT((TAILQ_EMPTY(&so->so_incomp)), 842 ("%s: so_incomp populated", __func__)); 843 } 844 SOCK_LOCK(so); 845 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 846 so->so_state |= SS_NOFDREF; 847 sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ 848 CURVNET_RESTORE(); 849 return (error); 850 } 851 852 /* 853 * soabort() is used to abruptly tear down a connection, such as when a 854 * resource limit is reached (listen queue depth exceeded), or if a listen 855 * socket is closed while there are sockets waiting to be accepted. 856 * 857 * This interface is tricky, because it is called on an unreferenced socket, 858 * and must be called only by a thread that has actually removed the socket 859 * from the listen queue it was on, or races with other threads are risked. 860 * 861 * This interface will call into the protocol code, so must not be called 862 * with any socket locks held. Protocols do call it while holding their own 863 * recursible protocol mutexes, but this is something that should be subject 864 * to review in the future. 865 */ 866 void 867 soabort(struct socket *so) 868 { 869 870 /* 871 * In as much as is possible, assert that no references to this 872 * socket are held. This is not quite the same as asserting that the 873 * current thread is responsible for arranging for no references, but 874 * is as close as we can get for now. 
875 */ 876 KASSERT(so->so_count == 0, ("soabort: so_count")); 877 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 878 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 879 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); 880 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); 881 VNET_SO_ASSERT(so); 882 883 if (so->so_proto->pr_usrreqs->pru_abort != NULL) 884 (*so->so_proto->pr_usrreqs->pru_abort)(so); 885 ACCEPT_LOCK(); 886 SOCK_LOCK(so); 887 sofree(so); 888 } 889 890 int 891 soaccept(struct socket *so, struct sockaddr **nam) 892 { 893 int error; 894 895 SOCK_LOCK(so); 896 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 897 so->so_state &= ~SS_NOFDREF; 898 SOCK_UNLOCK(so); 899 900 CURVNET_SET(so->so_vnet); 901 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 902 CURVNET_RESTORE(); 903 return (error); 904 } 905 906 int 907 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) 908 { 909 910 return (soconnectat(AT_FDCWD, so, nam, td)); 911 } 912 913 int 914 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 915 { 916 int error; 917 918 if (so->so_options & SO_ACCEPTCONN) 919 return (EOPNOTSUPP); 920 921 CURVNET_SET(so->so_vnet); 922 /* 923 * If protocol is connection-based, can only connect once. 924 * Otherwise, if connected, try to disconnect first. This allows 925 * user to disconnect by connecting to, e.g., a null address. 926 */ 927 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 928 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 929 (error = sodisconnect(so)))) { 930 error = EISCONN; 931 } else { 932 /* 933 * Prevent accumulated error from previous connection from 934 * biting us. 935 */ 936 so->so_error = 0; 937 if (fd == AT_FDCWD) { 938 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, 939 nam, td); 940 } else { 941 error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, 942 so, nam, td); 943 } 944 } 945 CURVNET_RESTORE(); 946 947 return (error); 948 } 949 950 int 951 soconnect2(struct socket *so1, struct socket *so2) 952 { 953 int error; 954 955 CURVNET_SET(so1->so_vnet); 956 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); 957 CURVNET_RESTORE(); 958 return (error); 959 } 960 961 int 962 sodisconnect(struct socket *so) 963 { 964 int error; 965 966 if ((so->so_state & SS_ISCONNECTED) == 0) 967 return (ENOTCONN); 968 if (so->so_state & SS_ISDISCONNECTING) 969 return (EALREADY); 970 VNET_SO_ASSERT(so); 971 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 972 return (error); 973 } 974 975 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 976 977 int 978 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, 979 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 980 { 981 long space; 982 ssize_t resid; 983 int clen = 0, error, dontroute; 984 985 KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); 986 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 987 ("sosend_dgram: !PR_ATOMIC")); 988 989 if (uio != NULL) 990 resid = uio->uio_resid; 991 else 992 resid = top->m_pkthdr.len; 993 /* 994 * In theory resid should be unsigned. However, space must be 995 * signed, as it might be less than 0 if we over-committed, and we 996 * must use a signed comparison of space and resid. On the other 997 * hand, a negative resid causes us to loop sending 0-length 998 * segments to the protocol. 
999 */ 1000 if (resid < 0) { 1001 error = EINVAL; 1002 goto out; 1003 } 1004 1005 dontroute = 1006 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 1007 if (td != NULL) 1008 td->td_ru.ru_msgsnd++; 1009 if (control != NULL) 1010 clen = control->m_len; 1011 1012 SOCKBUF_LOCK(&so->so_snd); 1013 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1014 SOCKBUF_UNLOCK(&so->so_snd); 1015 error = EPIPE; 1016 goto out; 1017 } 1018 if (so->so_error) { 1019 error = so->so_error; 1020 so->so_error = 0; 1021 SOCKBUF_UNLOCK(&so->so_snd); 1022 goto out; 1023 } 1024 if ((so->so_state & SS_ISCONNECTED) == 0) { 1025 /* 1026 * `sendto' and `sendmsg' is allowed on a connection-based 1027 * socket if it supports implied connect. Return ENOTCONN if 1028 * not connected and no address is supplied. 1029 */ 1030 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1031 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1032 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1033 !(resid == 0 && clen != 0)) { 1034 SOCKBUF_UNLOCK(&so->so_snd); 1035 error = ENOTCONN; 1036 goto out; 1037 } 1038 } else if (addr == NULL) { 1039 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1040 error = ENOTCONN; 1041 else 1042 error = EDESTADDRREQ; 1043 SOCKBUF_UNLOCK(&so->so_snd); 1044 goto out; 1045 } 1046 } 1047 1048 /* 1049 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1050 * problem and need fixing. 1051 */ 1052 space = sbspace(&so->so_snd); 1053 if (flags & MSG_OOB) 1054 space += 1024; 1055 space -= clen; 1056 SOCKBUF_UNLOCK(&so->so_snd); 1057 if (resid > space) { 1058 error = EMSGSIZE; 1059 goto out; 1060 } 1061 if (uio == NULL) { 1062 resid = 0; 1063 if (flags & MSG_EOR) 1064 top->m_flags |= M_EOR; 1065 } else { 1066 /* 1067 * Copy the data from userland into a mbuf chain. 1068 * If no data is to be copied in, a single empty mbuf 1069 * is returned. 1070 */ 1071 top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, 1072 (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); 1073 if (top == NULL) { 1074 error = EFAULT; /* only possible error */ 1075 goto out; 1076 } 1077 space -= resid - uio->uio_resid; 1078 resid = uio->uio_resid; 1079 } 1080 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1081 /* 1082 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1083 * than with. 1084 */ 1085 if (dontroute) { 1086 SOCK_LOCK(so); 1087 so->so_options |= SO_DONTROUTE; 1088 SOCK_UNLOCK(so); 1089 } 1090 /* 1091 * XXX all the SBS_CANTSENDMORE checks previously done could be out 1092 * of date. We could have recieved a reset packet in an interrupt or 1093 * maybe we slept while doing page faults in uiomove() etc. We could 1094 * probably recheck again inside the locking protection here, but 1095 * there are probably other places that this also happens. We must 1096 * rethink this. 1097 */ 1098 VNET_SO_ASSERT(so); 1099 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1100 (flags & MSG_OOB) ? PRUS_OOB : 1101 /* 1102 * If the user set MSG_EOF, the protocol understands this flag and 1103 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. 1104 */ 1105 ((flags & MSG_EOF) && 1106 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1107 (resid <= 0)) ? 1108 PRUS_EOF : 1109 /* If there is more to send set PRUS_MORETOCOME */ 1110 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 1111 top, addr, control, td); 1112 if (dontroute) { 1113 SOCK_LOCK(so); 1114 so->so_options &= ~SO_DONTROUTE; 1115 SOCK_UNLOCK(so); 1116 } 1117 clen = 0; 1118 control = NULL; 1119 top = NULL; 1120 out: 1121 if (top != NULL) 1122 m_freem(top); 1123 if (control != NULL) 1124 m_freem(control); 1125 return (error); 1126 } 1127 1128 /* 1129 * Send on a socket. If send must go all at once and message is larger than 1130 * send buffering, then hard error. Lock against other senders. If must go 1131 * all at once and not enough room now, then inform user that this would 1132 * block and do nothing. Otherwise, if nonblocking, send as much as 1133 * possible. The data to be sent is described by "uio" if nonzero, otherwise 1134 * by the mbuf chain "top" (which must be null if uio is not). Data provided 1135 * in mbuf chain must be small enough to send all at once. 1136 * 1137 * Returns nonzero on error, timeout or signal; callers must check for short 1138 * counts if EINTR/ERESTART are returned. Data and control buffers are freed 1139 * on return. 1140 */ 1141 int 1142 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, 1143 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1144 { 1145 long space; 1146 ssize_t resid; 1147 int clen = 0, error, dontroute; 1148 int atomic = sosendallatonce(so) || top; 1149 1150 if (uio != NULL) 1151 resid = uio->uio_resid; 1152 else 1153 resid = top->m_pkthdr.len; 1154 /* 1155 * In theory resid should be unsigned. However, space must be 1156 * signed, as it might be less than 0 if we over-committed, and we 1157 * must use a signed comparison of space and resid. On the other 1158 * hand, a negative resid causes us to loop sending 0-length 1159 * segments to the protocol. 1160 * 1161 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1162 * type sockets since that's an error. 1163 */ 1164 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1165 error = EINVAL; 1166 goto out; 1167 } 1168 1169 dontroute = 1170 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 1171 (so->so_proto->pr_flags & PR_ATOMIC); 1172 if (td != NULL) 1173 td->td_ru.ru_msgsnd++; 1174 if (control != NULL) 1175 clen = control->m_len; 1176 1177 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 1178 if (error) 1179 goto out; 1180 1181 restart: 1182 do { 1183 SOCKBUF_LOCK(&so->so_snd); 1184 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1185 SOCKBUF_UNLOCK(&so->so_snd); 1186 error = EPIPE; 1187 goto release; 1188 } 1189 if (so->so_error) { 1190 error = so->so_error; 1191 so->so_error = 0; 1192 SOCKBUF_UNLOCK(&so->so_snd); 1193 goto release; 1194 } 1195 if ((so->so_state & SS_ISCONNECTED) == 0) { 1196 /* 1197 * `sendto' and `sendmsg' is allowed on a connection- 1198 * based socket if it supports implied connect. 1199 * Return ENOTCONN if not connected and no address is 1200 * supplied. 
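 *
 * Roughly, from the perspective of userland (a sketch of the cases
 * handled just below; 'sin' is a valid destination address):
 *
 *	send(s_stream, buf, len, 0)                  ... ENOTCONN if the
 *	                                                 socket was never
 *	                                                 connected
 *	sendto(s_dgram, buf, len, 0, NULL, 0)        ... EDESTADDRREQ
 *	sendto(s_dgram, buf, len, 0,
 *	    (struct sockaddr *)&sin, sizeof(sin))    ... allowed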
1201 */ 1202 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1203 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1204 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1205 !(resid == 0 && clen != 0)) { 1206 SOCKBUF_UNLOCK(&so->so_snd); 1207 error = ENOTCONN; 1208 goto release; 1209 } 1210 } else if (addr == NULL) { 1211 SOCKBUF_UNLOCK(&so->so_snd); 1212 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 1213 error = ENOTCONN; 1214 else 1215 error = EDESTADDRREQ; 1216 goto release; 1217 } 1218 } 1219 space = sbspace(&so->so_snd); 1220 if (flags & MSG_OOB) 1221 space += 1024; 1222 if ((atomic && resid > so->so_snd.sb_hiwat) || 1223 clen > so->so_snd.sb_hiwat) { 1224 SOCKBUF_UNLOCK(&so->so_snd); 1225 error = EMSGSIZE; 1226 goto release; 1227 } 1228 if (space < resid + clen && 1229 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 1230 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { 1231 SOCKBUF_UNLOCK(&so->so_snd); 1232 error = EWOULDBLOCK; 1233 goto release; 1234 } 1235 error = sbwait(&so->so_snd); 1236 SOCKBUF_UNLOCK(&so->so_snd); 1237 if (error) 1238 goto release; 1239 goto restart; 1240 } 1241 SOCKBUF_UNLOCK(&so->so_snd); 1242 space -= clen; 1243 do { 1244 if (uio == NULL) { 1245 resid = 0; 1246 if (flags & MSG_EOR) 1247 top->m_flags |= M_EOR; 1248 } else { 1249 /* 1250 * Copy the data from userland into a mbuf 1251 * chain. If no data is to be copied in, 1252 * a single empty mbuf is returned. 1253 */ 1254 top = m_uiotombuf(uio, M_WAITOK, space, 1255 (atomic ? max_hdr : 0), 1256 (atomic ? M_PKTHDR : 0) | 1257 ((flags & MSG_EOR) ? M_EOR : 0)); 1258 if (top == NULL) { 1259 error = EFAULT; /* only possible error */ 1260 goto release; 1261 } 1262 space -= resid - uio->uio_resid; 1263 resid = uio->uio_resid; 1264 } 1265 if (dontroute) { 1266 SOCK_LOCK(so); 1267 so->so_options |= SO_DONTROUTE; 1268 SOCK_UNLOCK(so); 1269 } 1270 /* 1271 * XXX all the SBS_CANTSENDMORE checks previously 1272 * done could be out of date. We could have recieved 1273 * a reset packet in an interrupt or maybe we slept 1274 * while doing page faults in uiomove() etc. We 1275 * could probably recheck again inside the locking 1276 * protection here, but there are probably other 1277 * places that this also happens. We must rethink 1278 * this. 1279 */ 1280 VNET_SO_ASSERT(so); 1281 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1282 (flags & MSG_OOB) ? PRUS_OOB : 1283 /* 1284 * If the user set MSG_EOF, the protocol understands 1285 * this flag and nothing left to send then use 1286 * PRU_SEND_EOF instead of PRU_SEND. 1287 */ 1288 ((flags & MSG_EOF) && 1289 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1290 (resid <= 0)) ? 1291 PRUS_EOF : 1292 /* If there is more to send set PRUS_MORETOCOME. */ 1293 (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0, 1294 top, addr, control, td); 1295 if (dontroute) { 1296 SOCK_LOCK(so); 1297 so->so_options &= ~SO_DONTROUTE; 1298 SOCK_UNLOCK(so); 1299 } 1300 clen = 0; 1301 control = NULL; 1302 top = NULL; 1303 if (error) 1304 goto release; 1305 } while (resid && space > 0); 1306 } while (resid); 1307 1308 release: 1309 sbunlock(&so->so_snd); 1310 out: 1311 if (top != NULL) 1312 m_freem(top); 1313 if (control != NULL) 1314 m_freem(control); 1315 return (error); 1316 } 1317 1318 int 1319 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1320 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 1321 { 1322 int error; 1323 1324 CURVNET_SET(so->so_vnet); 1325 error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, 1326 control, flags, td); 1327 CURVNET_RESTORE(); 1328 return (error); 1329 } 1330 1331 /* 1332 * The part of soreceive() that implements reading non-inline out-of-band 1333 * data from a socket. For more complete comments, see soreceive(), from 1334 * which this code originated. 1335 * 1336 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1337 * unable to return an mbuf chain to the caller. 1338 */ 1339 static int 1340 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1341 { 1342 struct protosw *pr = so->so_proto; 1343 struct mbuf *m; 1344 int error; 1345 1346 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1347 VNET_SO_ASSERT(so); 1348 1349 m = m_get(M_WAITOK, MT_DATA); 1350 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 1351 if (error) 1352 goto bad; 1353 do { 1354 error = uiomove(mtod(m, void *), 1355 (int) min(uio->uio_resid, m->m_len), uio); 1356 m = m_free(m); 1357 } while (uio->uio_resid && error == 0 && m); 1358 bad: 1359 if (m != NULL) 1360 m_freem(m); 1361 return (error); 1362 } 1363 1364 /* 1365 * Following replacement or removal of the first mbuf on the first mbuf chain 1366 * of a socket buffer, push necessary state changes back into the socket 1367 * buffer so that other consumers see the values consistently. 'nextrecord' 1368 * is the callers locally stored value of the original value of 1369 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1370 * NOTE: 'nextrecord' may be NULL. 1371 */ 1372 static __inline void 1373 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 1374 { 1375 1376 SOCKBUF_LOCK_ASSERT(sb); 1377 /* 1378 * First, update for the new value of nextrecord. If necessary, make 1379 * it the first record. 1380 */ 1381 if (sb->sb_mb != NULL) 1382 sb->sb_mb->m_nextpkt = nextrecord; 1383 else 1384 sb->sb_mb = nextrecord; 1385 1386 /* 1387 * Now update any dependent socket buffer fields to reflect the new 1388 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 1389 * addition of a second clause that takes care of the case where 1390 * sb_mb has been updated, but remains the last record. 1391 */ 1392 if (sb->sb_mb == NULL) { 1393 sb->sb_mbtail = NULL; 1394 sb->sb_lastrecord = NULL; 1395 } else if (sb->sb_mb->m_nextpkt == NULL) 1396 sb->sb_lastrecord = sb->sb_mb; 1397 } 1398 1399 /* 1400 * Implement receive operations on a socket. We depend on the way that 1401 * records are added to the sockbuf by sbappend. In particular, each record 1402 * (mbufs linked through m_next) must begin with an address if the protocol 1403 * so specifies, followed by an optional mbuf or mbufs containing ancillary 1404 * data, and then zero or more mbufs of data. 
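 *
 * Schematically, a single record then looks like this (which of these
 * mbufs are present depends on the protocol and the sender):
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL ...] -> [MT_DATA ...]   (m_next)
 *	             |
 *	         m_nextpkt
 *	             v
 *	          next record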
In order to allow parallelism 1405 * between network receive and copying to user space, as well as avoid 1406 * sleeping with a mutex held, we release the socket buffer mutex during the 1407 * user space copy. Although the sockbuf is locked, new data may still be 1408 * appended, and thus we must maintain consistency of the sockbuf during that 1409 * time. 1410 * 1411 * The caller may receive the data as a single mbuf chain by supplying an 1412 * mbuf **mp0 for use in returning the chain. The uio is then used only for 1413 * the count in uio_resid. 1414 */ 1415 int 1416 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, 1417 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1418 { 1419 struct mbuf *m, **mp; 1420 int flags, error, offset; 1421 ssize_t len; 1422 struct protosw *pr = so->so_proto; 1423 struct mbuf *nextrecord; 1424 int moff, type = 0; 1425 ssize_t orig_resid = uio->uio_resid; 1426 1427 mp = mp0; 1428 if (psa != NULL) 1429 *psa = NULL; 1430 if (controlp != NULL) 1431 *controlp = NULL; 1432 if (flagsp != NULL) 1433 flags = *flagsp &~ MSG_EOR; 1434 else 1435 flags = 0; 1436 if (flags & MSG_OOB) 1437 return (soreceive_rcvoob(so, uio, flags)); 1438 if (mp != NULL) 1439 *mp = NULL; 1440 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1441 && uio->uio_resid) { 1442 VNET_SO_ASSERT(so); 1443 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 1444 } 1445 1446 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 1447 if (error) 1448 return (error); 1449 1450 restart: 1451 SOCKBUF_LOCK(&so->so_rcv); 1452 m = so->so_rcv.sb_mb; 1453 /* 1454 * If we have less data than requested, block awaiting more (subject 1455 * to any timeout) if: 1456 * 1. the current count is less than the low water mark, or 1457 * 2. MSG_DONTWAIT is not set 1458 */ 1459 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1460 so->so_rcv.sb_cc < uio->uio_resid) && 1461 so->so_rcv.sb_cc < so->so_rcv.sb_lowat && 1462 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1463 KASSERT(m != NULL || !so->so_rcv.sb_cc, 1464 ("receive: m == %p so->so_rcv.sb_cc == %u", 1465 m, so->so_rcv.sb_cc)); 1466 if (so->so_error) { 1467 if (m != NULL) 1468 goto dontblock; 1469 error = so->so_error; 1470 if ((flags & MSG_PEEK) == 0) 1471 so->so_error = 0; 1472 SOCKBUF_UNLOCK(&so->so_rcv); 1473 goto release; 1474 } 1475 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1476 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1477 if (m == NULL) { 1478 SOCKBUF_UNLOCK(&so->so_rcv); 1479 goto release; 1480 } else 1481 goto dontblock; 1482 } 1483 for (; m != NULL; m = m->m_next) 1484 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1485 m = so->so_rcv.sb_mb; 1486 goto dontblock; 1487 } 1488 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1489 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1490 SOCKBUF_UNLOCK(&so->so_rcv); 1491 error = ENOTCONN; 1492 goto release; 1493 } 1494 if (uio->uio_resid == 0) { 1495 SOCKBUF_UNLOCK(&so->so_rcv); 1496 goto release; 1497 } 1498 if ((so->so_state & SS_NBIO) || 1499 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1500 SOCKBUF_UNLOCK(&so->so_rcv); 1501 error = EWOULDBLOCK; 1502 goto release; 1503 } 1504 SBLASTRECORDCHK(&so->so_rcv); 1505 SBLASTMBUFCHK(&so->so_rcv); 1506 error = sbwait(&so->so_rcv); 1507 SOCKBUF_UNLOCK(&so->so_rcv); 1508 if (error) 1509 goto release; 1510 goto restart; 1511 } 1512 dontblock: 1513 /* 1514 * From this point onward, we maintain 'nextrecord' as a cache of the 1515 * pointer to the next record in the socket buffer. 
We must keep the 1516 * various socket buffer pointers and local stack versions of the 1517 * pointers in sync, pushing out modifications before dropping the 1518 * socket buffer mutex, and re-reading them when picking it up. 1519 * 1520 * Otherwise, we will race with the network stack appending new data 1521 * or records onto the socket buffer by using inconsistent/stale 1522 * versions of the field, possibly resulting in socket buffer 1523 * corruption. 1524 * 1525 * By holding the high-level sblock(), we prevent simultaneous 1526 * readers from pulling off the front of the socket buffer. 1527 */ 1528 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1529 if (uio->uio_td) 1530 uio->uio_td->td_ru.ru_msgrcv++; 1531 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1532 SBLASTRECORDCHK(&so->so_rcv); 1533 SBLASTMBUFCHK(&so->so_rcv); 1534 nextrecord = m->m_nextpkt; 1535 if (pr->pr_flags & PR_ADDR) { 1536 KASSERT(m->m_type == MT_SONAME, 1537 ("m->m_type == %d", m->m_type)); 1538 orig_resid = 0; 1539 if (psa != NULL) 1540 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1541 M_NOWAIT); 1542 if (flags & MSG_PEEK) { 1543 m = m->m_next; 1544 } else { 1545 sbfree(&so->so_rcv, m); 1546 so->so_rcv.sb_mb = m_free(m); 1547 m = so->so_rcv.sb_mb; 1548 sockbuf_pushsync(&so->so_rcv, nextrecord); 1549 } 1550 } 1551 1552 /* 1553 * Process one or more MT_CONTROL mbufs present before any data mbufs 1554 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1555 * just copy the data; if !MSG_PEEK, we call into the protocol to 1556 * perform externalization (or freeing if controlp == NULL). 1557 */ 1558 if (m != NULL && m->m_type == MT_CONTROL) { 1559 struct mbuf *cm = NULL, *cmn; 1560 struct mbuf **cme = &cm; 1561 1562 do { 1563 if (flags & MSG_PEEK) { 1564 if (controlp != NULL) { 1565 *controlp = m_copy(m, 0, m->m_len); 1566 controlp = &(*controlp)->m_next; 1567 } 1568 m = m->m_next; 1569 } else { 1570 sbfree(&so->so_rcv, m); 1571 so->so_rcv.sb_mb = m->m_next; 1572 m->m_next = NULL; 1573 *cme = m; 1574 cme = &(*cme)->m_next; 1575 m = so->so_rcv.sb_mb; 1576 } 1577 } while (m != NULL && m->m_type == MT_CONTROL); 1578 if ((flags & MSG_PEEK) == 0) 1579 sockbuf_pushsync(&so->so_rcv, nextrecord); 1580 while (cm != NULL) { 1581 cmn = cm->m_next; 1582 cm->m_next = NULL; 1583 if (pr->pr_domain->dom_externalize != NULL) { 1584 SOCKBUF_UNLOCK(&so->so_rcv); 1585 VNET_SO_ASSERT(so); 1586 error = (*pr->pr_domain->dom_externalize) 1587 (cm, controlp, flags); 1588 SOCKBUF_LOCK(&so->so_rcv); 1589 } else if (controlp != NULL) 1590 *controlp = cm; 1591 else 1592 m_freem(cm); 1593 if (controlp != NULL) { 1594 orig_resid = 0; 1595 while (*controlp != NULL) 1596 controlp = &(*controlp)->m_next; 1597 } 1598 cm = cmn; 1599 } 1600 if (m != NULL) 1601 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1602 else 1603 nextrecord = so->so_rcv.sb_mb; 1604 orig_resid = 0; 1605 } 1606 if (m != NULL) { 1607 if ((flags & MSG_PEEK) == 0) { 1608 KASSERT(m->m_nextpkt == nextrecord, 1609 ("soreceive: post-control, nextrecord !sync")); 1610 if (nextrecord == NULL) { 1611 KASSERT(so->so_rcv.sb_mb == m, 1612 ("soreceive: post-control, sb_mb!=m")); 1613 KASSERT(so->so_rcv.sb_lastrecord == m, 1614 ("soreceive: post-control, lastrecord!=m")); 1615 } 1616 } 1617 type = m->m_type; 1618 if (type == MT_OOBDATA) 1619 flags |= MSG_OOB; 1620 } else { 1621 if ((flags & MSG_PEEK) == 0) { 1622 KASSERT(so->so_rcv.sb_mb == nextrecord, 1623 ("soreceive: sb_mb != nextrecord")); 1624 if (so->so_rcv.sb_mb == NULL) { 1625 KASSERT(so->so_rcv.sb_lastrecord == NULL, 
1626 ("soreceive: sb_lastercord != NULL")); 1627 } 1628 } 1629 } 1630 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1631 SBLASTRECORDCHK(&so->so_rcv); 1632 SBLASTMBUFCHK(&so->so_rcv); 1633 1634 /* 1635 * Now continue to read any data mbufs off of the head of the socket 1636 * buffer until the read request is satisfied. Note that 'type' is 1637 * used to store the type of any mbuf reads that have happened so far 1638 * such that soreceive() can stop reading if the type changes, which 1639 * causes soreceive() to return only one of regular data and inline 1640 * out-of-band data in a single socket receive operation. 1641 */ 1642 moff = 0; 1643 offset = 0; 1644 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1645 /* 1646 * If the type of mbuf has changed since the last mbuf 1647 * examined ('type'), end the receive operation. 1648 */ 1649 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1650 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 1651 if (type != m->m_type) 1652 break; 1653 } else if (type == MT_OOBDATA) 1654 break; 1655 else 1656 KASSERT(m->m_type == MT_DATA, 1657 ("m->m_type == %d", m->m_type)); 1658 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1659 len = uio->uio_resid; 1660 if (so->so_oobmark && len > so->so_oobmark - offset) 1661 len = so->so_oobmark - offset; 1662 if (len > m->m_len - moff) 1663 len = m->m_len - moff; 1664 /* 1665 * If mp is set, just pass back the mbufs. Otherwise copy 1666 * them out via the uio, then free. Sockbuf must be 1667 * consistent here (points to current mbuf, it points to next 1668 * record) when we drop priority; we must note any additions 1669 * to the sockbuf when we block interrupts again. 1670 */ 1671 if (mp == NULL) { 1672 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1673 SBLASTRECORDCHK(&so->so_rcv); 1674 SBLASTMBUFCHK(&so->so_rcv); 1675 SOCKBUF_UNLOCK(&so->so_rcv); 1676 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1677 SOCKBUF_LOCK(&so->so_rcv); 1678 if (error) { 1679 /* 1680 * The MT_SONAME mbuf has already been removed 1681 * from the record, so it is necessary to 1682 * remove the data mbufs, if any, to preserve 1683 * the invariant in the case of PR_ADDR that 1684 * requires MT_SONAME mbufs at the head of 1685 * each record. 1686 */ 1687 if (m && pr->pr_flags & PR_ATOMIC && 1688 ((flags & MSG_PEEK) == 0)) 1689 (void)sbdroprecord_locked(&so->so_rcv); 1690 SOCKBUF_UNLOCK(&so->so_rcv); 1691 goto release; 1692 } 1693 } else 1694 uio->uio_resid -= len; 1695 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1696 if (len == m->m_len - moff) { 1697 if (m->m_flags & M_EOR) 1698 flags |= MSG_EOR; 1699 if (flags & MSG_PEEK) { 1700 m = m->m_next; 1701 moff = 0; 1702 } else { 1703 nextrecord = m->m_nextpkt; 1704 sbfree(&so->so_rcv, m); 1705 if (mp != NULL) { 1706 m->m_nextpkt = NULL; 1707 *mp = m; 1708 mp = &m->m_next; 1709 so->so_rcv.sb_mb = m = m->m_next; 1710 *mp = NULL; 1711 } else { 1712 so->so_rcv.sb_mb = m_free(m); 1713 m = so->so_rcv.sb_mb; 1714 } 1715 sockbuf_pushsync(&so->so_rcv, nextrecord); 1716 SBLASTRECORDCHK(&so->so_rcv); 1717 SBLASTMBUFCHK(&so->so_rcv); 1718 } 1719 } else { 1720 if (flags & MSG_PEEK) 1721 moff += len; 1722 else { 1723 if (mp != NULL) { 1724 if (flags & MSG_DONTWAIT) { 1725 *mp = m_copym(m, 0, len, 1726 M_NOWAIT); 1727 if (*mp == NULL) { 1728 /* 1729 * m_copym() couldn't 1730 * allocate an mbuf. 1731 * Adjust uio_resid back 1732 * (it was adjusted 1733 * down by len bytes, 1734 * which we didn't end 1735 * up "copying" over). 
1736 */ 1737 uio->uio_resid += len; 1738 break; 1739 } 1740 } else { 1741 SOCKBUF_UNLOCK(&so->so_rcv); 1742 *mp = m_copym(m, 0, len, 1743 M_WAITOK); 1744 SOCKBUF_LOCK(&so->so_rcv); 1745 } 1746 } 1747 m->m_data += len; 1748 m->m_len -= len; 1749 so->so_rcv.sb_cc -= len; 1750 } 1751 } 1752 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1753 if (so->so_oobmark) { 1754 if ((flags & MSG_PEEK) == 0) { 1755 so->so_oobmark -= len; 1756 if (so->so_oobmark == 0) { 1757 so->so_rcv.sb_state |= SBS_RCVATMARK; 1758 break; 1759 } 1760 } else { 1761 offset += len; 1762 if (offset == so->so_oobmark) 1763 break; 1764 } 1765 } 1766 if (flags & MSG_EOR) 1767 break; 1768 /* 1769 * If the MSG_WAITALL flag is set (for non-atomic socket), we 1770 * must not quit until "uio->uio_resid == 0" or an error 1771 * termination. If a signal/timeout occurs, return with a 1772 * short count but without error. Keep sockbuf locked 1773 * against other readers. 1774 */ 1775 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1776 !sosendallatonce(so) && nextrecord == NULL) { 1777 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1778 if (so->so_error || 1779 so->so_rcv.sb_state & SBS_CANTRCVMORE) 1780 break; 1781 /* 1782 * Notify the protocol that some data has been 1783 * drained before blocking. 1784 */ 1785 if (pr->pr_flags & PR_WANTRCVD) { 1786 SOCKBUF_UNLOCK(&so->so_rcv); 1787 VNET_SO_ASSERT(so); 1788 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1789 SOCKBUF_LOCK(&so->so_rcv); 1790 } 1791 SBLASTRECORDCHK(&so->so_rcv); 1792 SBLASTMBUFCHK(&so->so_rcv); 1793 /* 1794 * We could receive some data while was notifying 1795 * the protocol. Skip blocking in this case. 1796 */ 1797 if (so->so_rcv.sb_mb == NULL) { 1798 error = sbwait(&so->so_rcv); 1799 if (error) { 1800 SOCKBUF_UNLOCK(&so->so_rcv); 1801 goto release; 1802 } 1803 } 1804 m = so->so_rcv.sb_mb; 1805 if (m != NULL) 1806 nextrecord = m->m_nextpkt; 1807 } 1808 } 1809 1810 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1811 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 1812 flags |= MSG_TRUNC; 1813 if ((flags & MSG_PEEK) == 0) 1814 (void) sbdroprecord_locked(&so->so_rcv); 1815 } 1816 if ((flags & MSG_PEEK) == 0) { 1817 if (m == NULL) { 1818 /* 1819 * First part is an inline SB_EMPTY_FIXUP(). Second 1820 * part makes sure sb_lastrecord is up-to-date if 1821 * there is still data in the socket buffer. 1822 */ 1823 so->so_rcv.sb_mb = nextrecord; 1824 if (so->so_rcv.sb_mb == NULL) { 1825 so->so_rcv.sb_mbtail = NULL; 1826 so->so_rcv.sb_lastrecord = NULL; 1827 } else if (nextrecord->m_nextpkt == NULL) 1828 so->so_rcv.sb_lastrecord = nextrecord; 1829 } 1830 SBLASTRECORDCHK(&so->so_rcv); 1831 SBLASTMBUFCHK(&so->so_rcv); 1832 /* 1833 * If soreceive() is being done from the socket callback, 1834 * then don't need to generate ACK to peer to update window, 1835 * since ACK will be generated on return to TCP. 1836 */ 1837 if (!(flags & MSG_SOCALLBCK) && 1838 (pr->pr_flags & PR_WANTRCVD)) { 1839 SOCKBUF_UNLOCK(&so->so_rcv); 1840 VNET_SO_ASSERT(so); 1841 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1842 SOCKBUF_LOCK(&so->so_rcv); 1843 } 1844 } 1845 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1846 if (orig_resid == uio->uio_resid && orig_resid && 1847 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1848 SOCKBUF_UNLOCK(&so->so_rcv); 1849 goto restart; 1850 } 1851 SOCKBUF_UNLOCK(&so->so_rcv); 1852 1853 if (flagsp != NULL) 1854 *flagsp |= flags; 1855 release: 1856 sbunlock(&so->so_rcv); 1857 return (error); 1858 } 1859 1860 /* 1861 * Optimized version of soreceive() for stream (TCP) sockets. 
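 *
 * A protocol selects this routine through its pr_usrreqs; a sketch only,
 * as the actual wiring is protocol- and configuration-specific:
 *
 *	struct pr_usrreqs xxx_usrreqs = {
 *		...
 *		.pru_soreceive = soreceive_stream,
 *		...
 *	};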
1862 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled. 1863 */ 1864 int 1865 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, 1866 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1867 { 1868 int len = 0, error = 0, flags, oresid; 1869 struct sockbuf *sb; 1870 struct mbuf *m, *n = NULL; 1871 1872 /* We only do stream sockets. */ 1873 if (so->so_type != SOCK_STREAM) 1874 return (EINVAL); 1875 if (psa != NULL) 1876 *psa = NULL; 1877 if (controlp != NULL) 1878 return (EINVAL); 1879 if (flagsp != NULL) 1880 flags = *flagsp &~ MSG_EOR; 1881 else 1882 flags = 0; 1883 if (flags & MSG_OOB) 1884 return (soreceive_rcvoob(so, uio, flags)); 1885 if (mp0 != NULL) 1886 *mp0 = NULL; 1887 1888 sb = &so->so_rcv; 1889 1890 /* Prevent other readers from entering the socket. */ 1891 error = sblock(sb, SBLOCKWAIT(flags)); 1892 if (error) 1893 goto out; 1894 SOCKBUF_LOCK(sb); 1895 1896 /* Easy one, no space to copyout anything. */ 1897 if (uio->uio_resid == 0) { 1898 error = EINVAL; 1899 goto out; 1900 } 1901 oresid = uio->uio_resid; 1902 1903 /* We will never ever get anything unless we are or were connected. */ 1904 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1905 error = ENOTCONN; 1906 goto out; 1907 } 1908 1909 restart: 1910 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1911 1912 /* Abort if socket has reported problems. */ 1913 if (so->so_error) { 1914 if (sb->sb_cc > 0) 1915 goto deliver; 1916 if (oresid > uio->uio_resid) 1917 goto out; 1918 error = so->so_error; 1919 if (!(flags & MSG_PEEK)) 1920 so->so_error = 0; 1921 goto out; 1922 } 1923 1924 /* Door is closed. Deliver what is left, if any. */ 1925 if (sb->sb_state & SBS_CANTRCVMORE) { 1926 if (sb->sb_cc > 0) 1927 goto deliver; 1928 else 1929 goto out; 1930 } 1931 1932 /* Socket buffer is empty and we shall not block. */ 1933 if (sb->sb_cc == 0 && 1934 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 1935 error = EAGAIN; 1936 goto out; 1937 } 1938 1939 /* Socket buffer got some data that we shall deliver now. */ 1940 if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && 1941 ((sb->sb_flags & SS_NBIO) || 1942 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 1943 sb->sb_cc >= sb->sb_lowat || 1944 sb->sb_cc >= uio->uio_resid || 1945 sb->sb_cc >= sb->sb_hiwat) ) { 1946 goto deliver; 1947 } 1948 1949 /* On MSG_WAITALL we must wait until all data or error arrives. */ 1950 if ((flags & MSG_WAITALL) && 1951 (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat)) 1952 goto deliver; 1953 1954 /* 1955 * Wait and block until (more) data comes in. 1956 * NB: Drops the sockbuf lock during wait. 1957 */ 1958 error = sbwait(sb); 1959 if (error) 1960 goto out; 1961 goto restart; 1962 1963 deliver: 1964 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1965 KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__)); 1966 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 1967 1968 /* Statistics. */ 1969 if (uio->uio_td) 1970 uio->uio_td->td_ru.ru_msgrcv++; 1971 1972 /* Fill uio until full or current end of socket buffer is reached. */ 1973 len = min(uio->uio_resid, sb->sb_cc); 1974 if (mp0 != NULL) { 1975 /* Dequeue as many mbufs as possible. 
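		 * Whole mbufs that fit entirely within 'len' are unlinked
		 * from the buffer and chained onto *mp0; any partial tail is
		 * picked up by the m_copym() call below.  For example, with
		 * len = 3000 and two 2048-byte mbufs queued, the first mbuf
		 * is dequeued here and the remaining 952 bytes are copied.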
*/ 1976 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 1977 if (*mp0 == NULL) 1978 *mp0 = sb->sb_mb; 1979 else 1980 m_cat(*mp0, sb->sb_mb); 1981 for (m = sb->sb_mb; 1982 m != NULL && m->m_len <= len; 1983 m = m->m_next) { 1984 len -= m->m_len; 1985 uio->uio_resid -= m->m_len; 1986 sbfree(sb, m); 1987 n = m; 1988 } 1989 n->m_next = NULL; 1990 sb->sb_mb = m; 1991 sb->sb_lastrecord = sb->sb_mb; 1992 if (sb->sb_mb == NULL) 1993 SB_EMPTY_FIXUP(sb); 1994 } 1995 /* Copy the remainder. */ 1996 if (len > 0) { 1997 KASSERT(sb->sb_mb != NULL, 1998 ("%s: len > 0 && sb->sb_mb empty", __func__)); 1999 2000 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 2001 if (m == NULL) 2002 len = 0; /* Don't flush data from sockbuf. */ 2003 else 2004 uio->uio_resid -= len; 2005 if (*mp0 != NULL) 2006 m_cat(*mp0, m); 2007 else 2008 *mp0 = m; 2009 if (*mp0 == NULL) { 2010 error = ENOBUFS; 2011 goto out; 2012 } 2013 } 2014 } else { 2015 /* NB: Must unlock socket buffer as uiomove may sleep. */ 2016 SOCKBUF_UNLOCK(sb); 2017 error = m_mbuftouio(uio, sb->sb_mb, len); 2018 SOCKBUF_LOCK(sb); 2019 if (error) 2020 goto out; 2021 } 2022 SBLASTRECORDCHK(sb); 2023 SBLASTMBUFCHK(sb); 2024 2025 /* 2026 * Remove the delivered data from the socket buffer unless we 2027 * were only peeking. 2028 */ 2029 if (!(flags & MSG_PEEK)) { 2030 if (len > 0) 2031 sbdrop_locked(sb, len); 2032 2033 /* Notify protocol that we drained some data. */ 2034 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 2035 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 2036 !(flags & MSG_SOCALLBCK))) { 2037 SOCKBUF_UNLOCK(sb); 2038 VNET_SO_ASSERT(so); 2039 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 2040 SOCKBUF_LOCK(sb); 2041 } 2042 } 2043 2044 /* 2045 * For MSG_WAITALL we may have to loop again and wait for 2046 * more data to come in. 2047 */ 2048 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 2049 goto restart; 2050 out: 2051 SOCKBUF_LOCK_ASSERT(sb); 2052 SBLASTRECORDCHK(sb); 2053 SBLASTMBUFCHK(sb); 2054 SOCKBUF_UNLOCK(sb); 2055 sbunlock(sb); 2056 return (error); 2057 } 2058 2059 /* 2060 * Optimized version of soreceive() for simple datagram cases from userspace. 2061 * Unlike in the stream case, we're able to drop a datagram if copyout() 2062 * fails, and because we handle datagrams atomically, we don't need to use a 2063 * sleep lock to prevent I/O interlacing. 2064 */ 2065 int 2066 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, 2067 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2068 { 2069 struct mbuf *m, *m2; 2070 int flags, error; 2071 ssize_t len; 2072 struct protosw *pr = so->so_proto; 2073 struct mbuf *nextrecord; 2074 2075 if (psa != NULL) 2076 *psa = NULL; 2077 if (controlp != NULL) 2078 *controlp = NULL; 2079 if (flagsp != NULL) 2080 flags = *flagsp &~ MSG_EOR; 2081 else 2082 flags = 0; 2083 2084 /* 2085 * For any complicated cases, fall back to the full 2086 * soreceive_generic(). 2087 */ 2088 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) 2089 return (soreceive_generic(so, psa, uio, mp0, controlp, 2090 flagsp)); 2091 2092 /* 2093 * Enforce restrictions on use. 2094 */ 2095 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, 2096 ("soreceive_dgram: wantrcvd")); 2097 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); 2098 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, 2099 ("soreceive_dgram: SBS_RCVATMARK")); 2100 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, 2101 ("soreceive_dgram: P_CONNREQUIRED")); 2102 2103 /* 2104 * Loop blocking while waiting for a datagram. 
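 * The record this loop waits for is queued by the protocol's input path;
 * a hedged sketch of that producer side ('src', 'm' and 'control' are
 * illustrative, and control freeing on failure is elided):
 *
 *	if (sbappendaddr(&so->so_rcv, src, m, control) != 0)
 *		sorwakeup(so);
 *	else
 *		m_freem(m);
 *
 * Non-blocking sockets return EWOULDBLOCK below instead of sleeping in
 * sbwait().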
2105 */ 2106 SOCKBUF_LOCK(&so->so_rcv); 2107 while ((m = so->so_rcv.sb_mb) == NULL) { 2108 KASSERT(so->so_rcv.sb_cc == 0, 2109 ("soreceive_dgram: sb_mb NULL but sb_cc %u", 2110 so->so_rcv.sb_cc)); 2111 if (so->so_error) { 2112 error = so->so_error; 2113 so->so_error = 0; 2114 SOCKBUF_UNLOCK(&so->so_rcv); 2115 return (error); 2116 } 2117 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 2118 uio->uio_resid == 0) { 2119 SOCKBUF_UNLOCK(&so->so_rcv); 2120 return (0); 2121 } 2122 if ((so->so_state & SS_NBIO) || 2123 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2124 SOCKBUF_UNLOCK(&so->so_rcv); 2125 return (EWOULDBLOCK); 2126 } 2127 SBLASTRECORDCHK(&so->so_rcv); 2128 SBLASTMBUFCHK(&so->so_rcv); 2129 error = sbwait(&so->so_rcv); 2130 if (error) { 2131 SOCKBUF_UNLOCK(&so->so_rcv); 2132 return (error); 2133 } 2134 } 2135 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2136 2137 if (uio->uio_td) 2138 uio->uio_td->td_ru.ru_msgrcv++; 2139 SBLASTRECORDCHK(&so->so_rcv); 2140 SBLASTMBUFCHK(&so->so_rcv); 2141 nextrecord = m->m_nextpkt; 2142 if (nextrecord == NULL) { 2143 KASSERT(so->so_rcv.sb_lastrecord == m, 2144 ("soreceive_dgram: lastrecord != m")); 2145 } 2146 2147 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 2148 ("soreceive_dgram: m_nextpkt != nextrecord")); 2149 2150 /* 2151 * Pull 'm' and its chain off the front of the packet queue. 2152 */ 2153 so->so_rcv.sb_mb = NULL; 2154 sockbuf_pushsync(&so->so_rcv, nextrecord); 2155 2156 /* 2157 * Walk 'm's chain and free that many bytes from the socket buffer. 2158 */ 2159 for (m2 = m; m2 != NULL; m2 = m2->m_next) 2160 sbfree(&so->so_rcv, m2); 2161 2162 /* 2163 * Do a few last checks before we let go of the lock. 2164 */ 2165 SBLASTRECORDCHK(&so->so_rcv); 2166 SBLASTMBUFCHK(&so->so_rcv); 2167 SOCKBUF_UNLOCK(&so->so_rcv); 2168 2169 if (pr->pr_flags & PR_ADDR) { 2170 KASSERT(m->m_type == MT_SONAME, 2171 ("m->m_type == %d", m->m_type)); 2172 if (psa != NULL) 2173 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2174 M_NOWAIT); 2175 m = m_free(m); 2176 } 2177 if (m == NULL) { 2178 /* XXXRW: Can this happen? */ 2179 return (0); 2180 } 2181 2182 /* 2183 * Packet to copyout() is now in 'm' and it is disconnected from the 2184 * queue. 2185 * 2186 * Process one or more MT_CONTROL mbufs present before any data mbufs 2187 * in the first mbuf chain on the socket buffer. We call into the 2188 * protocol to perform externalization (or freeing if controlp == 2189 * NULL). 
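 *
 * A hedged sketch of the externalize hook's shape as seen from this
 * caller (for local-domain sockets the real implementation is
 * unp_externalize(); the name and body below are purely illustrative):
 *
 *	int
 *	some_dom_externalize(struct mbuf *control, struct mbuf **controlp,
 *	    int flags)
 *	{
 *		...translate in-kernel rights into their user-visible form,
 *		   queue the result on *controlp, and consume 'control'...
 *		return (error);
 *	}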
2190 */ 2191 if (m->m_type == MT_CONTROL) { 2192 struct mbuf *cm = NULL, *cmn; 2193 struct mbuf **cme = &cm; 2194 2195 do { 2196 m2 = m->m_next; 2197 m->m_next = NULL; 2198 *cme = m; 2199 cme = &(*cme)->m_next; 2200 m = m2; 2201 } while (m != NULL && m->m_type == MT_CONTROL); 2202 while (cm != NULL) { 2203 cmn = cm->m_next; 2204 cm->m_next = NULL; 2205 if (pr->pr_domain->dom_externalize != NULL) { 2206 error = (*pr->pr_domain->dom_externalize) 2207 (cm, controlp, flags); 2208 } else if (controlp != NULL) 2209 *controlp = cm; 2210 else 2211 m_freem(cm); 2212 if (controlp != NULL) { 2213 while (*controlp != NULL) 2214 controlp = &(*controlp)->m_next; 2215 } 2216 cm = cmn; 2217 } 2218 } 2219 KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data")); 2220 2221 while (m != NULL && uio->uio_resid > 0) { 2222 len = uio->uio_resid; 2223 if (len > m->m_len) 2224 len = m->m_len; 2225 error = uiomove(mtod(m, char *), (int)len, uio); 2226 if (error) { 2227 m_freem(m); 2228 return (error); 2229 } 2230 if (len == m->m_len) 2231 m = m_free(m); 2232 else { 2233 m->m_data += len; 2234 m->m_len -= len; 2235 } 2236 } 2237 if (m != NULL) 2238 flags |= MSG_TRUNC; 2239 m_freem(m); 2240 if (flagsp != NULL) 2241 *flagsp |= flags; 2242 return (0); 2243 } 2244 2245 int 2246 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2247 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2248 { 2249 int error; 2250 2251 CURVNET_SET(so->so_vnet); 2252 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, 2253 controlp, flagsp)); 2254 CURVNET_RESTORE(); 2255 return (error); 2256 } 2257 2258 int 2259 soshutdown(struct socket *so, int how) 2260 { 2261 struct protosw *pr = so->so_proto; 2262 int error; 2263 2264 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 2265 return (EINVAL); 2266 2267 CURVNET_SET(so->so_vnet); 2268 if (pr->pr_usrreqs->pru_flush != NULL) 2269 (*pr->pr_usrreqs->pru_flush)(so, how); 2270 if (how != SHUT_WR) 2271 sorflush(so); 2272 if (how != SHUT_RD) { 2273 error = (*pr->pr_usrreqs->pru_shutdown)(so); 2274 wakeup(&so->so_timeo); 2275 CURVNET_RESTORE(); 2276 return (error); 2277 } 2278 wakeup(&so->so_timeo); 2279 CURVNET_RESTORE(); 2280 return (0); 2281 } 2282 2283 void 2284 sorflush(struct socket *so) 2285 { 2286 struct sockbuf *sb = &so->so_rcv; 2287 struct protosw *pr = so->so_proto; 2288 struct sockbuf asb; 2289 2290 VNET_SO_ASSERT(so); 2291 2292 /* 2293 * In order to avoid calling dom_dispose with the socket buffer mutex 2294 * held, and in order to generally avoid holding the lock for a long 2295 * time, we make a copy of the socket buffer and clear the original 2296 * (except locks, state). The new socket buffer copy won't have 2297 * initialized locks so we can only call routines that won't use or 2298 * assert those locks. 2299 * 2300 * Dislodge threads currently blocked in receive and wait to acquire 2301 * a lock against other simultaneous readers before clearing the 2302 * socket buffer. Don't let our acquire be interrupted by a signal 2303 * despite any existing socket disposition on interruptable waiting. 2304 */ 2305 socantrcvmore(so); 2306 (void) sblock(sb, SBL_WAIT | SBL_NOINTR); 2307 2308 /* 2309 * Invalidate/clear most of the sockbuf structure, but leave selinfo 2310 * and mutex data unchanged. 
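 *
 * (A hedged picture of the layout relied on below: sb_startzero is defined
 * in the sockbuf header as the first field eligible for scrubbing, so the
 * bzero/bcopy pairs preserve everything declared before it, namely the
 * select info and the lock, and snapshot and then clear everything from
 * sb_startzero onward:
 *
 *	bytes [0, offsetof(struct sockbuf, sb_startzero)):  left intact
 *	bytes [offsetof(struct sockbuf, sb_startzero), sizeof(*sb)):
 *		copied into asb, then zeroed in place.)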
2311 */ 2312 SOCKBUF_LOCK(sb); 2313 bzero(&asb, offsetof(struct sockbuf, sb_startzero)); 2314 bcopy(&sb->sb_startzero, &asb.sb_startzero, 2315 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2316 bzero(&sb->sb_startzero, 2317 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2318 SOCKBUF_UNLOCK(sb); 2319 sbunlock(sb); 2320 2321 /* 2322 * Dispose of special rights and flush the socket buffer. Don't call 2323 * any unsafe routines (that rely on locks being initialized) on asb. 2324 */ 2325 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 2326 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 2327 sbrelease_internal(&asb, so); 2328 } 2329 2330 /* 2331 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 2332 * additional variant to handle the case where the option value needs to be 2333 * some kind of integer, but not a specific size. In addition to their use 2334 * here, these functions are also called by the protocol-level pr_ctloutput() 2335 * routines. 2336 */ 2337 int 2338 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 2339 { 2340 size_t valsize; 2341 2342 /* 2343 * If the user gives us more than we wanted, we ignore it, but if we 2344 * don't get the minimum length the caller wants, we return EINVAL. 2345 * On success, sopt->sopt_valsize is set to however much we actually 2346 * retrieved. 2347 */ 2348 if ((valsize = sopt->sopt_valsize) < minlen) 2349 return EINVAL; 2350 if (valsize > len) 2351 sopt->sopt_valsize = valsize = len; 2352 2353 if (sopt->sopt_td != NULL) 2354 return (copyin(sopt->sopt_val, buf, valsize)); 2355 2356 bcopy(sopt->sopt_val, buf, valsize); 2357 return (0); 2358 } 2359 2360 /* 2361 * Kernel version of setsockopt(2). 2362 * 2363 * XXX: optlen is size_t, not socklen_t 2364 */ 2365 int 2366 so_setsockopt(struct socket *so, int level, int optname, void *optval, 2367 size_t optlen) 2368 { 2369 struct sockopt sopt; 2370 2371 sopt.sopt_level = level; 2372 sopt.sopt_name = optname; 2373 sopt.sopt_dir = SOPT_SET; 2374 sopt.sopt_val = optval; 2375 sopt.sopt_valsize = optlen; 2376 sopt.sopt_td = NULL; 2377 return (sosetopt(so, &sopt)); 2378 } 2379 2380 int 2381 sosetopt(struct socket *so, struct sockopt *sopt) 2382 { 2383 int error, optval; 2384 struct linger l; 2385 struct timeval tv; 2386 sbintime_t val; 2387 uint32_t val32; 2388 #ifdef MAC 2389 struct mac extmac; 2390 #endif 2391 2392 CURVNET_SET(so->so_vnet); 2393 error = 0; 2394 if (sopt->sopt_level != SOL_SOCKET) { 2395 if (so->so_proto->pr_ctloutput != NULL) { 2396 error = (*so->so_proto->pr_ctloutput)(so, sopt); 2397 CURVNET_RESTORE(); 2398 return (error); 2399 } 2400 error = ENOPROTOOPT; 2401 } else { 2402 switch (sopt->sopt_name) { 2403 case SO_ACCEPTFILTER: 2404 error = do_setopt_accept_filter(so, sopt); 2405 if (error) 2406 goto bad; 2407 break; 2408 2409 case SO_LINGER: 2410 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 2411 if (error) 2412 goto bad; 2413 2414 SOCK_LOCK(so); 2415 so->so_linger = l.l_linger; 2416 if (l.l_onoff) 2417 so->so_options |= SO_LINGER; 2418 else 2419 so->so_options &= ~SO_LINGER; 2420 SOCK_UNLOCK(so); 2421 break; 2422 2423 case SO_DEBUG: 2424 case SO_KEEPALIVE: 2425 case SO_DONTROUTE: 2426 case SO_USELOOPBACK: 2427 case SO_BROADCAST: 2428 case SO_REUSEADDR: 2429 case SO_REUSEPORT: 2430 case SO_OOBINLINE: 2431 case SO_TIMESTAMP: 2432 case SO_BINTIME: 2433 case SO_NOSIGPIPE: 2434 case SO_NO_DDP: 2435 case SO_NO_OFFLOAD: 2436 error = sooptcopyin(sopt, &optval, sizeof optval, 2437 sizeof optval); 2438 if (error) 2439 goto 
bad; 2440 SOCK_LOCK(so); 2441 if (optval) 2442 so->so_options |= sopt->sopt_name; 2443 else 2444 so->so_options &= ~sopt->sopt_name; 2445 SOCK_UNLOCK(so); 2446 break; 2447 2448 case SO_SETFIB: 2449 error = sooptcopyin(sopt, &optval, sizeof optval, 2450 sizeof optval); 2451 if (error) 2452 goto bad; 2453 2454 if (optval < 0 || optval >= rt_numfibs) { 2455 error = EINVAL; 2456 goto bad; 2457 } 2458 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 2459 (so->so_proto->pr_domain->dom_family == PF_INET6) || 2460 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 2461 so->so_fibnum = optval; 2462 else 2463 so->so_fibnum = 0; 2464 break; 2465 2466 case SO_USER_COOKIE: 2467 error = sooptcopyin(sopt, &val32, sizeof val32, 2468 sizeof val32); 2469 if (error) 2470 goto bad; 2471 so->so_user_cookie = val32; 2472 break; 2473 2474 case SO_SNDBUF: 2475 case SO_RCVBUF: 2476 case SO_SNDLOWAT: 2477 case SO_RCVLOWAT: 2478 error = sooptcopyin(sopt, &optval, sizeof optval, 2479 sizeof optval); 2480 if (error) 2481 goto bad; 2482 2483 /* 2484 * Values < 1 make no sense for any of these options, 2485 * so disallow them. 2486 */ 2487 if (optval < 1) { 2488 error = EINVAL; 2489 goto bad; 2490 } 2491 2492 switch (sopt->sopt_name) { 2493 case SO_SNDBUF: 2494 case SO_RCVBUF: 2495 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 2496 &so->so_snd : &so->so_rcv, (u_long)optval, 2497 so, curthread) == 0) { 2498 error = ENOBUFS; 2499 goto bad; 2500 } 2501 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : 2502 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE; 2503 break; 2504 2505 /* 2506 * Make sure the low-water is never greater than the 2507 * high-water. 2508 */ 2509 case SO_SNDLOWAT: 2510 SOCKBUF_LOCK(&so->so_snd); 2511 so->so_snd.sb_lowat = 2512 (optval > so->so_snd.sb_hiwat) ? 2513 so->so_snd.sb_hiwat : optval; 2514 SOCKBUF_UNLOCK(&so->so_snd); 2515 break; 2516 case SO_RCVLOWAT: 2517 SOCKBUF_LOCK(&so->so_rcv); 2518 so->so_rcv.sb_lowat = 2519 (optval > so->so_rcv.sb_hiwat) ? 2520 so->so_rcv.sb_hiwat : optval; 2521 SOCKBUF_UNLOCK(&so->so_rcv); 2522 break; 2523 } 2524 break; 2525 2526 case SO_SNDTIMEO: 2527 case SO_RCVTIMEO: 2528 #ifdef COMPAT_FREEBSD32 2529 if (SV_CURPROC_FLAG(SV_ILP32)) { 2530 struct timeval32 tv32; 2531 2532 error = sooptcopyin(sopt, &tv32, sizeof tv32, 2533 sizeof tv32); 2534 CP(tv32, tv, tv_sec); 2535 CP(tv32, tv, tv_usec); 2536 } else 2537 #endif 2538 error = sooptcopyin(sopt, &tv, sizeof tv, 2539 sizeof tv); 2540 if (error) 2541 goto bad; 2542 if (tv.tv_sec < 0 || tv.tv_usec < 0 || 2543 tv.tv_usec >= 1000000) { 2544 error = EDOM; 2545 goto bad; 2546 } 2547 if (tv.tv_sec > INT32_MAX) 2548 val = SBT_MAX; 2549 else 2550 val = tvtosbt(tv); 2551 switch (sopt->sopt_name) { 2552 case SO_SNDTIMEO: 2553 so->so_snd.sb_timeo = val; 2554 break; 2555 case SO_RCVTIMEO: 2556 so->so_rcv.sb_timeo = val; 2557 break; 2558 } 2559 break; 2560 2561 case SO_LABEL: 2562 #ifdef MAC 2563 error = sooptcopyin(sopt, &extmac, sizeof extmac, 2564 sizeof extmac); 2565 if (error) 2566 goto bad; 2567 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 2568 so, &extmac); 2569 #else 2570 error = EOPNOTSUPP; 2571 #endif 2572 break; 2573 2574 default: 2575 error = ENOPROTOOPT; 2576 break; 2577 } 2578 if (error == 0 && so->so_proto->pr_ctloutput != NULL) 2579 (void)(*so->so_proto->pr_ctloutput)(so, sopt); 2580 } 2581 bad: 2582 CURVNET_RESTORE(); 2583 return (error); 2584 } 2585 2586 /* 2587 * Helper routine for getsockopt. 
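 *
 * A hedged sketch of the usual calling pattern from a protocol's
 * pr_ctloutput() routine (variable names are illustrative):
 *
 *	int optval;
 *
 *	on the set side:
 *		error = sooptcopyin(sopt, &optval, sizeof(optval),
 *		    sizeof(optval));
 *		if (error)
 *			return (error);
 *		...validate and apply optval...
 *
 *	on the get side:
 *		optval = ...current value...;
 *		error = sooptcopyout(sopt, &optval, sizeof(optval));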
2588 */ 2589 int 2590 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 2591 { 2592 int error; 2593 size_t valsize; 2594 2595 error = 0; 2596 2597 /* 2598 * Documented get behavior is that we always return a value, possibly 2599 * truncated to fit in the user's buffer. Traditional behavior is 2600 * that we always tell the user precisely how much we copied, rather 2601 * than something useful like the total amount we had available for 2602 * her. Note that this interface is not idempotent; the entire 2603 * answer must generated ahead of time. 2604 */ 2605 valsize = min(len, sopt->sopt_valsize); 2606 sopt->sopt_valsize = valsize; 2607 if (sopt->sopt_val != NULL) { 2608 if (sopt->sopt_td != NULL) 2609 error = copyout(buf, sopt->sopt_val, valsize); 2610 else 2611 bcopy(buf, sopt->sopt_val, valsize); 2612 } 2613 return (error); 2614 } 2615 2616 int 2617 sogetopt(struct socket *so, struct sockopt *sopt) 2618 { 2619 int error, optval; 2620 struct linger l; 2621 struct timeval tv; 2622 #ifdef MAC 2623 struct mac extmac; 2624 #endif 2625 2626 CURVNET_SET(so->so_vnet); 2627 error = 0; 2628 if (sopt->sopt_level != SOL_SOCKET) { 2629 if (so->so_proto->pr_ctloutput != NULL) 2630 error = (*so->so_proto->pr_ctloutput)(so, sopt); 2631 else 2632 error = ENOPROTOOPT; 2633 CURVNET_RESTORE(); 2634 return (error); 2635 } else { 2636 switch (sopt->sopt_name) { 2637 case SO_ACCEPTFILTER: 2638 error = do_getopt_accept_filter(so, sopt); 2639 break; 2640 2641 case SO_LINGER: 2642 SOCK_LOCK(so); 2643 l.l_onoff = so->so_options & SO_LINGER; 2644 l.l_linger = so->so_linger; 2645 SOCK_UNLOCK(so); 2646 error = sooptcopyout(sopt, &l, sizeof l); 2647 break; 2648 2649 case SO_USELOOPBACK: 2650 case SO_DONTROUTE: 2651 case SO_DEBUG: 2652 case SO_KEEPALIVE: 2653 case SO_REUSEADDR: 2654 case SO_REUSEPORT: 2655 case SO_BROADCAST: 2656 case SO_OOBINLINE: 2657 case SO_ACCEPTCONN: 2658 case SO_TIMESTAMP: 2659 case SO_BINTIME: 2660 case SO_NOSIGPIPE: 2661 optval = so->so_options & sopt->sopt_name; 2662 integer: 2663 error = sooptcopyout(sopt, &optval, sizeof optval); 2664 break; 2665 2666 case SO_TYPE: 2667 optval = so->so_type; 2668 goto integer; 2669 2670 case SO_PROTOCOL: 2671 optval = so->so_proto->pr_protocol; 2672 goto integer; 2673 2674 case SO_ERROR: 2675 SOCK_LOCK(so); 2676 optval = so->so_error; 2677 so->so_error = 0; 2678 SOCK_UNLOCK(so); 2679 goto integer; 2680 2681 case SO_SNDBUF: 2682 optval = so->so_snd.sb_hiwat; 2683 goto integer; 2684 2685 case SO_RCVBUF: 2686 optval = so->so_rcv.sb_hiwat; 2687 goto integer; 2688 2689 case SO_SNDLOWAT: 2690 optval = so->so_snd.sb_lowat; 2691 goto integer; 2692 2693 case SO_RCVLOWAT: 2694 optval = so->so_rcv.sb_lowat; 2695 goto integer; 2696 2697 case SO_SNDTIMEO: 2698 case SO_RCVTIMEO: 2699 tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? 
2700 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 2701 #ifdef COMPAT_FREEBSD32 2702 if (SV_CURPROC_FLAG(SV_ILP32)) { 2703 struct timeval32 tv32; 2704 2705 CP(tv, tv32, tv_sec); 2706 CP(tv, tv32, tv_usec); 2707 error = sooptcopyout(sopt, &tv32, sizeof tv32); 2708 } else 2709 #endif 2710 error = sooptcopyout(sopt, &tv, sizeof tv); 2711 break; 2712 2713 case SO_LABEL: 2714 #ifdef MAC 2715 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2716 sizeof(extmac)); 2717 if (error) 2718 goto bad; 2719 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 2720 so, &extmac); 2721 if (error) 2722 goto bad; 2723 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2724 #else 2725 error = EOPNOTSUPP; 2726 #endif 2727 break; 2728 2729 case SO_PEERLABEL: 2730 #ifdef MAC 2731 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2732 sizeof(extmac)); 2733 if (error) 2734 goto bad; 2735 error = mac_getsockopt_peerlabel( 2736 sopt->sopt_td->td_ucred, so, &extmac); 2737 if (error) 2738 goto bad; 2739 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2740 #else 2741 error = EOPNOTSUPP; 2742 #endif 2743 break; 2744 2745 case SO_LISTENQLIMIT: 2746 optval = so->so_qlimit; 2747 goto integer; 2748 2749 case SO_LISTENQLEN: 2750 optval = so->so_qlen; 2751 goto integer; 2752 2753 case SO_LISTENINCQLEN: 2754 optval = so->so_incqlen; 2755 goto integer; 2756 2757 default: 2758 error = ENOPROTOOPT; 2759 break; 2760 } 2761 } 2762 #ifdef MAC 2763 bad: 2764 #endif 2765 CURVNET_RESTORE(); 2766 return (error); 2767 } 2768 2769 int 2770 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 2771 { 2772 struct mbuf *m, *m_prev; 2773 int sopt_size = sopt->sopt_valsize; 2774 2775 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 2776 if (m == NULL) 2777 return ENOBUFS; 2778 if (sopt_size > MLEN) { 2779 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); 2780 if ((m->m_flags & M_EXT) == 0) { 2781 m_free(m); 2782 return ENOBUFS; 2783 } 2784 m->m_len = min(MCLBYTES, sopt_size); 2785 } else { 2786 m->m_len = min(MLEN, sopt_size); 2787 } 2788 sopt_size -= m->m_len; 2789 *mp = m; 2790 m_prev = m; 2791 2792 while (sopt_size) { 2793 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); 2794 if (m == NULL) { 2795 m_freem(*mp); 2796 return ENOBUFS; 2797 } 2798 if (sopt_size > MLEN) { 2799 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAITOK : 2800 M_NOWAIT); 2801 if ((m->m_flags & M_EXT) == 0) { 2802 m_freem(m); 2803 m_freem(*mp); 2804 return ENOBUFS; 2805 } 2806 m->m_len = min(MCLBYTES, sopt_size); 2807 } else { 2808 m->m_len = min(MLEN, sopt_size); 2809 } 2810 sopt_size -= m->m_len; 2811 m_prev->m_next = m; 2812 m_prev = m; 2813 } 2814 return (0); 2815 } 2816 2817 int 2818 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 2819 { 2820 struct mbuf *m0 = m; 2821 2822 if (sopt->sopt_val == NULL) 2823 return (0); 2824 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2825 if (sopt->sopt_td != NULL) { 2826 int error; 2827 2828 error = copyin(sopt->sopt_val, mtod(m, char *), 2829 m->m_len); 2830 if (error != 0) { 2831 m_freem(m0); 2832 return(error); 2833 } 2834 } else 2835 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 2836 sopt->sopt_valsize -= m->m_len; 2837 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2838 m = m->m_next; 2839 } 2840 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 2841 panic("ip6_sooptmcopyin"); 2842 return (0); 2843 } 2844 2845 int 2846 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 2847 { 2848 struct mbuf *m0 = m; 2849 size_t valsize = 0; 2850 2851 if (sopt->sopt_val == NULL) 2852 return (0); 2853 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2854 if (sopt->sopt_td != NULL) { 2855 int error; 2856 2857 error = copyout(mtod(m, char *), sopt->sopt_val, 2858 m->m_len); 2859 if (error != 0) { 2860 m_freem(m0); 2861 return(error); 2862 } 2863 } else 2864 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 2865 sopt->sopt_valsize -= m->m_len; 2866 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2867 valsize += m->m_len; 2868 m = m->m_next; 2869 } 2870 if (m != NULL) { 2871 /* enough soopt buffer should be given from user-land */ 2872 m_freem(m0); 2873 return(EINVAL); 2874 } 2875 sopt->sopt_valsize = valsize; 2876 return (0); 2877 } 2878 2879 /* 2880 * sohasoutofband(): protocol notifies socket layer of the arrival of new 2881 * out-of-band data, which will then notify socket consumers. 2882 */ 2883 void 2884 sohasoutofband(struct socket *so) 2885 { 2886 2887 if (so->so_sigio != NULL) 2888 pgsigio(&so->so_sigio, SIGURG, 0); 2889 selwakeuppri(&so->so_rcv.sb_sel, PSOCK); 2890 } 2891 2892 int 2893 sopoll(struct socket *so, int events, struct ucred *active_cred, 2894 struct thread *td) 2895 { 2896 2897 /* 2898 * We do not need to set or assert curvnet as long as everyone uses 2899 * sopoll_generic(). 
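 *
 * A hedged illustration of that convention as it appears in a protocol's
 * pr_usrreqs initialization (the assignment lives in the protocol's own
 * code, not here):
 *
 *	.pru_sopoll =		sopoll_generic,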
2900 */ 2901 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, 2902 td)); 2903 } 2904 2905 int 2906 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 2907 struct thread *td) 2908 { 2909 int revents = 0; 2910 2911 SOCKBUF_LOCK(&so->so_snd); 2912 SOCKBUF_LOCK(&so->so_rcv); 2913 if (events & (POLLIN | POLLRDNORM)) 2914 if (soreadabledata(so)) 2915 revents |= events & (POLLIN | POLLRDNORM); 2916 2917 if (events & (POLLOUT | POLLWRNORM)) 2918 if (sowriteable(so)) 2919 revents |= events & (POLLOUT | POLLWRNORM); 2920 2921 if (events & (POLLPRI | POLLRDBAND)) 2922 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) 2923 revents |= events & (POLLPRI | POLLRDBAND); 2924 2925 if ((events & POLLINIGNEOF) == 0) { 2926 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2927 revents |= events & (POLLIN | POLLRDNORM); 2928 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 2929 revents |= POLLHUP; 2930 } 2931 } 2932 2933 if (revents == 0) { 2934 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 2935 selrecord(td, &so->so_rcv.sb_sel); 2936 so->so_rcv.sb_flags |= SB_SEL; 2937 } 2938 2939 if (events & (POLLOUT | POLLWRNORM)) { 2940 selrecord(td, &so->so_snd.sb_sel); 2941 so->so_snd.sb_flags |= SB_SEL; 2942 } 2943 } 2944 2945 SOCKBUF_UNLOCK(&so->so_rcv); 2946 SOCKBUF_UNLOCK(&so->so_snd); 2947 return (revents); 2948 } 2949 2950 int 2951 soo_kqfilter(struct file *fp, struct knote *kn) 2952 { 2953 struct socket *so = kn->kn_fp->f_data; 2954 struct sockbuf *sb; 2955 2956 switch (kn->kn_filter) { 2957 case EVFILT_READ: 2958 if (so->so_options & SO_ACCEPTCONN) 2959 kn->kn_fop = &solisten_filtops; 2960 else 2961 kn->kn_fop = &soread_filtops; 2962 sb = &so->so_rcv; 2963 break; 2964 case EVFILT_WRITE: 2965 kn->kn_fop = &sowrite_filtops; 2966 sb = &so->so_snd; 2967 break; 2968 default: 2969 return (EINVAL); 2970 } 2971 2972 SOCKBUF_LOCK(sb); 2973 knlist_add(&sb->sb_sel.si_note, kn, 1); 2974 sb->sb_flags |= SB_KNOTE; 2975 SOCKBUF_UNLOCK(sb); 2976 return (0); 2977 } 2978 2979 /* 2980 * Some routines that return EOPNOTSUPP for entry points that are not 2981 * supported by a protocol. Fill in as needed. 
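 *
 * A hedged illustration of how a minimal protocol might plug these stubs
 * into its pr_usrreqs vector (the structure and the foo_* handlers are
 * purely illustrative):
 *
 *	static struct pr_usrreqs foo_usrreqs = {
 *		.pru_attach =		foo_attach,
 *		.pru_detach =		foo_detach,
 *		.pru_send =		foo_send,
 *		.pru_connect2 =		pru_connect2_notsupp,
 *		.pru_listen =		pru_listen_notsupp,
 *		.pru_rcvoob =		pru_rcvoob_notsupp,
 *	};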
2982 */ 2983 int 2984 pru_accept_notsupp(struct socket *so, struct sockaddr **nam) 2985 { 2986 2987 return EOPNOTSUPP; 2988 } 2989 2990 int 2991 pru_attach_notsupp(struct socket *so, int proto, struct thread *td) 2992 { 2993 2994 return EOPNOTSUPP; 2995 } 2996 2997 int 2998 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 2999 { 3000 3001 return EOPNOTSUPP; 3002 } 3003 3004 int 3005 pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3006 struct thread *td) 3007 { 3008 3009 return EOPNOTSUPP; 3010 } 3011 3012 int 3013 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3014 { 3015 3016 return EOPNOTSUPP; 3017 } 3018 3019 int 3020 pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, 3021 struct thread *td) 3022 { 3023 3024 return EOPNOTSUPP; 3025 } 3026 3027 int 3028 pru_connect2_notsupp(struct socket *so1, struct socket *so2) 3029 { 3030 3031 return EOPNOTSUPP; 3032 } 3033 3034 int 3035 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, 3036 struct ifnet *ifp, struct thread *td) 3037 { 3038 3039 return EOPNOTSUPP; 3040 } 3041 3042 int 3043 pru_disconnect_notsupp(struct socket *so) 3044 { 3045 3046 return EOPNOTSUPP; 3047 } 3048 3049 int 3050 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) 3051 { 3052 3053 return EOPNOTSUPP; 3054 } 3055 3056 int 3057 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) 3058 { 3059 3060 return EOPNOTSUPP; 3061 } 3062 3063 int 3064 pru_rcvd_notsupp(struct socket *so, int flags) 3065 { 3066 3067 return EOPNOTSUPP; 3068 } 3069 3070 int 3071 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) 3072 { 3073 3074 return EOPNOTSUPP; 3075 } 3076 3077 int 3078 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, 3079 struct sockaddr *addr, struct mbuf *control, struct thread *td) 3080 { 3081 3082 return EOPNOTSUPP; 3083 } 3084 3085 /* 3086 * This isn't really a ``null'' operation, but it's the default one and 3087 * doesn't do anything destructive. 
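 *
 * pru_sense_null() backs fstat(2) on a socket when the protocol supplies
 * nothing better; the only field it fills in is st_blksize, taken from the
 * send buffer's high-water mark.  A hedged userland view (illustrative
 * only):
 *
 *	struct stat st;
 *
 *	if (fstat(s, &st) == 0)
 *		use st.st_blksize as a suggested write size;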
3088 */ 3089 int 3090 pru_sense_null(struct socket *so, struct stat *sb) 3091 { 3092 3093 sb->st_blksize = so->so_snd.sb_hiwat; 3094 return 0; 3095 } 3096 3097 int 3098 pru_shutdown_notsupp(struct socket *so) 3099 { 3100 3101 return EOPNOTSUPP; 3102 } 3103 3104 int 3105 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) 3106 { 3107 3108 return EOPNOTSUPP; 3109 } 3110 3111 int 3112 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, 3113 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 3114 { 3115 3116 return EOPNOTSUPP; 3117 } 3118 3119 int 3120 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, 3121 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3122 { 3123 3124 return EOPNOTSUPP; 3125 } 3126 3127 int 3128 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, 3129 struct thread *td) 3130 { 3131 3132 return EOPNOTSUPP; 3133 } 3134 3135 static void 3136 filt_sordetach(struct knote *kn) 3137 { 3138 struct socket *so = kn->kn_fp->f_data; 3139 3140 SOCKBUF_LOCK(&so->so_rcv); 3141 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); 3142 if (knlist_empty(&so->so_rcv.sb_sel.si_note)) 3143 so->so_rcv.sb_flags &= ~SB_KNOTE; 3144 SOCKBUF_UNLOCK(&so->so_rcv); 3145 } 3146 3147 /*ARGSUSED*/ 3148 static int 3149 filt_soread(struct knote *kn, long hint) 3150 { 3151 struct socket *so; 3152 3153 so = kn->kn_fp->f_data; 3154 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3155 3156 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 3157 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3158 kn->kn_flags |= EV_EOF; 3159 kn->kn_fflags = so->so_error; 3160 return (1); 3161 } else if (so->so_error) /* temporary udp error */ 3162 return (1); 3163 else if (kn->kn_sfflags & NOTE_LOWAT) 3164 return (kn->kn_data >= kn->kn_sdata); 3165 else 3166 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); 3167 } 3168 3169 static void 3170 filt_sowdetach(struct knote *kn) 3171 { 3172 struct socket *so = kn->kn_fp->f_data; 3173 3174 SOCKBUF_LOCK(&so->so_snd); 3175 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); 3176 if (knlist_empty(&so->so_snd.sb_sel.si_note)) 3177 so->so_snd.sb_flags &= ~SB_KNOTE; 3178 SOCKBUF_UNLOCK(&so->so_snd); 3179 } 3180 3181 /*ARGSUSED*/ 3182 static int 3183 filt_sowrite(struct knote *kn, long hint) 3184 { 3185 struct socket *so; 3186 3187 so = kn->kn_fp->f_data; 3188 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3189 kn->kn_data = sbspace(&so->so_snd); 3190 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3191 kn->kn_flags |= EV_EOF; 3192 kn->kn_fflags = so->so_error; 3193 return (1); 3194 } else if (so->so_error) /* temporary udp error */ 3195 return (1); 3196 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3197 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3198 return (0); 3199 else if (kn->kn_sfflags & NOTE_LOWAT) 3200 return (kn->kn_data >= kn->kn_sdata); 3201 else 3202 return (kn->kn_data >= so->so_snd.sb_lowat); 3203 } 3204 3205 /*ARGSUSED*/ 3206 static int 3207 filt_solisten(struct knote *kn, long hint) 3208 { 3209 struct socket *so = kn->kn_fp->f_data; 3210 3211 kn->kn_data = so->so_qlen; 3212 return (!TAILQ_EMPTY(&so->so_comp)); 3213 } 3214 3215 int 3216 socheckuid(struct socket *so, uid_t uid) 3217 { 3218 3219 if (so == NULL) 3220 return (EPERM); 3221 if (so->so_cred->cr_uid != uid) 3222 return (EPERM); 3223 return (0); 3224 } 3225 3226 /* 3227 * These functions are used by protocols to notify the socket layer (and its 3228 * consumers) of state changes in the sockets driven by protocol-side events. 
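 *
 * A hedged sketch of the order in which a connection-oriented protocol
 * typically invokes them over a connection's lifetime:
 *
 *	soisconnecting(so);		connect() is being processed
 *	soisconnected(so);		handshake completed
 *	soisdisconnecting(so);		disconnect() in progress
 *	soisdisconnected(so);		connection fully torn down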
3229 */ 3230 3231 /* 3232 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3233 * 3234 * Normal sequence from the active (originating) side is that 3235 * soisconnecting() is called during processing of connect() call, resulting 3236 * in an eventual call to soisconnected() if/when the connection is 3237 * established. When the connection is torn down soisdisconnecting() is 3238 * called during processing of disconnect() call, and soisdisconnected() is 3239 * called when the connection to the peer is totally severed. The semantics 3240 * of these routines are such that connectionless protocols can call 3241 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3242 * calls when setting up a ``connection'' takes no time. 3243 * 3244 * From the passive side, a socket is created with two queues of sockets: 3245 * so_incomp for connections in progress and so_comp for connections already 3246 * made and awaiting user acceptance. As a protocol is preparing incoming 3247 * connections, it creates a socket structure queued on so_incomp by calling 3248 * sonewconn(). When the connection is established, soisconnected() is 3249 * called, and transfers the socket structure to so_comp, making it available 3250 * to accept(). 3251 * 3252 * If a socket is closed with sockets on either so_incomp or so_comp, these 3253 * sockets are dropped. 3254 * 3255 * If higher-level protocols are implemented in the kernel, the wakeups done 3256 * here will sometimes cause software-interrupt process scheduling. 3257 */ 3258 void 3259 soisconnecting(struct socket *so) 3260 { 3261 3262 SOCK_LOCK(so); 3263 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3264 so->so_state |= SS_ISCONNECTING; 3265 SOCK_UNLOCK(so); 3266 } 3267 3268 void 3269 soisconnected(struct socket *so) 3270 { 3271 struct socket *head; 3272 int ret; 3273 3274 restart: 3275 ACCEPT_LOCK(); 3276 SOCK_LOCK(so); 3277 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3278 so->so_state |= SS_ISCONNECTED; 3279 head = so->so_head; 3280 if (head != NULL && (so->so_qstate & SQ_INCOMP)) { 3281 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3282 SOCK_UNLOCK(so); 3283 TAILQ_REMOVE(&head->so_incomp, so, so_list); 3284 head->so_incqlen--; 3285 so->so_qstate &= ~SQ_INCOMP; 3286 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 3287 head->so_qlen++; 3288 so->so_qstate |= SQ_COMP; 3289 ACCEPT_UNLOCK(); 3290 sorwakeup(head); 3291 wakeup_one(&head->so_timeo); 3292 } else { 3293 ACCEPT_UNLOCK(); 3294 soupcall_set(so, SO_RCV, 3295 head->so_accf->so_accept_filter->accf_callback, 3296 head->so_accf->so_accept_filter_arg); 3297 so->so_options &= ~SO_ACCEPTFILTER; 3298 ret = head->so_accf->so_accept_filter->accf_callback(so, 3299 head->so_accf->so_accept_filter_arg, M_NOWAIT); 3300 if (ret == SU_ISCONNECTED) 3301 soupcall_clear(so, SO_RCV); 3302 SOCK_UNLOCK(so); 3303 if (ret == SU_ISCONNECTED) 3304 goto restart; 3305 } 3306 return; 3307 } 3308 SOCK_UNLOCK(so); 3309 ACCEPT_UNLOCK(); 3310 wakeup(&so->so_timeo); 3311 sorwakeup(so); 3312 sowwakeup(so); 3313 } 3314 3315 void 3316 soisdisconnecting(struct socket *so) 3317 { 3318 3319 /* 3320 * Note: This code assumes that SOCK_LOCK(so) and 3321 * SOCKBUF_LOCK(&so->so_rcv) are the same. 
3322 */ 3323 SOCKBUF_LOCK(&so->so_rcv); 3324 so->so_state &= ~SS_ISCONNECTING; 3325 so->so_state |= SS_ISDISCONNECTING; 3326 so->so_rcv.sb_state |= SBS_CANTRCVMORE; 3327 sorwakeup_locked(so); 3328 SOCKBUF_LOCK(&so->so_snd); 3329 so->so_snd.sb_state |= SBS_CANTSENDMORE; 3330 sowwakeup_locked(so); 3331 wakeup(&so->so_timeo); 3332 } 3333 3334 void 3335 soisdisconnected(struct socket *so) 3336 { 3337 3338 /* 3339 * Note: This code assumes that SOCK_LOCK(so) and 3340 * SOCKBUF_LOCK(&so->so_rcv) are the same. 3341 */ 3342 SOCKBUF_LOCK(&so->so_rcv); 3343 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3344 so->so_state |= SS_ISDISCONNECTED; 3345 so->so_rcv.sb_state |= SBS_CANTRCVMORE; 3346 sorwakeup_locked(so); 3347 SOCKBUF_LOCK(&so->so_snd); 3348 so->so_snd.sb_state |= SBS_CANTSENDMORE; 3349 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc); 3350 sowwakeup_locked(so); 3351 wakeup(&so->so_timeo); 3352 } 3353 3354 /* 3355 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3356 */ 3357 struct sockaddr * 3358 sodupsockaddr(const struct sockaddr *sa, int mflags) 3359 { 3360 struct sockaddr *sa2; 3361 3362 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3363 if (sa2) 3364 bcopy(sa, sa2, sa->sa_len); 3365 return sa2; 3366 } 3367 3368 /* 3369 * Register per-socket buffer upcalls. 3370 */ 3371 void 3372 soupcall_set(struct socket *so, int which, 3373 int (*func)(struct socket *, void *, int), void *arg) 3374 { 3375 struct sockbuf *sb; 3376 3377 switch (which) { 3378 case SO_RCV: 3379 sb = &so->so_rcv; 3380 break; 3381 case SO_SND: 3382 sb = &so->so_snd; 3383 break; 3384 default: 3385 panic("soupcall_set: bad which"); 3386 } 3387 SOCKBUF_LOCK_ASSERT(sb); 3388 #if 0 3389 /* XXX: accf_http actually wants to do this on purpose. */ 3390 KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall")); 3391 #endif 3392 sb->sb_upcall = func; 3393 sb->sb_upcallarg = arg; 3394 sb->sb_flags |= SB_UPCALL; 3395 } 3396 3397 void 3398 soupcall_clear(struct socket *so, int which) 3399 { 3400 struct sockbuf *sb; 3401 3402 switch (which) { 3403 case SO_RCV: 3404 sb = &so->so_rcv; 3405 break; 3406 case SO_SND: 3407 sb = &so->so_snd; 3408 break; 3409 default: 3410 panic("soupcall_clear: bad which"); 3411 } 3412 SOCKBUF_LOCK_ASSERT(sb); 3413 KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); 3414 sb->sb_upcall = NULL; 3415 sb->sb_upcallarg = NULL; 3416 sb->sb_flags &= ~SB_UPCALL; 3417 } 3418 3419 /* 3420 * Create an external-format (``xsocket'') structure using the information in 3421 * the kernel-format socket structure pointed to by so. This is done to 3422 * reduce the spew of irrelevant information over this interface, to isolate 3423 * user code from changes in the kernel structure, and potentially to provide 3424 * information-hiding if we decide that some of this information should be 3425 * hidden from users. 3426 */ 3427 void 3428 sotoxsocket(struct socket *so, struct xsocket *xso) 3429 { 3430 3431 xso->xso_len = sizeof *xso; 3432 xso->xso_so = so; 3433 xso->so_type = so->so_type; 3434 xso->so_options = so->so_options; 3435 xso->so_linger = so->so_linger; 3436 xso->so_state = so->so_state; 3437 xso->so_pcb = so->so_pcb; 3438 xso->xso_protocol = so->so_proto->pr_protocol; 3439 xso->xso_family = so->so_proto->pr_domain->dom_family; 3440 xso->so_qlen = so->so_qlen; 3441 xso->so_incqlen = so->so_incqlen; 3442 xso->so_qlimit = so->so_qlimit; 3443 xso->so_timeo = so->so_timeo; 3444 xso->so_error = so->so_error; 3445 xso->so_pgid = so->so_sigio ? 
so->so_sigio->sio_pgid : 0; 3446 xso->so_oobmark = so->so_oobmark; 3447 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 3448 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 3449 xso->so_uid = so->so_cred->cr_uid; 3450 } 3451 3452 3453 /* 3454 * Socket accessor functions to provide external consumers with 3455 * a safe interface to socket state 3456 * 3457 */ 3458 3459 void 3460 so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), 3461 void *arg) 3462 { 3463 3464 TAILQ_FOREACH(so, &so->so_comp, so_list) 3465 func(so, arg); 3466 } 3467 3468 struct sockbuf * 3469 so_sockbuf_rcv(struct socket *so) 3470 { 3471 3472 return (&so->so_rcv); 3473 } 3474 3475 struct sockbuf * 3476 so_sockbuf_snd(struct socket *so) 3477 { 3478 3479 return (&so->so_snd); 3480 } 3481 3482 int 3483 so_state_get(const struct socket *so) 3484 { 3485 3486 return (so->so_state); 3487 } 3488 3489 void 3490 so_state_set(struct socket *so, int val) 3491 { 3492 3493 so->so_state = val; 3494 } 3495 3496 int 3497 so_options_get(const struct socket *so) 3498 { 3499 3500 return (so->so_options); 3501 } 3502 3503 void 3504 so_options_set(struct socket *so, int val) 3505 { 3506 3507 so->so_options = val; 3508 } 3509 3510 int 3511 so_error_get(const struct socket *so) 3512 { 3513 3514 return (so->so_error); 3515 } 3516 3517 void 3518 so_error_set(struct socket *so, int val) 3519 { 3520 3521 so->so_error = val; 3522 } 3523 3524 int 3525 so_linger_get(const struct socket *so) 3526 { 3527 3528 return (so->so_linger); 3529 } 3530 3531 void 3532 so_linger_set(struct socket *so, int val) 3533 { 3534 3535 so->so_linger = val; 3536 } 3537 3538 struct protosw * 3539 so_protosw_get(const struct socket *so) 3540 { 3541 3542 return (so->so_proto); 3543 } 3544 3545 void 3546 so_protosw_set(struct socket *so, struct protosw *val) 3547 { 3548 3549 so->so_proto = val; 3550 } 3551 3552 void 3553 so_sorwakeup(struct socket *so) 3554 { 3555 3556 sorwakeup(so); 3557 } 3558 3559 void 3560 so_sowwakeup(struct socket *so) 3561 { 3562 3563 sowwakeup(so); 3564 } 3565 3566 void 3567 so_sorwakeup_locked(struct socket *so) 3568 { 3569 3570 sorwakeup_locked(so); 3571 } 3572 3573 void 3574 so_sowwakeup_locked(struct socket *so) 3575 { 3576 3577 sowwakeup_locked(so); 3578 } 3579 3580 void 3581 so_lock(struct socket *so) 3582 { 3583 3584 SOCK_LOCK(so); 3585 } 3586 3587 void 3588 so_unlock(struct socket *so) 3589 { 3590 3591 SOCK_UNLOCK(so); 3592 } 3593
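
/*
 * An illustrative (non-normative) use of the accessors above by an
 * external consumer such as an offload driver, which is expected to go
 * through these routines rather than dereferencing struct socket
 * directly:
 *
 *	struct sockbuf *sb = so_sockbuf_rcv(so);
 *
 *	if (so_state_get(so) & SS_ISCONNECTED)
 *		so_sorwakeup(so);
 */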