/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004-2008 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

/*
 * Comments on the socket life cycle:
 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate() and sonewconn().  Socket layer private.
 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and sonewconn().  Socket layer private.
 *
 * pru_attach() associates protocol layer state with an allocated socket;
 * called only once, may fail, aborting socket allocation.  This is called
 * from socreate() and sonewconn().  Socket layer private.
 *
 * pru_detach() disassociates protocol layer state from an attached socket,
 * and will be called exactly once for sockets in which pru_attach() has
 * been successfully called.  If pru_attach() returned an error,
 * pru_detach() will not be called.  Socket layer private.
 *
 * pru_abort() and pru_close() notify the protocol layer that the last
 * consumer of a socket is starting to tear down the socket, and that the
 * protocol should terminate the connection.  Historically, pru_abort() also
 * detached protocol state from the socket state, but this is no longer the
 * case.
 *
 * socreate() creates a socket and attaches protocol state.  This is a public
 * interface that may be used by socket layer consumers to create new
 * sockets.
 *
 * sonewconn() creates a socket and attaches protocol state.  This is a
 * public interface that may be used by protocols to create new sockets when
 * a new connection is received and will be available for accept() on a
 * listen socket.
 *
 * soclose() destroys a socket after possibly waiting for it to disconnect.
 * This is a public interface that socket consumers should use to close and
 * release a socket when done with it.
 *
 * soabort() destroys a socket without waiting for it to disconnect (used
 * only for incoming connections that are already partially or fully
 * connected).  This is used internally by the socket layer when clearing
 * listen socket queues (due to overflow or close on the listen socket), but
 * is also a public interface protocols may use to abort connections in
 * their incomplete listen queues should they no longer be required.  Sockets
 * placed in completed connection listen queues should not be aborted for
 * reasons described in the comment above the soclose() implementation.  This
 * is not a general purpose close routine, and except in the specific
 * circumstances described here, should not be used.
 *
 * sofree() will free a socket and its protocol state if all references on
 * the socket have been released, and is the interface used to attempt to
 * free a socket when a reference is removed.  This is a socket layer
 * private interface.
 *
 * NOTE: In addition to socreate() and soclose(), which provide a single
 * socket reference to the consumer to be managed as required, there are two
 * calls to explicitly manage socket references, soref(), and sorele().
 * Currently, these are generally required only when transitioning a socket
 * from a listen queue to a file descriptor, in order to prevent garbage
 * collection of the socket at an untimely moment.  For a number of reasons,
 * these interfaces are not preferred, and should be avoided.
 *
 * NOTE: With regard to VNETs the general rule is that callers do not set
 * curvnet.  Exceptions to this rule include soabort(), sodisconnect(),
 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
 * and sorflush(), which are usually called from a pre-set VNET context.
 * sopoll() currently does not need a VNET context to be set.
 */
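
/*
 * As an illustration of the public half of this life cycle, a minimal
 * sketch of an in-kernel consumer creating, binding, and releasing a TCP
 * socket might look like the following (error handling abbreviated; the
 * sockaddr setup and the thread pointer 'td' are assumed, not part of this
 * file):
 *
 *	struct socket *so;
 *	struct sockaddr_in sin;	 // assumed filled in by the caller
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	error = sobind(so, (struct sockaddr *)&sin, td);
 *	...
 *	soclose(so);		// releases the single socreate() reference
 */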

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_zero.h"
#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <net/route.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>
#include <sys/syslog.h>

#include <net/vnet.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/sysent.h>
#include <compat/freebsd32/freebsd32.h>
#endif

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_solisten,
};
static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};
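
/*
 * These filterops back EVFILT_READ and EVFILT_WRITE on sockets.  As a
 * userland illustration (a sketch, not part of this file), a process might
 * register for socket readability with:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 * On a listen socket the same filter reports pending connections via
 * filt_solisten() rather than buffered data via filt_soread().
 */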

so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

/*
 * Limit on the number of connections in the listen queue waiting
 * for accept(2).
 * NB: The original sysctl somaxconn is still available but hidden
 * to prevent confusion about the actual purpose of this number.
 */
static int somaxconn = SOMAXCONN;

static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
{
	int error;
	int val;

	val = somaxconn;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr)
		return (error);

	if (val < 1 || val > USHRT_MAX)
		return (EINVAL);

	somaxconn = val;
	return (0);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
    0, sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size");
SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
    0, sizeof(int), sysctl_somaxconn, "I",
    "Maximum listen socket pending connection accept queue size (compat)");

static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");

#if defined(SOCKET_SEND_COW) || defined(SOCKET_RECV_PFLIP)
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
#ifdef SOCKET_RECV_PFLIP
int so_zero_copy_receive = 1;
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
#endif
#ifdef SOCKET_SEND_COW
int so_zero_copy_send = 1;
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* SOCKET_SEND_COW */
#endif /* SOCKET_SEND_COW || SOCKET_RECV_PFLIP */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects the so_gencnt and numopensockets globals, as well
 * as the per-socket so_gencnt field.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * Initialize the socket subsystem and set up the socket
 * memory allocator.
 */
uma_zone_t socket_zone;
int	maxsockets;

static void
socket_zone_change(void *tag)
{

	uma_zone_set_max(socket_zone, maxsockets);
}

static void
socket_init(void *tag)
{

	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(socket_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);

/*
 * Initialise maxsockets.  This SYSINIT must be run after
 * tunable_mbinit().
 */
static void
init_maxsockets(void *ignored)
{

	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
}
SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
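
/*
 * Because init_maxsockets() fetches a loader tunable, the limit can be
 * seeded at boot rather than adjusted afterwards.  For example (a hedged
 * illustration, value chosen arbitrarily), in /boot/loader.conf:
 *
 *	kern.ipc.maxsockets="262144"
 */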

/*
 * Sysctl to get and set the maximum global sockets limit.  Notify protocols
 * of the change so that they can update their dependent limits as required.
 */
static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
{
	int error, newmaxsockets;

	newmaxsockets = maxsockets;
	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
	if (error == 0 && req->newptr) {
		if (newmaxsockets > maxsockets) {
			maxsockets = newmaxsockets;
			if (maxsockets > ((maxfiles / 4) * 3)) {
				maxfiles = (maxsockets * 5) / 4;
				maxfilesperproc = (maxfiles * 9) / 10;
			}
			EVENTHANDLER_INVOKE(maxsockets_change);
		} else
			error = EINVAL;
	}
	return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
    &maxsockets, 0, sysctl_maxsockets, "IU",
    "Maximum number of sockets available");
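
/*
 * From userland the same limit is visible as kern.ipc.maxsockets.  A
 * hedged sketch of raising it at run time (note the handler above only
 * accepts increases; a smaller value returns EINVAL):
 *
 *	int new = 262144, old;
 *	size_t oldlen = sizeof(old);
 *
 *	if (sysctlbyname("kern.ipc.maxsockets", &old, &oldlen,
 *	    &new, sizeof(new)) == -1)
 *		err(1, "sysctlbyname");
 */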

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
static struct socket *
soalloc(struct vnet *vnet)
{
	struct socket *so;

	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
	if (so == NULL)
		return (NULL);
#ifdef MAC
	if (mac_socket_init(so, M_NOWAIT) != 0) {
		uma_zfree(socket_zone, so);
		return (NULL);
	}
#endif
	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
	TAILQ_INIT(&so->so_aiojobq);
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	++numopensockets;
#ifdef VIMAGE
	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	vnet->vnet_sockcnt++;
	so->so_vnet = vnet;
#endif
	mtx_unlock(&so_global_mtx);
	return (so);
}

/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* Remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_socket_destroy(so);
#endif
	crfree(so->so_cred);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	uma_zfree(socket_zone, so);
}

/*
 * socreate() returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(int dom, struct socket **aso, int type, int proto,
    struct ucred *cred, struct thread *td)
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
		return (EPROTONOSUPPORT);

	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
		return (EPROTONOSUPPORT);

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(CRED_TO_VNET(cred));
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	if ((prp->pr_domain->dom_family == PF_INET) ||
	    (prp->pr_domain->dom_family == PF_INET6) ||
	    (prp->pr_domain->dom_family == PF_ROUTE))
		so->so_fibnum = td->td_proc->p_fibnum;
	else
		so->so_fibnum = 0;
	so->so_proto = prp;
#ifdef MAC
	mac_socket_create(cred, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	so->so_count = 1;
	/*
	 * Auto-sizing of socket buffers is managed by the protocols and
	 * the appropriate flags must be set in the pru_attach function.
	 */
	CURVNET_SET(so->so_vnet);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	CURVNET_RESTORE();
	if (error) {
		KASSERT(so->so_count == 1, ("socreate: so_count %d",
		    so->so_count));
		so->so_count = 0;
		sodealloc(so);
		return (error);
	}
	*aso = so;
	return (0);
}
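
/*
 * The protocol lookup in socreate() has two forms: an explicit protocol
 * requests an exact match, while proto == 0 selects the domain's default
 * entry for the socket type.  Illustrative calls (a sketch, not from this
 * file):
 *
 *	prp = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
 *	prp = pffindtype(PF_INET, SOCK_DGRAM);	// default: UDP
 */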

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

/*
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn is called.  If the connection is possible (subject
 * to space constraints, etc.) then we allocate a new structure, properly
 * linked into the data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
sonewconn(struct socket *head, int connstatus)
{
	struct socket *so;
	int over;

	ACCEPT_LOCK();
	over = (head->so_qlen > 3 * head->so_qlimit / 2);
	ACCEPT_UNLOCK();
#ifdef REGRESSION
	if (regression_sonewconn_earlytest && over) {
#else
	if (over) {
#endif
		log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
		    "%i already in queue awaiting acceptance\n",
		    __func__, head->so_pcb, head->so_qlen);
		return (NULL);
	}
	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
	    __func__, __LINE__, head));
	so = soalloc(head->so_vnet);
	if (so == NULL) {
		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
		    "limit reached or out of memory\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((head->so_options & SO_ACCEPTFILTER) != 0)
		connstatus = 0;
	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_fibnum = head->so_fibnum;
	so->so_proto = head->so_proto;
	so->so_cred = crhold(head->so_cred);
#ifdef MAC
	mac_socket_newconn(head, so);
#endif
	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	VNET_SO_ASSERT(head);
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
		sodealloc(so);
		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
		    __func__, head->so_pcb);
		return (NULL);
	}
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
	so->so_state |= connstatus;
	ACCEPT_LOCK();
	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_qstate |= SQ_COMP;
		head->so_qlen++;
	} else {
		/*
		 * Keep removing sockets from the head until there's room for
		 * us to insert on the tail.  In pre-locking revisions, this
		 * was a simple if(), but as we could be racing with other
		 * threads and soabort() requires dropping locks, we must
		 * loop waiting for the condition to be true.
		 */
		while (head->so_incqlen > head->so_qlimit) {
			struct socket *sp;
			sp = TAILQ_FIRST(&head->so_incomp);
			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
			head->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_qstate |= SQ_INCOMP;
		head->so_incqlen++;
	}
	ACCEPT_UNLOCK();
	if (connstatus) {
		sorwakeup(head);
		wakeup_one(&head->so_timeo);
	}
	return (so);
}

int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * solisten() transitions a socket from a non-listening state to a listening
 * state, but can also be used to update the listen queue depth on an
 * existing listen socket.  The protocol will call back into the sockets
 * layer using solisten_proto_check() and solisten_proto() to check and set
 * socket-layer listen state.  Callbacks are used so that the protocol can
 * acquire both protocol and socket layer locks in whatever order is required
 * by the protocol.
 *
 * Protocol implementors are advised to hold the socket lock across the
 * socket-layer test and set to avoid races at the socket layer.
 */
int
solisten(struct socket *so, int backlog, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
	CURVNET_RESTORE();
	return (error);
}

int
solisten_proto_check(struct socket *so)
{

	SOCK_LOCK_ASSERT(so);

	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	return (0);
}

void
solisten_proto(struct socket *so, int backlog)
{

	SOCK_LOCK_ASSERT(so);

	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	so->so_options |= SO_ACCEPTCONN;
}
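
/*
 * A protocol's pru_listen method is expected to pair these callbacks with
 * its own locking.  A hedged sketch for a hypothetical protocol "foo"
 * (names are illustrative, not from this file):
 *
 *	static int
 *	foo_listen(struct socket *so, int backlog, struct thread *td)
 *	{
 *		int error;
 *
 *		FOO_PCB_LOCK(...);
 *		SOCK_LOCK(so);
 *		error = solisten_proto_check(so);
 *		if (error == 0)
 *			solisten_proto(so, backlog);
 *		SOCK_UNLOCK(so);
 *		FOO_PCB_UNLOCK(...);
 *		return (error);
 *	}
 */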

/*
 * Evaluate the reference count and named references on a socket; if no
 * references remain, free it.  This should be called whenever a reference is
 * released, such as in sorele(), but also when named reference flags are
 * cleared in socket or protocol code.
 *
 * sofree() will free the socket if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, so a process has been
 *   notified that it is present.  If it is removed, the user process may
 *   block in accept() despite select() saying the socket was ready.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;
	struct socket *head;

	ACCEPT_LOCK_ASSERT();
	SOCK_LOCK_ASSERT(so);

	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
		SOCK_UNLOCK(so);
		ACCEPT_UNLOCK();
		return;
	}

	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	if (so->so_options & SO_ACCEPTCONN) {
		KASSERT((TAILQ_EMPTY(&so->so_comp)),
		    ("sofree: so_comp populated"));
		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
		    ("sofree: so_incomp populated"));
	}
	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	VNET_SO_ASSERT(so);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct call to
	 * dom_dispose() and sbrelease_internal() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 */
	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	seldrain(&so->so_snd.sb_sel);
	seldrain(&so->so_rcv.sb_sel);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}
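
/*
 * As noted in the life cycle comments at the top of this file, explicit
 * reference management uses soref()/sorele().  A hedged sketch of the
 * accept-time pattern (locking as required by the current definitions of
 * those macros in socketvar.h):
 *
 *	SOCK_LOCK(so);
 *	soref(so);		// hold the socket across the transition
 *	SOCK_UNLOCK(so);
 *	...
 *	ACCEPT_LOCK();
 *	SOCK_LOCK(so);
 *	sorele(so);		// drops both locks; the last reference
 *				// ends up in sofree()
 */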

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket structure
 * will not be freed until the ref count reaches zero.
 */
int
soclose(struct socket *so)
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	CURVNET_RESTORE();
	return (error);
}
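
/*
 * The SO_LINGER handling above is what gives close(2) its blocking
 * semantics when lingering is enabled.  A hedged userland illustration:
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(s);	// may sleep up to 5 seconds in soclose() above,
 *			// waiting for the disconnect to complete
 */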

/*
 * soabort() is used to abruptly tear down a connection, such as when a
 * resource limit is reached (listen queue depth exceeded), or if a listen
 * socket is closed while there are sockets waiting to be accepted.
 *
 * This interface is tricky, because it is called on an unreferenced socket,
 * and must be called only by a thread that has actually removed the socket
 * from the listen queue it was on, or races with other threads are risked.
 *
 * This interface will call into the protocol code, so must not be called
 * with any socket locks held.  Protocols do call it while holding their own
 * recursible protocol mutexes, but this is something that should be subject
 * to review in the future.
 */
void
soabort(struct socket *so)
{

	/*
	 * As much as possible, assert that no references to this socket are
	 * held.  This is not quite the same as asserting that the current
	 * thread is responsible for arranging for no references, but is as
	 * close as we can get for now.
	 */
	KASSERT(so->so_count == 0, ("soabort: so_count"));
	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
	KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
	KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
	VNET_SO_ASSERT(so);

	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
		(*so->so_proto->pr_usrreqs->pru_abort)(so);
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sofree(so);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);

	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	CURVNET_RESTORE();
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);

	CURVNET_SET(so->so_vnet);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Prevent accumulated error from previous connection from
		 * biting us.
		 */
		so->so_error = 0;
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	}
	CURVNET_RESTORE();

	return (error);
}

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	CURVNET_SET(so1->so_vnet);
	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
	CURVNET_RESTORE();
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}
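
/*
 * The "connect to a null address" escape hatch mentioned above is what
 * lets a datagram socket dissolve its association.  A hedged userland
 * sketch:
 *
 *	struct sockaddr sa;
 *
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_family = AF_UNSPEC;
 *	sa.sa_len = sizeof(sa);
 *	connect(s, &sa, sizeof(sa));	// disconnects a connected UDP socket
 */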

#ifdef SOCKET_SEND_COW
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};

/*
 * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
 * sosend_dgram() and sosend_generic() use m_uiotombuf().
 *
 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
 * all of the data referenced by the uio.  If desired, it uses zero-copy.
 * *space will be updated to reflect data copied in.
 *
 * NB: If atomic I/O is requested, the caller must already have checked that
 * space can hold resid bytes.
 *
 * NB: In the event of an error, the caller may need to free the partial
 * chain pointed to by *mpp.  The contents of both *uio and *space may be
 * modified even in the case of an error.
 */
static int
sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
    int flags)
{
	struct mbuf *m, **mp, *top;
	long len;
	ssize_t resid;
	int error;
	int cow_send;

	*retmp = top = NULL;
	mp = &top;
	len = 0;
	resid = uio->uio_resid;
	error = 0;
	do {
		cow_send = 0;
		if (resid >= MINCLSIZE) {
			if (top == NULL) {
				m = m_gethdr(M_WAITOK, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;
			} else
				m = m_get(M_WAITOK, MT_DATA);
			if (so_zero_copy_send &&
			    resid >= PAGE_SIZE &&
			    *space >= PAGE_SIZE &&
			    uio->uio_iov->iov_len >= PAGE_SIZE) {
				so_zerocp_stats.size_ok++;
				so_zerocp_stats.align_ok++;
				cow_send = socow_setup(m, uio);
				len = cow_send;
			}
			if (!cow_send) {
				m_clget(m, M_WAITOK);
				len = min(min(MCLBYTES, resid), *space);
			}
		} else {
			if (top == NULL) {
				m = m_gethdr(M_WAIT, MT_DATA);
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = NULL;

				len = min(min(MHLEN, resid), *space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && m && len < MHLEN)
					MH_ALIGN(m, len);
			} else {
				m = m_get(M_WAIT, MT_DATA);
				len = min(min(MLEN, resid), *space);
			}
		}
		if (m == NULL) {
			error = ENOBUFS;
			goto out;
		}

		*space -= len;
		if (cow_send)
			error = 0;
		else
			error = uiomove(mtod(m, void *), (int)len, uio);
		resid = uio->uio_resid;
		m->m_len = len;
		*mp = m;
		top->m_pkthdr.len += len;
		if (error)
			goto out;
		mp = &m->m_next;
		if (resid <= 0) {
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
			break;
		}
	} while (*space > 0 && atomic);
out:
	*retmp = top;
	return (error);
}
#endif /* SOCKET_SEND_COW */

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
#ifdef SOCKET_SEND_COW
	int atomic = sosendallatonce(so) || top;
#endif

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' are allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
#ifdef SOCKET_SEND_COW
		error = sosend_copyin(uio, &top, atomic, &space, flags);
		if (error)
			goto out;
#else
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
#endif /* SOCKET_SEND_COW */
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag and
	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
		/* If there is more to send set PRUS_MORETOCOME */
		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
	    top, addr, control, td);
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a
			 * connection-based socket if it supports implied
			 * connect.  Return ENOTCONN if not connected and no
			 * address is supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
#ifdef SOCKET_SEND_COW
				error = sosend_copyin(uio, &top, atomic,
				    &space, flags);
				if (error != 0)
					goto release;
#else
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				top = m_uiotombuf(uio, M_WAITOK, space,
				    (atomic ? max_hdr : 0),
				    (atomic ? M_PKTHDR : 0) |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT;	/* only possible error */
					goto release;
				}
				space -= resid - uio->uio_resid;
#endif /* SOCKET_SEND_COW */
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag and nothing left to send then use
			 * PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	int error;

	CURVNET_SET(so->so_vnet);
	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
	    control, flags, td);
	CURVNET_RESTORE();
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
	VNET_SO_ASSERT(so);

	m = m_get(M_WAIT, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef SOCKET_RECV_PFLIP
		if (so_zero_copy_receive) {
			int disposable;

			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, disposable);
		} else
#endif /* SOCKET_RECV_PFLIP */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}
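
/*
 * Non-inline out-of-band data is what a peer reads with MSG_OOB when
 * SO_OOBINLINE is not set.  A hedged userland sketch of the receive side:
 *
 *	char c;
 *
 *	if (recv(s, &c, 1, MSG_OOB) == 1)
 *		...	// got the byte the peer sent with send(..., MSG_OOB)
 */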

/*
 * Following replacement or removal of the first mbuf on the first mbuf
 * chain of a socket buffer, push necessary state changes back into the
 * socket buffer so that other consumers see the values consistently.
 * 'nextrecord' is the caller's locally stored value of the original value
 * of sb->sb_mb->m_nextpkt which must be restored when the lead mbuf
 * changes.  NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
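
/*
 * For reference while reading soreceive_generic() below, a sketch of the
 * sockbuf record layout it depends on (each record is a chain linked by
 * m_next; records are linked by m_nextpkt):
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL] -> [MT_DATA] -> [MT_DATA]
 *	             |
 *	         m_nextpkt
 *	             |
 *	             v
 *	         [MT_SONAME] -> [MT_DATA] ...
 *
 * The address and control mbufs are optional, as described below.
 */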
MSG_DONTWAIT is not set 1547 */ 1548 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1549 so->so_rcv.sb_cc < uio->uio_resid) && 1550 so->so_rcv.sb_cc < so->so_rcv.sb_lowat && 1551 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1552 KASSERT(m != NULL || !so->so_rcv.sb_cc, 1553 ("receive: m == %p so->so_rcv.sb_cc == %u", 1554 m, so->so_rcv.sb_cc)); 1555 if (so->so_error) { 1556 if (m != NULL) 1557 goto dontblock; 1558 error = so->so_error; 1559 if ((flags & MSG_PEEK) == 0) 1560 so->so_error = 0; 1561 SOCKBUF_UNLOCK(&so->so_rcv); 1562 goto release; 1563 } 1564 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1565 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1566 if (m == NULL) { 1567 SOCKBUF_UNLOCK(&so->so_rcv); 1568 goto release; 1569 } else 1570 goto dontblock; 1571 } 1572 for (; m != NULL; m = m->m_next) 1573 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1574 m = so->so_rcv.sb_mb; 1575 goto dontblock; 1576 } 1577 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1578 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1579 SOCKBUF_UNLOCK(&so->so_rcv); 1580 error = ENOTCONN; 1581 goto release; 1582 } 1583 if (uio->uio_resid == 0) { 1584 SOCKBUF_UNLOCK(&so->so_rcv); 1585 goto release; 1586 } 1587 if ((so->so_state & SS_NBIO) || 1588 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1589 SOCKBUF_UNLOCK(&so->so_rcv); 1590 error = EWOULDBLOCK; 1591 goto release; 1592 } 1593 SBLASTRECORDCHK(&so->so_rcv); 1594 SBLASTMBUFCHK(&so->so_rcv); 1595 error = sbwait(&so->so_rcv); 1596 SOCKBUF_UNLOCK(&so->so_rcv); 1597 if (error) 1598 goto release; 1599 goto restart; 1600 } 1601 dontblock: 1602 /* 1603 * From this point onward, we maintain 'nextrecord' as a cache of the 1604 * pointer to the next record in the socket buffer. We must keep the 1605 * various socket buffer pointers and local stack versions of the 1606 * pointers in sync, pushing out modifications before dropping the 1607 * socket buffer mutex, and re-reading them when picking it up. 1608 * 1609 * Otherwise, we will race with the network stack appending new data 1610 * or records onto the socket buffer by using inconsistent/stale 1611 * versions of the field, possibly resulting in socket buffer 1612 * corruption. 1613 * 1614 * By holding the high-level sblock(), we prevent simultaneous 1615 * readers from pulling off the front of the socket buffer. 1616 */ 1617 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1618 if (uio->uio_td) 1619 uio->uio_td->td_ru.ru_msgrcv++; 1620 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1621 SBLASTRECORDCHK(&so->so_rcv); 1622 SBLASTMBUFCHK(&so->so_rcv); 1623 nextrecord = m->m_nextpkt; 1624 if (pr->pr_flags & PR_ADDR) { 1625 KASSERT(m->m_type == MT_SONAME, 1626 ("m->m_type == %d", m->m_type)); 1627 orig_resid = 0; 1628 if (psa != NULL) 1629 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1630 M_NOWAIT); 1631 if (flags & MSG_PEEK) { 1632 m = m->m_next; 1633 } else { 1634 sbfree(&so->so_rcv, m); 1635 so->so_rcv.sb_mb = m_free(m); 1636 m = so->so_rcv.sb_mb; 1637 sockbuf_pushsync(&so->so_rcv, nextrecord); 1638 } 1639 } 1640 1641 /* 1642 * Process one or more MT_CONTROL mbufs present before any data mbufs 1643 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1644 * just copy the data; if !MSG_PEEK, we call into the protocol to 1645 * perform externalization (or freeing if controlp == NULL). 
1646 */ 1647 if (m != NULL && m->m_type == MT_CONTROL) { 1648 struct mbuf *cm = NULL, *cmn; 1649 struct mbuf **cme = &cm; 1650 1651 do { 1652 if (flags & MSG_PEEK) { 1653 if (controlp != NULL) { 1654 *controlp = m_copy(m, 0, m->m_len); 1655 controlp = &(*controlp)->m_next; 1656 } 1657 m = m->m_next; 1658 } else { 1659 sbfree(&so->so_rcv, m); 1660 so->so_rcv.sb_mb = m->m_next; 1661 m->m_next = NULL; 1662 *cme = m; 1663 cme = &(*cme)->m_next; 1664 m = so->so_rcv.sb_mb; 1665 } 1666 } while (m != NULL && m->m_type == MT_CONTROL); 1667 if ((flags & MSG_PEEK) == 0) 1668 sockbuf_pushsync(&so->so_rcv, nextrecord); 1669 while (cm != NULL) { 1670 cmn = cm->m_next; 1671 cm->m_next = NULL; 1672 if (pr->pr_domain->dom_externalize != NULL) { 1673 SOCKBUF_UNLOCK(&so->so_rcv); 1674 VNET_SO_ASSERT(so); 1675 error = (*pr->pr_domain->dom_externalize) 1676 (cm, controlp); 1677 SOCKBUF_LOCK(&so->so_rcv); 1678 } else if (controlp != NULL) 1679 *controlp = cm; 1680 else 1681 m_freem(cm); 1682 if (controlp != NULL) { 1683 orig_resid = 0; 1684 while (*controlp != NULL) 1685 controlp = &(*controlp)->m_next; 1686 } 1687 cm = cmn; 1688 } 1689 if (m != NULL) 1690 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1691 else 1692 nextrecord = so->so_rcv.sb_mb; 1693 orig_resid = 0; 1694 } 1695 if (m != NULL) { 1696 if ((flags & MSG_PEEK) == 0) { 1697 KASSERT(m->m_nextpkt == nextrecord, 1698 ("soreceive: post-control, nextrecord !sync")); 1699 if (nextrecord == NULL) { 1700 KASSERT(so->so_rcv.sb_mb == m, 1701 ("soreceive: post-control, sb_mb!=m")); 1702 KASSERT(so->so_rcv.sb_lastrecord == m, 1703 ("soreceive: post-control, lastrecord!=m")); 1704 } 1705 } 1706 type = m->m_type; 1707 if (type == MT_OOBDATA) 1708 flags |= MSG_OOB; 1709 } else { 1710 if ((flags & MSG_PEEK) == 0) { 1711 KASSERT(so->so_rcv.sb_mb == nextrecord, 1712 ("soreceive: sb_mb != nextrecord")); 1713 if (so->so_rcv.sb_mb == NULL) { 1714 KASSERT(so->so_rcv.sb_lastrecord == NULL, 1715 ("soreceive: sb_lastercord != NULL")); 1716 } 1717 } 1718 } 1719 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1720 SBLASTRECORDCHK(&so->so_rcv); 1721 SBLASTMBUFCHK(&so->so_rcv); 1722 1723 /* 1724 * Now continue to read any data mbufs off of the head of the socket 1725 * buffer until the read request is satisfied. Note that 'type' is 1726 * used to store the type of any mbuf reads that have happened so far 1727 * such that soreceive() can stop reading if the type changes, which 1728 * causes soreceive() to return only one of regular data and inline 1729 * out-of-band data in a single socket receive operation. 1730 */ 1731 moff = 0; 1732 offset = 0; 1733 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1734 /* 1735 * If the type of mbuf has changed since the last mbuf 1736 * examined ('type'), end the receive operation. 1737 */ 1738 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1739 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { 1740 if (type != m->m_type) 1741 break; 1742 } else if (type == MT_OOBDATA) 1743 break; 1744 else 1745 KASSERT(m->m_type == MT_DATA, 1746 ("m->m_type == %d", m->m_type)); 1747 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1748 len = uio->uio_resid; 1749 if (so->so_oobmark && len > so->so_oobmark - offset) 1750 len = so->so_oobmark - offset; 1751 if (len > m->m_len - moff) 1752 len = m->m_len - moff; 1753 /* 1754 * If mp is set, just pass back the mbufs. Otherwise copy 1755 * them out via the uio, then free. 
Sockbuf must be 1756 * consistent here (points to current mbuf, it points to next 1757 * record) when we drop priority; we must note any additions 1758 * to the sockbuf when we block interrupts again. 1759 */ 1760 if (mp == NULL) { 1761 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1762 SBLASTRECORDCHK(&so->so_rcv); 1763 SBLASTMBUFCHK(&so->so_rcv); 1764 SOCKBUF_UNLOCK(&so->so_rcv); 1765 #ifdef SOCKET_RECV_PFLIP 1766 if (so_zero_copy_receive) { 1767 int disposable; 1768 1769 if ((m->m_flags & M_EXT) 1770 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1771 disposable = 1; 1772 else 1773 disposable = 0; 1774 1775 error = uiomoveco(mtod(m, char *) + moff, 1776 (int)len, uio, 1777 disposable); 1778 } else 1779 #endif /* SOCKET_RECV_PFLIP */ 1780 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1781 SOCKBUF_LOCK(&so->so_rcv); 1782 if (error) { 1783 /* 1784 * The MT_SONAME mbuf has already been removed 1785 * from the record, so it is necessary to 1786 * remove the data mbufs, if any, to preserve 1787 * the invariant in the case of PR_ADDR that 1788 * requires MT_SONAME mbufs at the head of 1789 * each record. 1790 */ 1791 if (m && pr->pr_flags & PR_ATOMIC && 1792 ((flags & MSG_PEEK) == 0)) 1793 (void)sbdroprecord_locked(&so->so_rcv); 1794 SOCKBUF_UNLOCK(&so->so_rcv); 1795 goto release; 1796 } 1797 } else 1798 uio->uio_resid -= len; 1799 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1800 if (len == m->m_len - moff) { 1801 if (m->m_flags & M_EOR) 1802 flags |= MSG_EOR; 1803 if (flags & MSG_PEEK) { 1804 m = m->m_next; 1805 moff = 0; 1806 } else { 1807 nextrecord = m->m_nextpkt; 1808 sbfree(&so->so_rcv, m); 1809 if (mp != NULL) { 1810 *mp = m; 1811 mp = &m->m_next; 1812 so->so_rcv.sb_mb = m = m->m_next; 1813 *mp = NULL; 1814 } else { 1815 so->so_rcv.sb_mb = m_free(m); 1816 m = so->so_rcv.sb_mb; 1817 } 1818 sockbuf_pushsync(&so->so_rcv, nextrecord); 1819 SBLASTRECORDCHK(&so->so_rcv); 1820 SBLASTMBUFCHK(&so->so_rcv); 1821 } 1822 } else { 1823 if (flags & MSG_PEEK) 1824 moff += len; 1825 else { 1826 if (mp != NULL) { 1827 int copy_flag; 1828 1829 if (flags & MSG_DONTWAIT) 1830 copy_flag = M_DONTWAIT; 1831 else 1832 copy_flag = M_WAIT; 1833 if (copy_flag == M_WAIT) 1834 SOCKBUF_UNLOCK(&so->so_rcv); 1835 *mp = m_copym(m, 0, len, copy_flag); 1836 if (copy_flag == M_WAIT) 1837 SOCKBUF_LOCK(&so->so_rcv); 1838 if (*mp == NULL) { 1839 /* 1840 * m_copym() couldn't 1841 * allocate an mbuf. Adjust 1842 * uio_resid back (it was 1843 * adjusted down by len 1844 * bytes, which we didn't end 1845 * up "copying" over). 1846 */ 1847 uio->uio_resid += len; 1848 break; 1849 } 1850 } 1851 m->m_data += len; 1852 m->m_len -= len; 1853 so->so_rcv.sb_cc -= len; 1854 } 1855 } 1856 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1857 if (so->so_oobmark) { 1858 if ((flags & MSG_PEEK) == 0) { 1859 so->so_oobmark -= len; 1860 if (so->so_oobmark == 0) { 1861 so->so_rcv.sb_state |= SBS_RCVATMARK; 1862 break; 1863 } 1864 } else { 1865 offset += len; 1866 if (offset == so->so_oobmark) 1867 break; 1868 } 1869 } 1870 if (flags & MSG_EOR) 1871 break; 1872 /* 1873 * If the MSG_WAITALL flag is set (for non-atomic socket), we 1874 * must not quit until "uio->uio_resid == 0" or an error 1875 * termination. If a signal/timeout occurs, return with a 1876 * short count but without error. Keep sockbuf locked 1877 * against other readers. 
1878 */
1879 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1880 !sosendallatonce(so) && nextrecord == NULL) {
1881 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1882 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
1883 break;
1884 /*
1885 * Notify the protocol that some data has been
1886 * drained before blocking.
1887 */
1888 if (pr->pr_flags & PR_WANTRCVD) {
1889 SOCKBUF_UNLOCK(&so->so_rcv);
1890 VNET_SO_ASSERT(so);
1891 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1892 SOCKBUF_LOCK(&so->so_rcv);
1893 }
1894 SBLASTRECORDCHK(&so->so_rcv);
1895 SBLASTMBUFCHK(&so->so_rcv);
1896 /*
1897 * We could receive some data while we were notifying
1898 * the protocol. Skip blocking in this case.
1899 */
1900 if (so->so_rcv.sb_mb == NULL) {
1901 error = sbwait(&so->so_rcv);
1902 if (error) {
1903 SOCKBUF_UNLOCK(&so->so_rcv);
1904 goto release;
1905 }
1906 }
1907 m = so->so_rcv.sb_mb;
1908 if (m != NULL)
1909 nextrecord = m->m_nextpkt;
1910 }
1911 }
1912
1913 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1914 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
1915 flags |= MSG_TRUNC;
1916 if ((flags & MSG_PEEK) == 0)
1917 (void) sbdroprecord_locked(&so->so_rcv);
1918 }
1919 if ((flags & MSG_PEEK) == 0) {
1920 if (m == NULL) {
1921 /*
1922 * First part is an inline SB_EMPTY_FIXUP(). Second
1923 * part makes sure sb_lastrecord is up-to-date if
1924 * there is still data in the socket buffer.
1925 */
1926 so->so_rcv.sb_mb = nextrecord;
1927 if (so->so_rcv.sb_mb == NULL) {
1928 so->so_rcv.sb_mbtail = NULL;
1929 so->so_rcv.sb_lastrecord = NULL;
1930 } else if (nextrecord->m_nextpkt == NULL)
1931 so->so_rcv.sb_lastrecord = nextrecord;
1932 }
1933 SBLASTRECORDCHK(&so->so_rcv);
1934 SBLASTMBUFCHK(&so->so_rcv);
1935 /*
1936 * If soreceive() is being done from the socket callback,
1937 * then we don't need to generate an ACK to the peer to
1938 * update the window, since the ACK will be generated upon
1939 * return to TCP.
1940 */
1940 if (!(flags & MSG_SOCALLBCK) &&
1941 (pr->pr_flags & PR_WANTRCVD)) {
1942 SOCKBUF_UNLOCK(&so->so_rcv);
1943 VNET_SO_ASSERT(so);
1944 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1945 SOCKBUF_LOCK(&so->so_rcv);
1946 }
1947 }
1948 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1949 if (orig_resid == uio->uio_resid && orig_resid &&
1950 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1951 SOCKBUF_UNLOCK(&so->so_rcv);
1952 goto restart;
1953 }
1954 SOCKBUF_UNLOCK(&so->so_rcv);
1955
1956 if (flagsp != NULL)
1957 *flagsp |= flags;
1958 release:
1959 sbunlock(&so->so_rcv);
1960 return (error);
1961 }
1962
1963 /*
1964 * Optimized version of soreceive() for stream (TCP) sockets.
1965 * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
1966 */
1967 int
1968 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
1969 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1970 {
1971 int len = 0, error = 0, flags, oresid;
1972 struct sockbuf *sb;
1973 struct mbuf *m, *n = NULL;
1974
1975 /* We only do stream sockets. */
1976 if (so->so_type != SOCK_STREAM)
1977 return (EINVAL);
1978 if (psa != NULL)
1979 *psa = NULL;
1980 if (controlp != NULL)
1981 return (EINVAL);
1982 if (flagsp != NULL)
1983 flags = *flagsp &~ MSG_EOR;
1984 else
1985 flags = 0;
1986 if (flags & MSG_OOB)
1987 return (soreceive_rcvoob(so, uio, flags));
1988 if (mp0 != NULL)
1989 *mp0 = NULL;
1990
1991 sb = &so->so_rcv;
1992
1993 /* Prevent other readers from entering the socket buffer.
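 * SBLOCKWAIT() yields SBL_WAIT only when MSG_DONTWAIT is clear, so
 * non-blocking callers fail fast in sblock() with EWOULDBLOCK instead
 * of sleeping for the sockbuf I/O lock.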
*/
1994 error = sblock(sb, SBLOCKWAIT(flags));
1995 if (error)
1996 goto out;
1997 SOCKBUF_LOCK(sb);
1998
1999 /* Easy one, no space to copyout anything. */
2000 if (uio->uio_resid == 0) {
2001 error = EINVAL;
2002 goto out;
2003 }
2004 oresid = uio->uio_resid;
2005
2006 /* We will never ever get anything unless we are or were connected. */
2007 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2008 error = ENOTCONN;
2009 goto out;
2010 }
2011
2012 restart:
2013 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2014
2015 /* Abort if socket has reported problems. */
2016 if (so->so_error) {
2017 if (sb->sb_cc > 0)
2018 goto deliver;
2019 if (oresid > uio->uio_resid)
2020 goto out;
2021 error = so->so_error;
2022 if (!(flags & MSG_PEEK))
2023 so->so_error = 0;
2024 goto out;
2025 }
2026
2027 /* Door is closed. Deliver what is left, if any. */
2028 if (sb->sb_state & SBS_CANTRCVMORE) {
2029 if (sb->sb_cc > 0)
2030 goto deliver;
2031 else
2032 goto out;
2033 }
2034
2035 /* Socket buffer is empty and we shall not block. */
2036 if (sb->sb_cc == 0 &&
2037 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
2038 error = EAGAIN;
2039 goto out;
2040 }
2041
2042 /* Socket buffer got some data that we shall deliver now. */
2043 if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
2044 ((so->so_state & SS_NBIO) ||
2045 (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
2046 sb->sb_cc >= sb->sb_lowat ||
2047 sb->sb_cc >= uio->uio_resid ||
2048 sb->sb_cc >= sb->sb_hiwat)) {
2049 goto deliver;
2050 }
2051
2052 /* On MSG_WAITALL we must wait until all data or an error arrives. */
2053 if ((flags & MSG_WAITALL) &&
2054 (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
2055 goto deliver;
2056
2057 /*
2058 * Wait and block until (more) data comes in.
2059 * NB: Drops the sockbuf lock during wait.
2060 */
2061 error = sbwait(sb);
2062 if (error)
2063 goto out;
2064 goto restart;
2065
2066 deliver:
2067 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2068 KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
2069 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
2070
2071 /* Statistics. */
2072 if (uio->uio_td)
2073 uio->uio_td->td_ru.ru_msgrcv++;
2074
2075 /* Fill uio until full or current end of socket buffer is reached. */
2076 len = min(uio->uio_resid, sb->sb_cc);
2077 if (mp0 != NULL) {
2078 /* Dequeue as many mbufs as possible. */
2079 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
2080 if (*mp0 == NULL)
2081 *mp0 = sb->sb_mb;
2082 else
2083 m_cat(*mp0, sb->sb_mb);
2084 for (m = sb->sb_mb;
2085 m != NULL && m->m_len <= len;
2086 m = m->m_next) {
2087 len -= m->m_len;
2088 uio->uio_resid -= m->m_len;
2089 sbfree(sb, m);
2090 n = m;
2091 }
2092 n->m_next = NULL;
2093 sb->sb_mb = m;
2094 sb->sb_lastrecord = sb->sb_mb;
2095 if (sb->sb_mb == NULL)
2096 SB_EMPTY_FIXUP(sb);
2097 }
2098 /* Copy the remainder. */
2099 if (len > 0) {
2100 KASSERT(sb->sb_mb != NULL,
2101 ("%s: len > 0 && sb->sb_mb empty", __func__));
2102
2103 m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
2104 if (m == NULL)
2105 len = 0; /* Don't flush data from sockbuf. */
2106 else
2107 uio->uio_resid -= len;
2108 if (*mp0 != NULL)
2109 m_cat(*mp0, m);
2110 else
2111 *mp0 = m;
2112 if (*mp0 == NULL) {
2113 error = ENOBUFS;
2114 goto out;
2115 }
2116 }
2117 } else {
2118 /* NB: Must unlock socket buffer as uiomove may sleep.
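 * m_mbuftouio() walks the mbuf chain and copies it to the uio with
 * uiomove(), which can fault on user memory.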
*/
2119 SOCKBUF_UNLOCK(sb);
2120 error = m_mbuftouio(uio, sb->sb_mb, len);
2121 SOCKBUF_LOCK(sb);
2122 if (error)
2123 goto out;
2124 }
2125 SBLASTRECORDCHK(sb);
2126 SBLASTMBUFCHK(sb);
2127
2128 /*
2129 * Remove the delivered data from the socket buffer unless we
2130 * were only peeking.
2131 */
2132 if (!(flags & MSG_PEEK)) {
2133 if (len > 0)
2134 sbdrop_locked(sb, len);
2135
2136 /* Notify protocol that we drained some data. */
2137 if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
2138 (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
2139 !(flags & MSG_SOCALLBCK))) {
2140 SOCKBUF_UNLOCK(sb);
2141 VNET_SO_ASSERT(so);
2142 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
2143 SOCKBUF_LOCK(sb);
2144 }
2145 }
2146
2147 /*
2148 * For MSG_WAITALL we may have to loop again and wait for
2149 * more data to come in.
2150 */
2151 if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
2152 goto restart;
2153 out:
2154 SOCKBUF_LOCK_ASSERT(sb);
2155 SBLASTRECORDCHK(sb);
2156 SBLASTMBUFCHK(sb);
2157 SOCKBUF_UNLOCK(sb);
2158 sbunlock(sb);
2159 return (error);
2160 }
2161
2162 /*
2163 * Optimized version of soreceive() for simple datagram cases from userspace.
2164 * Unlike in the stream case, we're able to drop a datagram if copyout()
2165 * fails, and because we handle datagrams atomically, we don't need to use a
2166 * sleep lock to prevent I/O interlacing.
2167 */
2168 int
2169 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2170 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2171 {
2172 struct mbuf *m, *m2;
2173 int flags, error;
2174 ssize_t len;
2175 struct protosw *pr = so->so_proto;
2176 struct mbuf *nextrecord;
2177
2178 if (psa != NULL)
2179 *psa = NULL;
2180 if (controlp != NULL)
2181 *controlp = NULL;
2182 if (flagsp != NULL)
2183 flags = *flagsp &~ MSG_EOR;
2184 else
2185 flags = 0;
2186
2187 /*
2188 * For any complicated cases, fall back to the full
2189 * soreceive_generic().
2190 */
2191 if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
2192 return (soreceive_generic(so, psa, uio, mp0, controlp,
2193 flagsp));
2194
2195 /*
2196 * Enforce restrictions on use.
2197 */
2198 KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
2199 ("soreceive_dgram: wantrcvd"));
2200 KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
2201 KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
2202 ("soreceive_dgram: SBS_RCVATMARK"));
2203 KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
2204 ("soreceive_dgram: PR_CONNREQUIRED"));
2205
2206 /*
2207 * Loop blocking while waiting for a datagram.
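 *
 * A sketch of the externally visible behavior (hypothetical consumer,
 * not part of this file): with SS_NBIO set or MSG_DONTWAIT passed, the
 * loop below returns EWOULDBLOCK instead of sleeping in sbwait():
 *
 *	n = recvfrom(s, buf, sizeof(buf), MSG_DONTWAIT,
 *	    (struct sockaddr *)&from, &fromlen);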
2208 */ 2209 SOCKBUF_LOCK(&so->so_rcv); 2210 while ((m = so->so_rcv.sb_mb) == NULL) { 2211 KASSERT(so->so_rcv.sb_cc == 0, 2212 ("soreceive_dgram: sb_mb NULL but sb_cc %u", 2213 so->so_rcv.sb_cc)); 2214 if (so->so_error) { 2215 error = so->so_error; 2216 so->so_error = 0; 2217 SOCKBUF_UNLOCK(&so->so_rcv); 2218 return (error); 2219 } 2220 if (so->so_rcv.sb_state & SBS_CANTRCVMORE || 2221 uio->uio_resid == 0) { 2222 SOCKBUF_UNLOCK(&so->so_rcv); 2223 return (0); 2224 } 2225 if ((so->so_state & SS_NBIO) || 2226 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 2227 SOCKBUF_UNLOCK(&so->so_rcv); 2228 return (EWOULDBLOCK); 2229 } 2230 SBLASTRECORDCHK(&so->so_rcv); 2231 SBLASTMBUFCHK(&so->so_rcv); 2232 error = sbwait(&so->so_rcv); 2233 if (error) { 2234 SOCKBUF_UNLOCK(&so->so_rcv); 2235 return (error); 2236 } 2237 } 2238 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2239 2240 if (uio->uio_td) 2241 uio->uio_td->td_ru.ru_msgrcv++; 2242 SBLASTRECORDCHK(&so->so_rcv); 2243 SBLASTMBUFCHK(&so->so_rcv); 2244 nextrecord = m->m_nextpkt; 2245 if (nextrecord == NULL) { 2246 KASSERT(so->so_rcv.sb_lastrecord == m, 2247 ("soreceive_dgram: lastrecord != m")); 2248 } 2249 2250 KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, 2251 ("soreceive_dgram: m_nextpkt != nextrecord")); 2252 2253 /* 2254 * Pull 'm' and its chain off the front of the packet queue. 2255 */ 2256 so->so_rcv.sb_mb = NULL; 2257 sockbuf_pushsync(&so->so_rcv, nextrecord); 2258 2259 /* 2260 * Walk 'm's chain and free that many bytes from the socket buffer. 2261 */ 2262 for (m2 = m; m2 != NULL; m2 = m2->m_next) 2263 sbfree(&so->so_rcv, m2); 2264 2265 /* 2266 * Do a few last checks before we let go of the lock. 2267 */ 2268 SBLASTRECORDCHK(&so->so_rcv); 2269 SBLASTMBUFCHK(&so->so_rcv); 2270 SOCKBUF_UNLOCK(&so->so_rcv); 2271 2272 if (pr->pr_flags & PR_ADDR) { 2273 KASSERT(m->m_type == MT_SONAME, 2274 ("m->m_type == %d", m->m_type)); 2275 if (psa != NULL) 2276 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 2277 M_NOWAIT); 2278 m = m_free(m); 2279 } 2280 if (m == NULL) { 2281 /* XXXRW: Can this happen? */ 2282 return (0); 2283 } 2284 2285 /* 2286 * Packet to copyout() is now in 'm' and it is disconnected from the 2287 * queue. 2288 * 2289 * Process one or more MT_CONTROL mbufs present before any data mbufs 2290 * in the first mbuf chain on the socket buffer. We call into the 2291 * protocol to perform externalization (or freeing if controlp == 2292 * NULL). 
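 *
 * Externalization matters for, e.g., unix(4) SCM_RIGHTS control
 * messages, where dom_externalize() turns in-kernel file references
 * into descriptors in the receiving process. A hypothetical receiver
 * sketch (not part of this file):
 *
 *	struct cmsghdr *cmsg;
 *	recvmsg(s, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
 *	    cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_type == SCM_RIGHTS)
 *			handle_fds(CMSG_DATA(cmsg));	(hypothetical)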
2293 */ 2294 if (m->m_type == MT_CONTROL) { 2295 struct mbuf *cm = NULL, *cmn; 2296 struct mbuf **cme = &cm; 2297 2298 do { 2299 m2 = m->m_next; 2300 m->m_next = NULL; 2301 *cme = m; 2302 cme = &(*cme)->m_next; 2303 m = m2; 2304 } while (m != NULL && m->m_type == MT_CONTROL); 2305 while (cm != NULL) { 2306 cmn = cm->m_next; 2307 cm->m_next = NULL; 2308 if (pr->pr_domain->dom_externalize != NULL) { 2309 error = (*pr->pr_domain->dom_externalize) 2310 (cm, controlp); 2311 } else if (controlp != NULL) 2312 *controlp = cm; 2313 else 2314 m_freem(cm); 2315 if (controlp != NULL) { 2316 while (*controlp != NULL) 2317 controlp = &(*controlp)->m_next; 2318 } 2319 cm = cmn; 2320 } 2321 } 2322 KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data")); 2323 2324 while (m != NULL && uio->uio_resid > 0) { 2325 len = uio->uio_resid; 2326 if (len > m->m_len) 2327 len = m->m_len; 2328 error = uiomove(mtod(m, char *), (int)len, uio); 2329 if (error) { 2330 m_freem(m); 2331 return (error); 2332 } 2333 if (len == m->m_len) 2334 m = m_free(m); 2335 else { 2336 m->m_data += len; 2337 m->m_len -= len; 2338 } 2339 } 2340 if (m != NULL) 2341 flags |= MSG_TRUNC; 2342 m_freem(m); 2343 if (flagsp != NULL) 2344 *flagsp |= flags; 2345 return (0); 2346 } 2347 2348 int 2349 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, 2350 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 2351 { 2352 int error; 2353 2354 CURVNET_SET(so->so_vnet); 2355 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, 2356 controlp, flagsp)); 2357 CURVNET_RESTORE(); 2358 return (error); 2359 } 2360 2361 int 2362 soshutdown(struct socket *so, int how) 2363 { 2364 struct protosw *pr = so->so_proto; 2365 int error; 2366 2367 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 2368 return (EINVAL); 2369 2370 CURVNET_SET(so->so_vnet); 2371 if (pr->pr_usrreqs->pru_flush != NULL) { 2372 (*pr->pr_usrreqs->pru_flush)(so, how); 2373 } 2374 if (how != SHUT_WR) 2375 sorflush(so); 2376 if (how != SHUT_RD) { 2377 error = (*pr->pr_usrreqs->pru_shutdown)(so); 2378 CURVNET_RESTORE(); 2379 return (error); 2380 } 2381 CURVNET_RESTORE(); 2382 return (0); 2383 } 2384 2385 void 2386 sorflush(struct socket *so) 2387 { 2388 struct sockbuf *sb = &so->so_rcv; 2389 struct protosw *pr = so->so_proto; 2390 struct sockbuf asb; 2391 2392 VNET_SO_ASSERT(so); 2393 2394 /* 2395 * In order to avoid calling dom_dispose with the socket buffer mutex 2396 * held, and in order to generally avoid holding the lock for a long 2397 * time, we make a copy of the socket buffer and clear the original 2398 * (except locks, state). The new socket buffer copy won't have 2399 * initialized locks so we can only call routines that won't use or 2400 * assert those locks. 2401 * 2402 * Dislodge threads currently blocked in receive and wait to acquire 2403 * a lock against other simultaneous readers before clearing the 2404 * socket buffer. Don't let our acquire be interrupted by a signal 2405 * despite any existing socket disposition on interruptable waiting. 2406 */ 2407 socantrcvmore(so); 2408 (void) sblock(sb, SBL_WAIT | SBL_NOINTR); 2409 2410 /* 2411 * Invalidate/clear most of the sockbuf structure, but leave selinfo 2412 * and mutex data unchanged. 
2413 */ 2414 SOCKBUF_LOCK(sb); 2415 bzero(&asb, offsetof(struct sockbuf, sb_startzero)); 2416 bcopy(&sb->sb_startzero, &asb.sb_startzero, 2417 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2418 bzero(&sb->sb_startzero, 2419 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 2420 SOCKBUF_UNLOCK(sb); 2421 sbunlock(sb); 2422 2423 /* 2424 * Dispose of special rights and flush the socket buffer. Don't call 2425 * any unsafe routines (that rely on locks being initialized) on asb. 2426 */ 2427 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 2428 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 2429 sbrelease_internal(&asb, so); 2430 } 2431 2432 /* 2433 * Perhaps this routine, and sooptcopyout(), below, ought to come in an 2434 * additional variant to handle the case where the option value needs to be 2435 * some kind of integer, but not a specific size. In addition to their use 2436 * here, these functions are also called by the protocol-level pr_ctloutput() 2437 * routines. 2438 */ 2439 int 2440 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) 2441 { 2442 size_t valsize; 2443 2444 /* 2445 * If the user gives us more than we wanted, we ignore it, but if we 2446 * don't get the minimum length the caller wants, we return EINVAL. 2447 * On success, sopt->sopt_valsize is set to however much we actually 2448 * retrieved. 2449 */ 2450 if ((valsize = sopt->sopt_valsize) < minlen) 2451 return EINVAL; 2452 if (valsize > len) 2453 sopt->sopt_valsize = valsize = len; 2454 2455 if (sopt->sopt_td != NULL) 2456 return (copyin(sopt->sopt_val, buf, valsize)); 2457 2458 bcopy(sopt->sopt_val, buf, valsize); 2459 return (0); 2460 } 2461 2462 /* 2463 * Kernel version of setsockopt(2). 2464 * 2465 * XXX: optlen is size_t, not socklen_t 2466 */ 2467 int 2468 so_setsockopt(struct socket *so, int level, int optname, void *optval, 2469 size_t optlen) 2470 { 2471 struct sockopt sopt; 2472 2473 sopt.sopt_level = level; 2474 sopt.sopt_name = optname; 2475 sopt.sopt_dir = SOPT_SET; 2476 sopt.sopt_val = optval; 2477 sopt.sopt_valsize = optlen; 2478 sopt.sopt_td = NULL; 2479 return (sosetopt(so, &sopt)); 2480 } 2481 2482 int 2483 sosetopt(struct socket *so, struct sockopt *sopt) 2484 { 2485 int error, optval; 2486 struct linger l; 2487 struct timeval tv; 2488 u_long val; 2489 uint32_t val32; 2490 #ifdef MAC 2491 struct mac extmac; 2492 #endif 2493 2494 CURVNET_SET(so->so_vnet); 2495 error = 0; 2496 if (sopt->sopt_level != SOL_SOCKET) { 2497 if (so->so_proto->pr_ctloutput != NULL) { 2498 error = (*so->so_proto->pr_ctloutput)(so, sopt); 2499 CURVNET_RESTORE(); 2500 return (error); 2501 } 2502 error = ENOPROTOOPT; 2503 } else { 2504 switch (sopt->sopt_name) { 2505 #ifdef INET 2506 case SO_ACCEPTFILTER: 2507 error = do_setopt_accept_filter(so, sopt); 2508 if (error) 2509 goto bad; 2510 break; 2511 #endif 2512 case SO_LINGER: 2513 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 2514 if (error) 2515 goto bad; 2516 2517 SOCK_LOCK(so); 2518 so->so_linger = l.l_linger; 2519 if (l.l_onoff) 2520 so->so_options |= SO_LINGER; 2521 else 2522 so->so_options &= ~SO_LINGER; 2523 SOCK_UNLOCK(so); 2524 break; 2525 2526 case SO_DEBUG: 2527 case SO_KEEPALIVE: 2528 case SO_DONTROUTE: 2529 case SO_USELOOPBACK: 2530 case SO_BROADCAST: 2531 case SO_REUSEADDR: 2532 case SO_REUSEPORT: 2533 case SO_OOBINLINE: 2534 case SO_TIMESTAMP: 2535 case SO_BINTIME: 2536 case SO_NOSIGPIPE: 2537 case SO_NO_DDP: 2538 case SO_NO_OFFLOAD: 2539 error = sooptcopyin(sopt, &optval, sizeof optval, 2540 sizeof optval); 2541 
if (error) 2542 goto bad; 2543 SOCK_LOCK(so); 2544 if (optval) 2545 so->so_options |= sopt->sopt_name; 2546 else 2547 so->so_options &= ~sopt->sopt_name; 2548 SOCK_UNLOCK(so); 2549 break; 2550 2551 case SO_SETFIB: 2552 error = sooptcopyin(sopt, &optval, sizeof optval, 2553 sizeof optval); 2554 if (error) 2555 goto bad; 2556 2557 if (optval < 0 || optval >= rt_numfibs) { 2558 error = EINVAL; 2559 goto bad; 2560 } 2561 if (((so->so_proto->pr_domain->dom_family == PF_INET) || 2562 (so->so_proto->pr_domain->dom_family == PF_INET6) || 2563 (so->so_proto->pr_domain->dom_family == PF_ROUTE))) 2564 so->so_fibnum = optval; 2565 else 2566 so->so_fibnum = 0; 2567 break; 2568 2569 case SO_USER_COOKIE: 2570 error = sooptcopyin(sopt, &val32, sizeof val32, 2571 sizeof val32); 2572 if (error) 2573 goto bad; 2574 so->so_user_cookie = val32; 2575 break; 2576 2577 case SO_SNDBUF: 2578 case SO_RCVBUF: 2579 case SO_SNDLOWAT: 2580 case SO_RCVLOWAT: 2581 error = sooptcopyin(sopt, &optval, sizeof optval, 2582 sizeof optval); 2583 if (error) 2584 goto bad; 2585 2586 /* 2587 * Values < 1 make no sense for any of these options, 2588 * so disallow them. 2589 */ 2590 if (optval < 1) { 2591 error = EINVAL; 2592 goto bad; 2593 } 2594 2595 switch (sopt->sopt_name) { 2596 case SO_SNDBUF: 2597 case SO_RCVBUF: 2598 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 2599 &so->so_snd : &so->so_rcv, (u_long)optval, 2600 so, curthread) == 0) { 2601 error = ENOBUFS; 2602 goto bad; 2603 } 2604 (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : 2605 &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE; 2606 break; 2607 2608 /* 2609 * Make sure the low-water is never greater than the 2610 * high-water. 2611 */ 2612 case SO_SNDLOWAT: 2613 SOCKBUF_LOCK(&so->so_snd); 2614 so->so_snd.sb_lowat = 2615 (optval > so->so_snd.sb_hiwat) ? 2616 so->so_snd.sb_hiwat : optval; 2617 SOCKBUF_UNLOCK(&so->so_snd); 2618 break; 2619 case SO_RCVLOWAT: 2620 SOCKBUF_LOCK(&so->so_rcv); 2621 so->so_rcv.sb_lowat = 2622 (optval > so->so_rcv.sb_hiwat) ? 
2623 so->so_rcv.sb_hiwat : optval;
2624 SOCKBUF_UNLOCK(&so->so_rcv);
2625 break;
2626 }
2627 break;
2628
2629 case SO_SNDTIMEO:
2630 case SO_RCVTIMEO:
2631 #ifdef COMPAT_FREEBSD32
2632 if (SV_CURPROC_FLAG(SV_ILP32)) {
2633 struct timeval32 tv32;
2634
2635 error = sooptcopyin(sopt, &tv32, sizeof tv32,
2636 sizeof tv32);
2637 CP(tv32, tv, tv_sec);
2638 CP(tv32, tv, tv_usec);
2639 } else
2640 #endif
2641 error = sooptcopyin(sopt, &tv, sizeof tv,
2642 sizeof tv);
2643 if (error)
2644 goto bad;
2645
2646 /* assert(hz > 0); */
2647 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
2648 tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
2649 error = EDOM;
2650 goto bad;
2651 }
2652 /* assert(tick > 0); */
2653 /* assert(ULONG_MAX - INT_MAX >= 1000000); */
2654 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
2655 if (val > INT_MAX) {
2656 error = EDOM;
2657 goto bad;
2658 }
2659 if (val == 0 && tv.tv_usec != 0)
2660 val = 1;
2661
2662 switch (sopt->sopt_name) {
2663 case SO_SNDTIMEO:
2664 so->so_snd.sb_timeo = val;
2665 break;
2666 case SO_RCVTIMEO:
2667 so->so_rcv.sb_timeo = val;
2668 break;
2669 }
2670 break;
2671
2672 case SO_LABEL:
2673 #ifdef MAC
2674 error = sooptcopyin(sopt, &extmac, sizeof extmac,
2675 sizeof extmac);
2676 if (error)
2677 goto bad;
2678 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
2679 so, &extmac);
2680 #else
2681 error = EOPNOTSUPP;
2682 #endif
2683 break;
2684
2685 default:
2686 error = ENOPROTOOPT;
2687 break;
2688 }
2689 if (error == 0 && so->so_proto->pr_ctloutput != NULL)
2690 (void)(*so->so_proto->pr_ctloutput)(so, sopt);
2691 }
2692 bad:
2693 CURVNET_RESTORE();
2694 return (error);
2695 }
2696
2697 /*
2698 * Helper routine for getsockopt.
2699 */
2700 int
2701 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
2702 {
2703 int error;
2704 size_t valsize;
2705
2706 error = 0;
2707
2708 /*
2709 * Documented get behavior is that we always return a value, possibly
2710 * truncated to fit in the user's buffer. Traditional behavior is
2711 * that we always tell the user precisely how much we copied, rather
2712 * than something useful like the total amount we had available for
2713 * her. Note that this interface is not idempotent; the entire
2714 * answer must be generated ahead of time.
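 *
 * Typical use from a protocol's getsockopt path (a hypothetical sketch,
 * not part of this file; 'optval' is an assumed local int):
 *
 *	optval = some_protocol_value;
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));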
2715 */ 2716 valsize = min(len, sopt->sopt_valsize); 2717 sopt->sopt_valsize = valsize; 2718 if (sopt->sopt_val != NULL) { 2719 if (sopt->sopt_td != NULL) 2720 error = copyout(buf, sopt->sopt_val, valsize); 2721 else 2722 bcopy(buf, sopt->sopt_val, valsize); 2723 } 2724 return (error); 2725 } 2726 2727 int 2728 sogetopt(struct socket *so, struct sockopt *sopt) 2729 { 2730 int error, optval; 2731 struct linger l; 2732 struct timeval tv; 2733 #ifdef MAC 2734 struct mac extmac; 2735 #endif 2736 2737 CURVNET_SET(so->so_vnet); 2738 error = 0; 2739 if (sopt->sopt_level != SOL_SOCKET) { 2740 if (so->so_proto->pr_ctloutput != NULL) 2741 error = (*so->so_proto->pr_ctloutput)(so, sopt); 2742 else 2743 error = ENOPROTOOPT; 2744 CURVNET_RESTORE(); 2745 return (error); 2746 } else { 2747 switch (sopt->sopt_name) { 2748 #ifdef INET 2749 case SO_ACCEPTFILTER: 2750 error = do_getopt_accept_filter(so, sopt); 2751 break; 2752 #endif 2753 case SO_LINGER: 2754 SOCK_LOCK(so); 2755 l.l_onoff = so->so_options & SO_LINGER; 2756 l.l_linger = so->so_linger; 2757 SOCK_UNLOCK(so); 2758 error = sooptcopyout(sopt, &l, sizeof l); 2759 break; 2760 2761 case SO_USELOOPBACK: 2762 case SO_DONTROUTE: 2763 case SO_DEBUG: 2764 case SO_KEEPALIVE: 2765 case SO_REUSEADDR: 2766 case SO_REUSEPORT: 2767 case SO_BROADCAST: 2768 case SO_OOBINLINE: 2769 case SO_ACCEPTCONN: 2770 case SO_TIMESTAMP: 2771 case SO_BINTIME: 2772 case SO_NOSIGPIPE: 2773 optval = so->so_options & sopt->sopt_name; 2774 integer: 2775 error = sooptcopyout(sopt, &optval, sizeof optval); 2776 break; 2777 2778 case SO_TYPE: 2779 optval = so->so_type; 2780 goto integer; 2781 2782 case SO_PROTOCOL: 2783 optval = so->so_proto->pr_protocol; 2784 goto integer; 2785 2786 case SO_ERROR: 2787 SOCK_LOCK(so); 2788 optval = so->so_error; 2789 so->so_error = 0; 2790 SOCK_UNLOCK(so); 2791 goto integer; 2792 2793 case SO_SNDBUF: 2794 optval = so->so_snd.sb_hiwat; 2795 goto integer; 2796 2797 case SO_RCVBUF: 2798 optval = so->so_rcv.sb_hiwat; 2799 goto integer; 2800 2801 case SO_SNDLOWAT: 2802 optval = so->so_snd.sb_lowat; 2803 goto integer; 2804 2805 case SO_RCVLOWAT: 2806 optval = so->so_rcv.sb_lowat; 2807 goto integer; 2808 2809 case SO_SNDTIMEO: 2810 case SO_RCVTIMEO: 2811 optval = (sopt->sopt_name == SO_SNDTIMEO ? 
2812 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 2813 2814 tv.tv_sec = optval / hz; 2815 tv.tv_usec = (optval % hz) * tick; 2816 #ifdef COMPAT_FREEBSD32 2817 if (SV_CURPROC_FLAG(SV_ILP32)) { 2818 struct timeval32 tv32; 2819 2820 CP(tv, tv32, tv_sec); 2821 CP(tv, tv32, tv_usec); 2822 error = sooptcopyout(sopt, &tv32, sizeof tv32); 2823 } else 2824 #endif 2825 error = sooptcopyout(sopt, &tv, sizeof tv); 2826 break; 2827 2828 case SO_LABEL: 2829 #ifdef MAC 2830 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2831 sizeof(extmac)); 2832 if (error) 2833 goto bad; 2834 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 2835 so, &extmac); 2836 if (error) 2837 goto bad; 2838 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2839 #else 2840 error = EOPNOTSUPP; 2841 #endif 2842 break; 2843 2844 case SO_PEERLABEL: 2845 #ifdef MAC 2846 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2847 sizeof(extmac)); 2848 if (error) 2849 goto bad; 2850 error = mac_getsockopt_peerlabel( 2851 sopt->sopt_td->td_ucred, so, &extmac); 2852 if (error) 2853 goto bad; 2854 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2855 #else 2856 error = EOPNOTSUPP; 2857 #endif 2858 break; 2859 2860 case SO_LISTENQLIMIT: 2861 optval = so->so_qlimit; 2862 goto integer; 2863 2864 case SO_LISTENQLEN: 2865 optval = so->so_qlen; 2866 goto integer; 2867 2868 case SO_LISTENINCQLEN: 2869 optval = so->so_incqlen; 2870 goto integer; 2871 2872 default: 2873 error = ENOPROTOOPT; 2874 break; 2875 } 2876 } 2877 #ifdef MAC 2878 bad: 2879 #endif 2880 CURVNET_RESTORE(); 2881 return (error); 2882 } 2883 2884 int 2885 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 2886 { 2887 struct mbuf *m, *m_prev; 2888 int sopt_size = sopt->sopt_valsize; 2889 2890 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); 2891 if (m == NULL) 2892 return ENOBUFS; 2893 if (sopt_size > MLEN) { 2894 MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT); 2895 if ((m->m_flags & M_EXT) == 0) { 2896 m_free(m); 2897 return ENOBUFS; 2898 } 2899 m->m_len = min(MCLBYTES, sopt_size); 2900 } else { 2901 m->m_len = min(MLEN, sopt_size); 2902 } 2903 sopt_size -= m->m_len; 2904 *mp = m; 2905 m_prev = m; 2906 2907 while (sopt_size) { 2908 MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); 2909 if (m == NULL) { 2910 m_freem(*mp); 2911 return ENOBUFS; 2912 } 2913 if (sopt_size > MLEN) { 2914 MCLGET(m, sopt->sopt_td != NULL ? 
M_WAIT :
2915 M_DONTWAIT);
2916 if ((m->m_flags & M_EXT) == 0) {
2917 m_freem(m);
2918 m_freem(*mp);
2919 return ENOBUFS;
2920 }
2921 m->m_len = min(MCLBYTES, sopt_size);
2922 } else {
2923 m->m_len = min(MLEN, sopt_size);
2924 }
2925 sopt_size -= m->m_len;
2926 m_prev->m_next = m;
2927 m_prev = m;
2928 }
2929 return (0);
2930 }
2931
2932 int
2933 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
2934 {
2935 struct mbuf *m0 = m;
2936
2937 if (sopt->sopt_val == NULL)
2938 return (0);
2939 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2940 if (sopt->sopt_td != NULL) {
2941 int error;
2942
2943 error = copyin(sopt->sopt_val, mtod(m, char *),
2944 m->m_len);
2945 if (error != 0) {
2946 m_freem(m0);
2947 return (error);
2948 }
2949 } else
2950 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2951 sopt->sopt_valsize -= m->m_len;
2952 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2953 m = m->m_next;
2954 }
2955 if (m != NULL) /* enough buffer should have been allocated by soopt_getm() */
2956 panic("ip6_sooptmcopyin");
2957 return (0);
2958 }
2959
2960 int
2961 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
2962 {
2963 struct mbuf *m0 = m;
2964 size_t valsize = 0;
2965
2966 if (sopt->sopt_val == NULL)
2967 return (0);
2968 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2969 if (sopt->sopt_td != NULL) {
2970 int error;
2971
2972 error = copyout(mtod(m, char *), sopt->sopt_val,
2973 m->m_len);
2974 if (error != 0) {
2975 m_freem(m0);
2976 return (error);
2977 }
2978 } else
2979 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2980 sopt->sopt_valsize -= m->m_len;
2981 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
2982 valsize += m->m_len;
2983 m = m->m_next;
2984 }
2985 if (m != NULL) {
2986 /* not enough soopt buffer was supplied from userland */
2987 m_freem(m0);
2988 return (EINVAL);
2989 }
2990 sopt->sopt_valsize = valsize;
2991 return (0);
2992 }
2993
2994 /*
2995 * sohasoutofband(): protocol notifies socket layer of the arrival of new
2996 * out-of-band data, which will then notify socket consumers.
2997 */
2998 void
2999 sohasoutofband(struct socket *so)
3000 {
3001
3002 if (so->so_sigio != NULL)
3003 pgsigio(&so->so_sigio, SIGURG, 0);
3004 selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
3005 }
3006
3007 int
3008 sopoll(struct socket *so, int events, struct ucred *active_cred,
3009 struct thread *td)
3010 {
3011
3012 /*
3013 * We do not need to set or assert curvnet as long as everyone uses
3014 * sopoll_generic().
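 * Protocols normally just point pru_sopoll at sopoll_generic() in
 * their pr_usrreqs, e.g. (a hypothetical fragment):
 *
 *	.pru_sopoll =	sopoll_generic,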
3015 */ 3016 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, 3017 td)); 3018 } 3019 3020 int 3021 sopoll_generic(struct socket *so, int events, struct ucred *active_cred, 3022 struct thread *td) 3023 { 3024 int revents = 0; 3025 3026 SOCKBUF_LOCK(&so->so_snd); 3027 SOCKBUF_LOCK(&so->so_rcv); 3028 if (events & (POLLIN | POLLRDNORM)) 3029 if (soreadabledata(so)) 3030 revents |= events & (POLLIN | POLLRDNORM); 3031 3032 if (events & (POLLOUT | POLLWRNORM)) 3033 if (sowriteable(so)) 3034 revents |= events & (POLLOUT | POLLWRNORM); 3035 3036 if (events & (POLLPRI | POLLRDBAND)) 3037 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) 3038 revents |= events & (POLLPRI | POLLRDBAND); 3039 3040 if ((events & POLLINIGNEOF) == 0) { 3041 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3042 revents |= events & (POLLIN | POLLRDNORM); 3043 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 3044 revents |= POLLHUP; 3045 } 3046 } 3047 3048 if (revents == 0) { 3049 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { 3050 selrecord(td, &so->so_rcv.sb_sel); 3051 so->so_rcv.sb_flags |= SB_SEL; 3052 } 3053 3054 if (events & (POLLOUT | POLLWRNORM)) { 3055 selrecord(td, &so->so_snd.sb_sel); 3056 so->so_snd.sb_flags |= SB_SEL; 3057 } 3058 } 3059 3060 SOCKBUF_UNLOCK(&so->so_rcv); 3061 SOCKBUF_UNLOCK(&so->so_snd); 3062 return (revents); 3063 } 3064 3065 int 3066 soo_kqfilter(struct file *fp, struct knote *kn) 3067 { 3068 struct socket *so = kn->kn_fp->f_data; 3069 struct sockbuf *sb; 3070 3071 switch (kn->kn_filter) { 3072 case EVFILT_READ: 3073 if (so->so_options & SO_ACCEPTCONN) 3074 kn->kn_fop = &solisten_filtops; 3075 else 3076 kn->kn_fop = &soread_filtops; 3077 sb = &so->so_rcv; 3078 break; 3079 case EVFILT_WRITE: 3080 kn->kn_fop = &sowrite_filtops; 3081 sb = &so->so_snd; 3082 break; 3083 default: 3084 return (EINVAL); 3085 } 3086 3087 SOCKBUF_LOCK(sb); 3088 knlist_add(&sb->sb_sel.si_note, kn, 1); 3089 sb->sb_flags |= SB_KNOTE; 3090 SOCKBUF_UNLOCK(sb); 3091 return (0); 3092 } 3093 3094 /* 3095 * Some routines that return EOPNOTSUPP for entry points that are not 3096 * supported by a protocol. Fill in as needed. 
3097 */ 3098 int 3099 pru_accept_notsupp(struct socket *so, struct sockaddr **nam) 3100 { 3101 3102 return EOPNOTSUPP; 3103 } 3104 3105 int 3106 pru_attach_notsupp(struct socket *so, int proto, struct thread *td) 3107 { 3108 3109 return EOPNOTSUPP; 3110 } 3111 3112 int 3113 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3114 { 3115 3116 return EOPNOTSUPP; 3117 } 3118 3119 int 3120 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) 3121 { 3122 3123 return EOPNOTSUPP; 3124 } 3125 3126 int 3127 pru_connect2_notsupp(struct socket *so1, struct socket *so2) 3128 { 3129 3130 return EOPNOTSUPP; 3131 } 3132 3133 int 3134 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, 3135 struct ifnet *ifp, struct thread *td) 3136 { 3137 3138 return EOPNOTSUPP; 3139 } 3140 3141 int 3142 pru_disconnect_notsupp(struct socket *so) 3143 { 3144 3145 return EOPNOTSUPP; 3146 } 3147 3148 int 3149 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) 3150 { 3151 3152 return EOPNOTSUPP; 3153 } 3154 3155 int 3156 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) 3157 { 3158 3159 return EOPNOTSUPP; 3160 } 3161 3162 int 3163 pru_rcvd_notsupp(struct socket *so, int flags) 3164 { 3165 3166 return EOPNOTSUPP; 3167 } 3168 3169 int 3170 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) 3171 { 3172 3173 return EOPNOTSUPP; 3174 } 3175 3176 int 3177 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, 3178 struct sockaddr *addr, struct mbuf *control, struct thread *td) 3179 { 3180 3181 return EOPNOTSUPP; 3182 } 3183 3184 /* 3185 * This isn't really a ``null'' operation, but it's the default one and 3186 * doesn't do anything destructive. 3187 */ 3188 int 3189 pru_sense_null(struct socket *so, struct stat *sb) 3190 { 3191 3192 sb->st_blksize = so->so_snd.sb_hiwat; 3193 return 0; 3194 } 3195 3196 int 3197 pru_shutdown_notsupp(struct socket *so) 3198 { 3199 3200 return EOPNOTSUPP; 3201 } 3202 3203 int 3204 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) 3205 { 3206 3207 return EOPNOTSUPP; 3208 } 3209 3210 int 3211 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, 3212 struct mbuf *top, struct mbuf *control, int flags, struct thread *td) 3213 { 3214 3215 return EOPNOTSUPP; 3216 } 3217 3218 int 3219 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, 3220 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 3221 { 3222 3223 return EOPNOTSUPP; 3224 } 3225 3226 int 3227 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, 3228 struct thread *td) 3229 { 3230 3231 return EOPNOTSUPP; 3232 } 3233 3234 static void 3235 filt_sordetach(struct knote *kn) 3236 { 3237 struct socket *so = kn->kn_fp->f_data; 3238 3239 SOCKBUF_LOCK(&so->so_rcv); 3240 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); 3241 if (knlist_empty(&so->so_rcv.sb_sel.si_note)) 3242 so->so_rcv.sb_flags &= ~SB_KNOTE; 3243 SOCKBUF_UNLOCK(&so->so_rcv); 3244 } 3245 3246 /*ARGSUSED*/ 3247 static int 3248 filt_soread(struct knote *kn, long hint) 3249 { 3250 struct socket *so; 3251 3252 so = kn->kn_fp->f_data; 3253 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 3254 3255 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 3256 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3257 kn->kn_flags |= EV_EOF; 3258 kn->kn_fflags = so->so_error; 3259 return (1); 3260 } else if (so->so_error) /* temporary udp error */ 3261 return (1); 3262 else if (kn->kn_sfflags & NOTE_LOWAT) 3263 return 
(kn->kn_data >= kn->kn_sdata); 3264 else 3265 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); 3266 } 3267 3268 static void 3269 filt_sowdetach(struct knote *kn) 3270 { 3271 struct socket *so = kn->kn_fp->f_data; 3272 3273 SOCKBUF_LOCK(&so->so_snd); 3274 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); 3275 if (knlist_empty(&so->so_snd.sb_sel.si_note)) 3276 so->so_snd.sb_flags &= ~SB_KNOTE; 3277 SOCKBUF_UNLOCK(&so->so_snd); 3278 } 3279 3280 /*ARGSUSED*/ 3281 static int 3282 filt_sowrite(struct knote *kn, long hint) 3283 { 3284 struct socket *so; 3285 3286 so = kn->kn_fp->f_data; 3287 SOCKBUF_LOCK_ASSERT(&so->so_snd); 3288 kn->kn_data = sbspace(&so->so_snd); 3289 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3290 kn->kn_flags |= EV_EOF; 3291 kn->kn_fflags = so->so_error; 3292 return (1); 3293 } else if (so->so_error) /* temporary udp error */ 3294 return (1); 3295 else if (((so->so_state & SS_ISCONNECTED) == 0) && 3296 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 3297 return (0); 3298 else if (kn->kn_sfflags & NOTE_LOWAT) 3299 return (kn->kn_data >= kn->kn_sdata); 3300 else 3301 return (kn->kn_data >= so->so_snd.sb_lowat); 3302 } 3303 3304 /*ARGSUSED*/ 3305 static int 3306 filt_solisten(struct knote *kn, long hint) 3307 { 3308 struct socket *so = kn->kn_fp->f_data; 3309 3310 kn->kn_data = so->so_qlen; 3311 return (! TAILQ_EMPTY(&so->so_comp)); 3312 } 3313 3314 int 3315 socheckuid(struct socket *so, uid_t uid) 3316 { 3317 3318 if (so == NULL) 3319 return (EPERM); 3320 if (so->so_cred->cr_uid != uid) 3321 return (EPERM); 3322 return (0); 3323 } 3324 3325 /* 3326 * These functions are used by protocols to notify the socket layer (and its 3327 * consumers) of state changes in the sockets driven by protocol-side events. 3328 */ 3329 3330 /* 3331 * Procedures to manipulate state flags of socket and do appropriate wakeups. 3332 * 3333 * Normal sequence from the active (originating) side is that 3334 * soisconnecting() is called during processing of connect() call, resulting 3335 * in an eventual call to soisconnected() if/when the connection is 3336 * established. When the connection is torn down soisdisconnecting() is 3337 * called during processing of disconnect() call, and soisdisconnected() is 3338 * called when the connection to the peer is totally severed. The semantics 3339 * of these routines are such that connectionless protocols can call 3340 * soisconnected() and soisdisconnected() only, bypassing the in-progress 3341 * calls when setting up a ``connection'' takes no time. 3342 * 3343 * From the passive side, a socket is created with two queues of sockets: 3344 * so_incomp for connections in progress and so_comp for connections already 3345 * made and awaiting user acceptance. As a protocol is preparing incoming 3346 * connections, it creates a socket structure queued on so_incomp by calling 3347 * sonewconn(). When the connection is established, soisconnected() is 3348 * called, and transfers the socket structure to so_comp, making it available 3349 * to accept(). 3350 * 3351 * If a socket is closed with sockets on either so_incomp or so_comp, these 3352 * sockets are dropped. 3353 * 3354 * If higher-level protocols are implemented in the kernel, the wakeups done 3355 * here will sometimes cause software-interrupt process scheduling. 
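 *
 * For illustration, the usual active-open sequence as driven by a
 * protocol (a sketch of the calls described above):
 *
 *	soisconnecting(so);	from the connect() path
 *	...protocol handshake completes...
 *	soisconnected(so);	wakes sleepers; on the passive side this
 *				also moves the socket to so_comp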
3356 */ 3357 void 3358 soisconnecting(struct socket *so) 3359 { 3360 3361 SOCK_LOCK(so); 3362 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); 3363 so->so_state |= SS_ISCONNECTING; 3364 SOCK_UNLOCK(so); 3365 } 3366 3367 void 3368 soisconnected(struct socket *so) 3369 { 3370 struct socket *head; 3371 int ret; 3372 3373 restart: 3374 ACCEPT_LOCK(); 3375 SOCK_LOCK(so); 3376 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); 3377 so->so_state |= SS_ISCONNECTED; 3378 head = so->so_head; 3379 if (head != NULL && (so->so_qstate & SQ_INCOMP)) { 3380 if ((so->so_options & SO_ACCEPTFILTER) == 0) { 3381 SOCK_UNLOCK(so); 3382 TAILQ_REMOVE(&head->so_incomp, so, so_list); 3383 head->so_incqlen--; 3384 so->so_qstate &= ~SQ_INCOMP; 3385 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 3386 head->so_qlen++; 3387 so->so_qstate |= SQ_COMP; 3388 ACCEPT_UNLOCK(); 3389 sorwakeup(head); 3390 wakeup_one(&head->so_timeo); 3391 } else { 3392 ACCEPT_UNLOCK(); 3393 soupcall_set(so, SO_RCV, 3394 head->so_accf->so_accept_filter->accf_callback, 3395 head->so_accf->so_accept_filter_arg); 3396 so->so_options &= ~SO_ACCEPTFILTER; 3397 ret = head->so_accf->so_accept_filter->accf_callback(so, 3398 head->so_accf->so_accept_filter_arg, M_DONTWAIT); 3399 if (ret == SU_ISCONNECTED) 3400 soupcall_clear(so, SO_RCV); 3401 SOCK_UNLOCK(so); 3402 if (ret == SU_ISCONNECTED) 3403 goto restart; 3404 } 3405 return; 3406 } 3407 SOCK_UNLOCK(so); 3408 ACCEPT_UNLOCK(); 3409 wakeup(&so->so_timeo); 3410 sorwakeup(so); 3411 sowwakeup(so); 3412 } 3413 3414 void 3415 soisdisconnecting(struct socket *so) 3416 { 3417 3418 /* 3419 * Note: This code assumes that SOCK_LOCK(so) and 3420 * SOCKBUF_LOCK(&so->so_rcv) are the same. 3421 */ 3422 SOCKBUF_LOCK(&so->so_rcv); 3423 so->so_state &= ~SS_ISCONNECTING; 3424 so->so_state |= SS_ISDISCONNECTING; 3425 so->so_rcv.sb_state |= SBS_CANTRCVMORE; 3426 sorwakeup_locked(so); 3427 SOCKBUF_LOCK(&so->so_snd); 3428 so->so_snd.sb_state |= SBS_CANTSENDMORE; 3429 sowwakeup_locked(so); 3430 wakeup(&so->so_timeo); 3431 } 3432 3433 void 3434 soisdisconnected(struct socket *so) 3435 { 3436 3437 /* 3438 * Note: This code assumes that SOCK_LOCK(so) and 3439 * SOCKBUF_LOCK(&so->so_rcv) are the same. 3440 */ 3441 SOCKBUF_LOCK(&so->so_rcv); 3442 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); 3443 so->so_state |= SS_ISDISCONNECTED; 3444 so->so_rcv.sb_state |= SBS_CANTRCVMORE; 3445 sorwakeup_locked(so); 3446 SOCKBUF_LOCK(&so->so_snd); 3447 so->so_snd.sb_state |= SBS_CANTSENDMORE; 3448 sbdrop_locked(&so->so_snd, so->so_snd.sb_cc); 3449 sowwakeup_locked(so); 3450 wakeup(&so->so_timeo); 3451 } 3452 3453 /* 3454 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 3455 */ 3456 struct sockaddr * 3457 sodupsockaddr(const struct sockaddr *sa, int mflags) 3458 { 3459 struct sockaddr *sa2; 3460 3461 sa2 = malloc(sa->sa_len, M_SONAME, mflags); 3462 if (sa2) 3463 bcopy(sa, sa2, sa->sa_len); 3464 return sa2; 3465 } 3466 3467 /* 3468 * Register per-socket buffer upcalls. 3469 */ 3470 void 3471 soupcall_set(struct socket *so, int which, 3472 int (*func)(struct socket *, void *, int), void *arg) 3473 { 3474 struct sockbuf *sb; 3475 3476 switch (which) { 3477 case SO_RCV: 3478 sb = &so->so_rcv; 3479 break; 3480 case SO_SND: 3481 sb = &so->so_snd; 3482 break; 3483 default: 3484 panic("soupcall_set: bad which"); 3485 } 3486 SOCKBUF_LOCK_ASSERT(sb); 3487 #if 0 3488 /* XXX: accf_http actually wants to do this on purpose. 
*/ 3489 KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall")); 3490 #endif 3491 sb->sb_upcall = func; 3492 sb->sb_upcallarg = arg; 3493 sb->sb_flags |= SB_UPCALL; 3494 } 3495 3496 void 3497 soupcall_clear(struct socket *so, int which) 3498 { 3499 struct sockbuf *sb; 3500 3501 switch (which) { 3502 case SO_RCV: 3503 sb = &so->so_rcv; 3504 break; 3505 case SO_SND: 3506 sb = &so->so_snd; 3507 break; 3508 default: 3509 panic("soupcall_clear: bad which"); 3510 } 3511 SOCKBUF_LOCK_ASSERT(sb); 3512 KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); 3513 sb->sb_upcall = NULL; 3514 sb->sb_upcallarg = NULL; 3515 sb->sb_flags &= ~SB_UPCALL; 3516 } 3517 3518 /* 3519 * Create an external-format (``xsocket'') structure using the information in 3520 * the kernel-format socket structure pointed to by so. This is done to 3521 * reduce the spew of irrelevant information over this interface, to isolate 3522 * user code from changes in the kernel structure, and potentially to provide 3523 * information-hiding if we decide that some of this information should be 3524 * hidden from users. 3525 */ 3526 void 3527 sotoxsocket(struct socket *so, struct xsocket *xso) 3528 { 3529 3530 xso->xso_len = sizeof *xso; 3531 xso->xso_so = so; 3532 xso->so_type = so->so_type; 3533 xso->so_options = so->so_options; 3534 xso->so_linger = so->so_linger; 3535 xso->so_state = so->so_state; 3536 xso->so_pcb = so->so_pcb; 3537 xso->xso_protocol = so->so_proto->pr_protocol; 3538 xso->xso_family = so->so_proto->pr_domain->dom_family; 3539 xso->so_qlen = so->so_qlen; 3540 xso->so_incqlen = so->so_incqlen; 3541 xso->so_qlimit = so->so_qlimit; 3542 xso->so_timeo = so->so_timeo; 3543 xso->so_error = so->so_error; 3544 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; 3545 xso->so_oobmark = so->so_oobmark; 3546 sbtoxsockbuf(&so->so_snd, &xso->so_snd); 3547 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); 3548 xso->so_uid = so->so_cred->cr_uid; 3549 } 3550 3551 3552 /* 3553 * Socket accessor functions to provide external consumers with 3554 * a safe interface to socket state 3555 * 3556 */ 3557 3558 void 3559 so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg) 3560 { 3561 3562 TAILQ_FOREACH(so, &so->so_comp, so_list) 3563 func(so, arg); 3564 } 3565 3566 struct sockbuf * 3567 so_sockbuf_rcv(struct socket *so) 3568 { 3569 3570 return (&so->so_rcv); 3571 } 3572 3573 struct sockbuf * 3574 so_sockbuf_snd(struct socket *so) 3575 { 3576 3577 return (&so->so_snd); 3578 } 3579 3580 int 3581 so_state_get(const struct socket *so) 3582 { 3583 3584 return (so->so_state); 3585 } 3586 3587 void 3588 so_state_set(struct socket *so, int val) 3589 { 3590 3591 so->so_state = val; 3592 } 3593 3594 int 3595 so_options_get(const struct socket *so) 3596 { 3597 3598 return (so->so_options); 3599 } 3600 3601 void 3602 so_options_set(struct socket *so, int val) 3603 { 3604 3605 so->so_options = val; 3606 } 3607 3608 int 3609 so_error_get(const struct socket *so) 3610 { 3611 3612 return (so->so_error); 3613 } 3614 3615 void 3616 so_error_set(struct socket *so, int val) 3617 { 3618 3619 so->so_error = val; 3620 } 3621 3622 int 3623 so_linger_get(const struct socket *so) 3624 { 3625 3626 return (so->so_linger); 3627 } 3628 3629 void 3630 so_linger_set(struct socket *so, int val) 3631 { 3632 3633 so->so_linger = val; 3634 } 3635 3636 struct protosw * 3637 so_protosw_get(const struct socket *so) 3638 { 3639 3640 return (so->so_proto); 3641 } 3642 3643 void 3644 so_protosw_set(struct socket *so, 
struct protosw *val) 3645 { 3646 3647 so->so_proto = val; 3648 } 3649 3650 void 3651 so_sorwakeup(struct socket *so) 3652 { 3653 3654 sorwakeup(so); 3655 } 3656 3657 void 3658 so_sowwakeup(struct socket *so) 3659 { 3660 3661 sowwakeup(so); 3662 } 3663 3664 void 3665 so_sorwakeup_locked(struct socket *so) 3666 { 3667 3668 sorwakeup_locked(so); 3669 } 3670 3671 void 3672 so_sowwakeup_locked(struct socket *so) 3673 { 3674 3675 sowwakeup_locked(so); 3676 } 3677 3678 void 3679 so_lock(struct socket *so) 3680 { 3681 SOCK_LOCK(so); 3682 } 3683 3684 void 3685 so_unlock(struct socket *so) 3686 { 3687 SOCK_UNLOCK(so); 3688 } 3689
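/*
 * Example: a minimal sketch (hypothetical, for illustration only) of how an
 * external consumer is expected to use the accessors above rather than
 * dereferencing struct socket directly:
 *
 *	if (so_state_get(so) & SS_ISCONNECTED)
 *		so_error_set(so, ECONNRESET);
 *	so_sorwakeup(so);
 */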