1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * Copyright (c) 2004 The FreeBSD Foundation 5 * Copyright (c) 2004-2006 Robert N. M. Watson 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 32 */ 33 34 /* 35 * Comments on the socket life cycle: 36 * 37 * soalloc() sets of socket layer state for a socket, called only by 38 * socreate() and sonewconn(). Socket layer private. 39 * 40 * sdealloc() tears down socket layer state for a socket, called only by 41 * sofree() and sonewconn(). Socket layer private. 42 * 43 * pru_attach() associates protocol layer state with an allocated socket; 44 * called only once, may fail, aborting socket allocation. This is called 45 * from socreate() and sonewconn(). Socket layer private. 46 * 47 * pru_detach() disassociates protocol layer state from an attached socket, 48 * and will be called exactly once for sockets in which pru_attach() has 49 * been successfully called. If pru_attach() returned an error, 50 * pru_detach() will not be called. Socket layer private. 51 * 52 * socreate() creates a socket and attaches protocol state. This is a public 53 * interface that may be used by socket layer consumers to create new 54 * sockets. 55 * 56 * sonewconn() creates a socket and attaches protocol state. This is a 57 * public interface that may be used by protocols to create new sockets when 58 * a new connection is received and will be available for accept() on a 59 * listen socket. 60 * 61 * soclose() destroys a socket after possibly waiting for it to disconnect. 62 * This is a public interface that socket consumers should use to close and 63 * release a socket when done with it. 64 * 65 * soabort() destroys a socket without waiting for it to disconnect (used 66 * only for incoming connections that are already partially or fully 67 * connected). This is used internally by the socket layer when clearing 68 * listen socket queues (due to overflow or close on the listen socket), but 69 * is also a public interface protocols may use to abort connections in 70 * their incomplete listen queues should they no longer be required. Sockets 71 * placed in completed connection listen queues should not be aborted. 72 * 73 * sofree() will free a socket and its protocol state if all references on 74 * the socket have been released, and is the public interface to attempt to 75 * free a socket when a reference is removed. This is a socket layer private 76 * interface. 77 * 78 * NOTE: In addition to socreate() and soclose(), which provide a single 79 * socket reference to the consumer to be managed as required, there are two 80 * calls to explicitly manage socket references, soref(), and sorele(). 81 * Currently, these are generally required only when transitioning a socket 82 * from a listen queue to a file descriptor, in order to prevent garbage 83 * collection of the socket at an untimely moment. For a number of reasons, 84 * these interfaces are not preferred, and should be avoided. 85 * 86 * XXXRW: The behavior of sockets after soclose() but before the last 87 * sorele() is poorly defined. We can probably entirely eliminate them with 88 * a little work, since consumers are managing references anyway. 89 */ 90 91 #include <sys/cdefs.h> 92 __FBSDID("$FreeBSD$"); 93 94 #include "opt_inet.h" 95 #include "opt_mac.h" 96 #include "opt_zero.h" 97 #include "opt_compat.h" 98 99 #include <sys/param.h> 100 #include <sys/systm.h> 101 #include <sys/fcntl.h> 102 #include <sys/limits.h> 103 #include <sys/lock.h> 104 #include <sys/mac.h> 105 #include <sys/malloc.h> 106 #include <sys/mbuf.h> 107 #include <sys/mutex.h> 108 #include <sys/domain.h> 109 #include <sys/file.h> /* for struct knote */ 110 #include <sys/kernel.h> 111 #include <sys/event.h> 112 #include <sys/eventhandler.h> 113 #include <sys/poll.h> 114 #include <sys/proc.h> 115 #include <sys/protosw.h> 116 #include <sys/socket.h> 117 #include <sys/socketvar.h> 118 #include <sys/resourcevar.h> 119 #include <sys/signalvar.h> 120 #include <sys/sysctl.h> 121 #include <sys/uio.h> 122 #include <sys/jail.h> 123 124 #include <vm/uma.h> 125 126 #ifdef COMPAT_IA32 127 #include <sys/mount.h> 128 #include <compat/freebsd32/freebsd32.h> 129 130 extern struct sysentvec ia32_freebsd_sysvec; 131 #endif 132 133 static int soreceive_rcvoob(struct socket *so, struct uio *uio, 134 int flags); 135 136 static void filt_sordetach(struct knote *kn); 137 static int filt_soread(struct knote *kn, long hint); 138 static void filt_sowdetach(struct knote *kn); 139 static int filt_sowrite(struct knote *kn, long hint); 140 static int filt_solisten(struct knote *kn, long hint); 141 142 static struct filterops solisten_filtops = 143 { 1, NULL, filt_sordetach, filt_solisten }; 144 static struct filterops soread_filtops = 145 { 1, NULL, filt_sordetach, filt_soread }; 146 static struct filterops sowrite_filtops = 147 { 1, NULL, filt_sowdetach, filt_sowrite }; 148 149 uma_zone_t socket_zone; 150 so_gen_t so_gencnt; /* generation count for sockets */ 151 152 int maxsockets; 153 154 MALLOC_DEFINE(M_SONAME, "soname", "socket name"); 155 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); 156 157 static int somaxconn = SOMAXCONN; 158 static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS); 159 /* XXX: we dont have SYSCTL_USHORT */ 160 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW, 161 0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection " 162 "queue size"); 163 static int numopensockets; 164 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, 165 &numopensockets, 0, "Number of open sockets"); 166 #ifdef ZERO_COPY_SOCKETS 167 /* These aren't static because they're used in other files. */ 168 int so_zero_copy_send = 1; 169 int so_zero_copy_receive = 1; 170 SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0, 171 "Zero copy controls"); 172 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW, 173 &so_zero_copy_receive, 0, "Enable zero copy receive"); 174 SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW, 175 &so_zero_copy_send, 0, "Enable zero copy send"); 176 #endif /* ZERO_COPY_SOCKETS */ 177 178 /* 179 * accept_mtx locks down per-socket fields relating to accept queues. See 180 * socketvar.h for an annotation of the protected fields of struct socket. 181 */ 182 struct mtx accept_mtx; 183 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); 184 185 /* 186 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket 187 * so_gencnt field. 188 */ 189 static struct mtx so_global_mtx; 190 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); 191 192 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); 193 194 static int 195 sysctl_maxsockets(SYSCTL_HANDLER_ARGS) 196 { 197 int error, newmaxsockets; 198 199 newmaxsockets = maxsockets; 200 error = sysctl_handle_int(oidp, &newmaxsockets, sizeof(int), req); 201 if (error == 0 && req->newptr) { 202 if (newmaxsockets > maxsockets) { 203 maxsockets = newmaxsockets; 204 if (maxsockets > ((maxfiles / 4) * 3)) { 205 maxfiles = (maxsockets * 5) / 4; 206 maxfilesperproc = (maxfiles * 9) / 10; 207 } 208 EVENTHANDLER_INVOKE(maxsockets_change); 209 } else 210 error = EINVAL; 211 } 212 return (error); 213 } 214 215 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW, 216 &maxsockets, 0, sysctl_maxsockets, "IU", 217 "Maximum number of sockets avaliable"); 218 219 /* 220 * Initialise maxsockets 221 */ 222 static void init_maxsockets(void *ignored) 223 { 224 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); 225 maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters)); 226 } 227 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); 228 229 /* 230 * Socket operation routines. 231 * These routines are called by the routines in 232 * sys_socket.c or from a system process, and 233 * implement the semantics of socket operations by 234 * switching out to the protocol specific routines. 235 */ 236 237 /* 238 * Get a socket structure from our zone, and initialize it. 239 * Note that it would probably be better to allocate socket 240 * and PCB at the same time, but I'm not convinced that all 241 * the protocols can be easily modified to do this. 242 * 243 * soalloc() returns a socket with a ref count of 0. 244 */ 245 static struct socket * 246 soalloc(int mflags) 247 { 248 struct socket *so; 249 250 so = uma_zalloc(socket_zone, mflags | M_ZERO); 251 if (so == NULL) 252 return (NULL); 253 #ifdef MAC 254 if (mac_init_socket(so, mflags) != 0) { 255 uma_zfree(socket_zone, so); 256 return (NULL); 257 } 258 #endif 259 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); 260 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); 261 TAILQ_INIT(&so->so_aiojobq); 262 mtx_lock(&so_global_mtx); 263 so->so_gencnt = ++so_gencnt; 264 ++numopensockets; 265 mtx_unlock(&so_global_mtx); 266 return (so); 267 } 268 269 static void 270 sodealloc(struct socket *so) 271 { 272 273 KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); 274 KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); 275 276 mtx_lock(&so_global_mtx); 277 so->so_gencnt = ++so_gencnt; 278 mtx_unlock(&so_global_mtx); 279 if (so->so_rcv.sb_hiwat) 280 (void)chgsbsize(so->so_cred->cr_uidinfo, 281 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); 282 if (so->so_snd.sb_hiwat) 283 (void)chgsbsize(so->so_cred->cr_uidinfo, 284 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); 285 #ifdef INET 286 /* remove acccept filter if one is present. */ 287 if (so->so_accf != NULL) 288 do_setopt_accept_filter(so, NULL); 289 #endif 290 #ifdef MAC 291 mac_destroy_socket(so); 292 #endif 293 crfree(so->so_cred); 294 SOCKBUF_LOCK_DESTROY(&so->so_snd); 295 SOCKBUF_LOCK_DESTROY(&so->so_rcv); 296 uma_zfree(socket_zone, so); 297 mtx_lock(&so_global_mtx); 298 --numopensockets; 299 mtx_unlock(&so_global_mtx); 300 } 301 302 /* 303 * socreate returns a socket with a ref count of 1. The socket should be 304 * closed with soclose(). 305 */ 306 int 307 socreate(dom, aso, type, proto, cred, td) 308 int dom; 309 struct socket **aso; 310 int type; 311 int proto; 312 struct ucred *cred; 313 struct thread *td; 314 { 315 struct protosw *prp; 316 struct socket *so; 317 int error; 318 319 if (proto) 320 prp = pffindproto(dom, proto, type); 321 else 322 prp = pffindtype(dom, type); 323 324 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL || 325 prp->pr_usrreqs->pru_attach == pru_attach_notsupp) 326 return (EPROTONOSUPPORT); 327 328 if (jailed(cred) && jail_socket_unixiproute_only && 329 prp->pr_domain->dom_family != PF_LOCAL && 330 prp->pr_domain->dom_family != PF_INET && 331 prp->pr_domain->dom_family != PF_ROUTE) { 332 return (EPROTONOSUPPORT); 333 } 334 335 if (prp->pr_type != type) 336 return (EPROTOTYPE); 337 so = soalloc(M_WAITOK); 338 if (so == NULL) 339 return (ENOBUFS); 340 341 TAILQ_INIT(&so->so_incomp); 342 TAILQ_INIT(&so->so_comp); 343 so->so_type = type; 344 so->so_cred = crhold(cred); 345 so->so_proto = prp; 346 #ifdef MAC 347 mac_create_socket(cred, so); 348 #endif 349 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv), 350 NULL, NULL, NULL); 351 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd), 352 NULL, NULL, NULL); 353 so->so_count = 1; 354 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); 355 if (error) { 356 sodealloc(so); 357 return (error); 358 } 359 *aso = so; 360 return (0); 361 } 362 363 #ifdef REGRESSION 364 static int regression_sonewconn_earlytest = 1; 365 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, 366 ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); 367 #endif 368 369 /* 370 * When an attempt at a new connection is noted on a socket 371 * which accepts connections, sonewconn is called. If the 372 * connection is possible (subject to space constraints, etc.) 373 * then we allocate a new structure, propoerly linked into the 374 * data structure of the original socket, and return this. 375 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. 376 * 377 * note: the ref count on the socket is 0 on return 378 */ 379 struct socket * 380 sonewconn(head, connstatus) 381 register struct socket *head; 382 int connstatus; 383 { 384 register struct socket *so; 385 int over; 386 387 ACCEPT_LOCK(); 388 over = (head->so_qlen > 3 * head->so_qlimit / 2); 389 ACCEPT_UNLOCK(); 390 #ifdef REGRESSION 391 if (regression_sonewconn_earlytest && over) 392 #else 393 if (over) 394 #endif 395 return (NULL); 396 so = soalloc(M_NOWAIT); 397 if (so == NULL) 398 return (NULL); 399 if ((head->so_options & SO_ACCEPTFILTER) != 0) 400 connstatus = 0; 401 so->so_head = head; 402 so->so_type = head->so_type; 403 so->so_options = head->so_options &~ SO_ACCEPTCONN; 404 so->so_linger = head->so_linger; 405 so->so_state = head->so_state | SS_NOFDREF; 406 so->so_proto = head->so_proto; 407 so->so_timeo = head->so_timeo; 408 so->so_cred = crhold(head->so_cred); 409 #ifdef MAC 410 SOCK_LOCK(head); 411 mac_create_socket_from_socket(head, so); 412 SOCK_UNLOCK(head); 413 #endif 414 knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv), 415 NULL, NULL, NULL); 416 knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd), 417 NULL, NULL, NULL); 418 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) || 419 (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { 420 sodealloc(so); 421 return (NULL); 422 } 423 so->so_state |= connstatus; 424 ACCEPT_LOCK(); 425 if (connstatus) { 426 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); 427 so->so_qstate |= SQ_COMP; 428 head->so_qlen++; 429 } else { 430 /* 431 * Keep removing sockets from the head until there's room for 432 * us to insert on the tail. In pre-locking revisions, this 433 * was a simple if(), but as we could be racing with other 434 * threads and soabort() requires dropping locks, we must 435 * loop waiting for the condition to be true. 436 */ 437 while (head->so_incqlen > head->so_qlimit) { 438 struct socket *sp; 439 sp = TAILQ_FIRST(&head->so_incomp); 440 TAILQ_REMOVE(&head->so_incomp, sp, so_list); 441 head->so_incqlen--; 442 sp->so_qstate &= ~SQ_INCOMP; 443 sp->so_head = NULL; 444 ACCEPT_UNLOCK(); 445 soabort(sp); 446 ACCEPT_LOCK(); 447 } 448 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); 449 so->so_qstate |= SQ_INCOMP; 450 head->so_incqlen++; 451 } 452 ACCEPT_UNLOCK(); 453 if (connstatus) { 454 sorwakeup(head); 455 wakeup_one(&head->so_timeo); 456 } 457 return (so); 458 } 459 460 int 461 sobind(so, nam, td) 462 struct socket *so; 463 struct sockaddr *nam; 464 struct thread *td; 465 { 466 467 return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td)); 468 } 469 470 /* 471 * solisten() transitions a socket from a non-listening state to a listening 472 * state, but can also be used to update the listen queue depth on an 473 * existing listen socket. The protocol will call back into the sockets 474 * layer using solisten_proto_check() and solisten_proto() to check and set 475 * socket-layer listen state. Call backs are used so that the protocol can 476 * acquire both protocol and socket layer locks in whatever order is required 477 * by the protocol. 478 * 479 * Protocol implementors are advised to hold the socket lock across the 480 * socket-layer test and set to avoid races at the socket layer. 481 */ 482 int 483 solisten(so, backlog, td) 484 struct socket *so; 485 int backlog; 486 struct thread *td; 487 { 488 489 return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td)); 490 } 491 492 int 493 solisten_proto_check(so) 494 struct socket *so; 495 { 496 497 SOCK_LOCK_ASSERT(so); 498 499 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | 500 SS_ISDISCONNECTING)) 501 return (EINVAL); 502 return (0); 503 } 504 505 void 506 solisten_proto(so, backlog) 507 struct socket *so; 508 int backlog; 509 { 510 511 SOCK_LOCK_ASSERT(so); 512 513 if (backlog < 0 || backlog > somaxconn) 514 backlog = somaxconn; 515 so->so_qlimit = backlog; 516 so->so_options |= SO_ACCEPTCONN; 517 } 518 519 /* 520 * Attempt to free a socket. This should really be sotryfree(). 521 * 522 * sofree() will succeed if: 523 * 524 * - There are no outstanding file descriptor references or related consumers 525 * (so_count == 0). 526 * 527 * - The socket has been closed by user space, if ever open (SS_NOFDREF). 528 * 529 * - The protocol does not have an outstanding strong reference on the socket 530 * (SS_PROTOREF). 531 * 532 * - The socket is not in a completed connection queue, so a process has been 533 * notified that it is present. If it is removed, the user process may 534 * block in accept() despite select() saying the socket was ready. 535 * 536 * Otherwise, it will quietly abort so that a future call to sofree(), when 537 * conditions are right, can succeed. 538 */ 539 void 540 sofree(so) 541 struct socket *so; 542 { 543 struct socket *head; 544 545 ACCEPT_LOCK_ASSERT(); 546 SOCK_LOCK_ASSERT(so); 547 548 if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || 549 (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { 550 SOCK_UNLOCK(so); 551 ACCEPT_UNLOCK(); 552 return; 553 } 554 555 head = so->so_head; 556 if (head != NULL) { 557 KASSERT((so->so_qstate & SQ_COMP) != 0 || 558 (so->so_qstate & SQ_INCOMP) != 0, 559 ("sofree: so_head != NULL, but neither SQ_COMP nor " 560 "SQ_INCOMP")); 561 KASSERT((so->so_qstate & SQ_COMP) == 0 || 562 (so->so_qstate & SQ_INCOMP) == 0, 563 ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); 564 TAILQ_REMOVE(&head->so_incomp, so, so_list); 565 head->so_incqlen--; 566 so->so_qstate &= ~SQ_INCOMP; 567 so->so_head = NULL; 568 } 569 KASSERT((so->so_qstate & SQ_COMP) == 0 && 570 (so->so_qstate & SQ_INCOMP) == 0, 571 ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", 572 so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); 573 SOCK_UNLOCK(so); 574 ACCEPT_UNLOCK(); 575 576 SOCKBUF_LOCK(&so->so_snd); 577 so->so_snd.sb_flags |= SB_NOINTR; 578 (void)sblock(&so->so_snd, M_WAITOK); 579 /* 580 * socantsendmore_locked() drops the socket buffer mutex so that it 581 * can safely perform wakeups. Re-acquire the mutex before 582 * continuing. 583 */ 584 socantsendmore_locked(so); 585 SOCKBUF_LOCK(&so->so_snd); 586 sbunlock(&so->so_snd); 587 sbrelease_locked(&so->so_snd, so); 588 SOCKBUF_UNLOCK(&so->so_snd); 589 sorflush(so); 590 knlist_destroy(&so->so_rcv.sb_sel.si_note); 591 knlist_destroy(&so->so_snd.sb_sel.si_note); 592 sodealloc(so); 593 } 594 595 /* 596 * Close a socket on last file table reference removal. 597 * Initiate disconnect if connected. 598 * Free socket when disconnect complete. 599 * 600 * This function will sorele() the socket. Note that soclose() may be 601 * called prior to the ref count reaching zero. The actual socket 602 * structure will not be freed until the ref count reaches zero. 603 */ 604 int 605 soclose(so) 606 struct socket *so; 607 { 608 int error = 0; 609 610 KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); 611 612 funsetown(&so->so_sigio); 613 if (so->so_options & SO_ACCEPTCONN) { 614 struct socket *sp; 615 ACCEPT_LOCK(); 616 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { 617 TAILQ_REMOVE(&so->so_incomp, sp, so_list); 618 so->so_incqlen--; 619 sp->so_qstate &= ~SQ_INCOMP; 620 sp->so_head = NULL; 621 ACCEPT_UNLOCK(); 622 soabort(sp); 623 ACCEPT_LOCK(); 624 } 625 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { 626 TAILQ_REMOVE(&so->so_comp, sp, so_list); 627 so->so_qlen--; 628 sp->so_qstate &= ~SQ_COMP; 629 sp->so_head = NULL; 630 ACCEPT_UNLOCK(); 631 soabort(sp); 632 ACCEPT_LOCK(); 633 } 634 ACCEPT_UNLOCK(); 635 } 636 if (so->so_state & SS_ISCONNECTED) { 637 if ((so->so_state & SS_ISDISCONNECTING) == 0) { 638 error = sodisconnect(so); 639 if (error) 640 goto drop; 641 } 642 if (so->so_options & SO_LINGER) { 643 if ((so->so_state & SS_ISDISCONNECTING) && 644 (so->so_state & SS_NBIO)) 645 goto drop; 646 while (so->so_state & SS_ISCONNECTED) { 647 error = tsleep(&so->so_timeo, 648 PSOCK | PCATCH, "soclos", so->so_linger * hz); 649 if (error) 650 break; 651 } 652 } 653 } 654 655 drop: 656 if (*so->so_proto->pr_usrreqs->pru_detach != NULL) 657 (*so->so_proto->pr_usrreqs->pru_detach)(so); 658 ACCEPT_LOCK(); 659 SOCK_LOCK(so); 660 KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); 661 so->so_state |= SS_NOFDREF; 662 sorele(so); 663 return (error); 664 } 665 666 /* 667 * soabort() allows the socket code or protocol code to detach a socket that 668 * has been in an incomplete or completed listen queue, but has not yet been 669 * accepted. 670 * 671 * This interface is tricky, because it is called on an unreferenced socket, 672 * and must be called only by a thread that has actually removed the socket 673 * from the listen queue it was on, or races with other threads are risked. 674 * 675 * This interface will call into the protocol code, so must not be called 676 * with any socket locks held. Protocols do call it while holding their own 677 * recursible protocol mutexes, but this is something that should be subject 678 * to review in the future. 679 * 680 * XXXRW: Why do we maintain a distinction between pru_abort() and 681 * pru_detach()? 682 */ 683 void 684 soabort(so) 685 struct socket *so; 686 { 687 688 /* 689 * In as much as is possible, assert that no references to this 690 * socket are held. This is not quite the same as asserting that the 691 * current thread is responsible for arranging for no references, but 692 * is as close as we can get for now. 693 */ 694 KASSERT(so->so_count == 0, ("soabort: so_count")); 695 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); 696 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); 697 KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); 698 KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); 699 700 if (*so->so_proto->pr_usrreqs->pru_abort != NULL) 701 (*so->so_proto->pr_usrreqs->pru_abort)(so); 702 ACCEPT_LOCK(); 703 SOCK_LOCK(so); 704 sofree(so); 705 } 706 707 int 708 soaccept(so, nam) 709 struct socket *so; 710 struct sockaddr **nam; 711 { 712 int error; 713 714 SOCK_LOCK(so); 715 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); 716 so->so_state &= ~SS_NOFDREF; 717 SOCK_UNLOCK(so); 718 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); 719 return (error); 720 } 721 722 int 723 soconnect(so, nam, td) 724 struct socket *so; 725 struct sockaddr *nam; 726 struct thread *td; 727 { 728 int error; 729 730 if (so->so_options & SO_ACCEPTCONN) 731 return (EOPNOTSUPP); 732 /* 733 * If protocol is connection-based, can only connect once. 734 * Otherwise, if connected, try to disconnect first. 735 * This allows user to disconnect by connecting to, e.g., 736 * a null address. 737 */ 738 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && 739 ((so->so_proto->pr_flags & PR_CONNREQUIRED) || 740 (error = sodisconnect(so)))) { 741 error = EISCONN; 742 } else { 743 /* 744 * Prevent accumulated error from previous connection 745 * from biting us. 746 */ 747 so->so_error = 0; 748 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); 749 } 750 751 return (error); 752 } 753 754 int 755 soconnect2(so1, so2) 756 struct socket *so1; 757 struct socket *so2; 758 { 759 760 return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2)); 761 } 762 763 int 764 sodisconnect(so) 765 struct socket *so; 766 { 767 int error; 768 769 if ((so->so_state & SS_ISCONNECTED) == 0) 770 return (ENOTCONN); 771 if (so->so_state & SS_ISDISCONNECTING) 772 return (EALREADY); 773 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); 774 return (error); 775 } 776 777 #ifdef ZERO_COPY_SOCKETS 778 struct so_zerocopy_stats{ 779 int size_ok; 780 int align_ok; 781 int found_ifp; 782 }; 783 struct so_zerocopy_stats so_zerocp_stats = {0,0,0}; 784 #include <netinet/in.h> 785 #include <net/route.h> 786 #include <netinet/in_pcb.h> 787 #include <vm/vm.h> 788 #include <vm/vm_page.h> 789 #include <vm/vm_object.h> 790 #endif /*ZERO_COPY_SOCKETS*/ 791 792 /* 793 * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or 794 * all of the data referenced by the uio. If desired, it uses zero-copy. 795 * *space will be updated to reflect data copied in. 796 * 797 * NB: If atomic I/O is requested, the caller must already have checked that 798 * space can hold resid bytes. 799 * 800 * NB: In the event of an error, the caller may need to free the partial 801 * chain pointed to by *mpp. The contents of both *uio and *space may be 802 * modified even in the case of an error. 803 */ 804 static int 805 sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space, 806 int flags) 807 { 808 struct mbuf *m, **mp, *top; 809 long len, resid; 810 int error; 811 #ifdef ZERO_COPY_SOCKETS 812 int cow_send; 813 #endif 814 815 *retmp = top = NULL; 816 mp = ⊤ 817 len = 0; 818 resid = uio->uio_resid; 819 error = 0; 820 do { 821 #ifdef ZERO_COPY_SOCKETS 822 cow_send = 0; 823 #endif /* ZERO_COPY_SOCKETS */ 824 if (resid >= MINCLSIZE) { 825 #ifdef ZERO_COPY_SOCKETS 826 if (top == NULL) { 827 MGETHDR(m, M_TRYWAIT, MT_DATA); 828 if (m == NULL) { 829 error = ENOBUFS; 830 goto out; 831 } 832 m->m_pkthdr.len = 0; 833 m->m_pkthdr.rcvif = NULL; 834 } else { 835 MGET(m, M_TRYWAIT, MT_DATA); 836 if (m == NULL) { 837 error = ENOBUFS; 838 goto out; 839 } 840 } 841 if (so_zero_copy_send && 842 resid>=PAGE_SIZE && 843 *space>=PAGE_SIZE && 844 uio->uio_iov->iov_len>=PAGE_SIZE) { 845 so_zerocp_stats.size_ok++; 846 so_zerocp_stats.align_ok++; 847 cow_send = socow_setup(m, uio); 848 len = cow_send; 849 } 850 if (!cow_send) { 851 MCLGET(m, M_TRYWAIT); 852 if ((m->m_flags & M_EXT) == 0) { 853 m_free(m); 854 m = NULL; 855 } else { 856 len = min(min(MCLBYTES, resid), 857 *space); 858 } 859 } 860 #else /* ZERO_COPY_SOCKETS */ 861 if (top == NULL) { 862 m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR); 863 m->m_pkthdr.len = 0; 864 m->m_pkthdr.rcvif = NULL; 865 } else 866 m = m_getcl(M_TRYWAIT, MT_DATA, 0); 867 len = min(min(MCLBYTES, resid), *space); 868 #endif /* ZERO_COPY_SOCKETS */ 869 } else { 870 if (top == NULL) { 871 m = m_gethdr(M_TRYWAIT, MT_DATA); 872 m->m_pkthdr.len = 0; 873 m->m_pkthdr.rcvif = NULL; 874 875 len = min(min(MHLEN, resid), *space); 876 /* 877 * For datagram protocols, leave room 878 * for protocol headers in first mbuf. 879 */ 880 if (atomic && m && len < MHLEN) 881 MH_ALIGN(m, len); 882 } else { 883 m = m_get(M_TRYWAIT, MT_DATA); 884 len = min(min(MLEN, resid), *space); 885 } 886 } 887 if (m == NULL) { 888 error = ENOBUFS; 889 goto out; 890 } 891 892 *space -= len; 893 #ifdef ZERO_COPY_SOCKETS 894 if (cow_send) 895 error = 0; 896 else 897 #endif /* ZERO_COPY_SOCKETS */ 898 error = uiomove(mtod(m, void *), (int)len, uio); 899 resid = uio->uio_resid; 900 m->m_len = len; 901 *mp = m; 902 top->m_pkthdr.len += len; 903 if (error) 904 goto out; 905 mp = &m->m_next; 906 if (resid <= 0) { 907 if (flags & MSG_EOR) 908 top->m_flags |= M_EOR; 909 break; 910 } 911 } while (*space > 0 && atomic); 912 out: 913 *retmp = top; 914 return (error); 915 } 916 917 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) 918 919 int 920 sosend_dgram(so, addr, uio, top, control, flags, td) 921 struct socket *so; 922 struct sockaddr *addr; 923 struct uio *uio; 924 struct mbuf *top; 925 struct mbuf *control; 926 int flags; 927 struct thread *td; 928 { 929 long space, resid; 930 int clen = 0, error, dontroute; 931 int atomic = sosendallatonce(so) || top; 932 933 KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM")); 934 KASSERT(so->so_proto->pr_flags & PR_ATOMIC, 935 ("sodgram_send: !PR_ATOMIC")); 936 937 if (uio != NULL) 938 resid = uio->uio_resid; 939 else 940 resid = top->m_pkthdr.len; 941 /* 942 * In theory resid should be unsigned. 943 * However, space must be signed, as it might be less than 0 944 * if we over-committed, and we must use a signed comparison 945 * of space and resid. On the other hand, a negative resid 946 * causes us to loop sending 0-length segments to the protocol. 947 * 948 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 949 * type sockets since that's an error. 950 */ 951 if (resid < 0) { 952 error = EINVAL; 953 goto out; 954 } 955 956 dontroute = 957 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; 958 if (td != NULL) 959 td->td_proc->p_stats->p_ru.ru_msgsnd++; 960 if (control != NULL) 961 clen = control->m_len; 962 963 SOCKBUF_LOCK(&so->so_snd); 964 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 965 SOCKBUF_UNLOCK(&so->so_snd); 966 error = EPIPE; 967 goto out; 968 } 969 if (so->so_error) { 970 error = so->so_error; 971 so->so_error = 0; 972 SOCKBUF_UNLOCK(&so->so_snd); 973 goto out; 974 } 975 if ((so->so_state & SS_ISCONNECTED) == 0) { 976 /* 977 * `sendto' and `sendmsg' is allowed on a connection- 978 * based socket if it supports implied connect. 979 * Return ENOTCONN if not connected and no address is 980 * supplied. 981 */ 982 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 983 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 984 if ((so->so_state & SS_ISCONFIRMING) == 0 && 985 !(resid == 0 && clen != 0)) { 986 SOCKBUF_UNLOCK(&so->so_snd); 987 error = ENOTCONN; 988 goto out; 989 } 990 } else if (addr == NULL) { 991 if (so->so_proto->pr_flags & PR_CONNREQUIRED) 992 error = ENOTCONN; 993 else 994 error = EDESTADDRREQ; 995 SOCKBUF_UNLOCK(&so->so_snd); 996 goto out; 997 } 998 } 999 1000 /* 1001 * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a 1002 * problem and need fixing. 1003 */ 1004 space = sbspace(&so->so_snd); 1005 if (flags & MSG_OOB) 1006 space += 1024; 1007 space -= clen; 1008 if (resid > space) { 1009 error = EMSGSIZE; 1010 goto out; 1011 } 1012 SOCKBUF_UNLOCK(&so->so_snd); 1013 if (uio == NULL) { 1014 resid = 0; 1015 if (flags & MSG_EOR) 1016 top->m_flags |= M_EOR; 1017 } else { 1018 error = sosend_copyin(uio, &top, atomic, &space, flags); 1019 if (error) 1020 goto out; 1021 resid = uio->uio_resid; 1022 } 1023 KASSERT(resid == 0, ("sosend_dgram: resid != 0")); 1024 /* 1025 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock 1026 * than with. 1027 */ 1028 if (dontroute) { 1029 SOCK_LOCK(so); 1030 so->so_options |= SO_DONTROUTE; 1031 SOCK_UNLOCK(so); 1032 } 1033 /* 1034 * XXX all the SBS_CANTSENDMORE checks previously 1035 * done could be out of date. We could have recieved 1036 * a reset packet in an interrupt or maybe we slept 1037 * while doing page faults in uiomove() etc. We could 1038 * probably recheck again inside the locking protection 1039 * here, but there are probably other places that this 1040 * also happens. We must rethink this. 1041 */ 1042 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1043 (flags & MSG_OOB) ? PRUS_OOB : 1044 /* 1045 * If the user set MSG_EOF, the protocol 1046 * understands this flag and nothing left to 1047 * send then use PRU_SEND_EOF instead of PRU_SEND. 1048 */ 1049 ((flags & MSG_EOF) && 1050 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1051 (resid <= 0)) ? 1052 PRUS_EOF : 1053 /* If there is more to send set PRUS_MORETOCOME */ 1054 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1055 top, addr, control, td); 1056 if (dontroute) { 1057 SOCK_LOCK(so); 1058 so->so_options &= ~SO_DONTROUTE; 1059 SOCK_UNLOCK(so); 1060 } 1061 clen = 0; 1062 control = NULL; 1063 top = NULL; 1064 out: 1065 if (top != NULL) 1066 m_freem(top); 1067 if (control != NULL) 1068 m_freem(control); 1069 return (error); 1070 } 1071 1072 /* 1073 * Send on a socket. 1074 * If send must go all at once and message is larger than 1075 * send buffering, then hard error. 1076 * Lock against other senders. 1077 * If must go all at once and not enough room now, then 1078 * inform user that this would block and do nothing. 1079 * Otherwise, if nonblocking, send as much as possible. 1080 * The data to be sent is described by "uio" if nonzero, 1081 * otherwise by the mbuf chain "top" (which must be null 1082 * if uio is not). Data provided in mbuf chain must be small 1083 * enough to send all at once. 1084 * 1085 * Returns nonzero on error, timeout or signal; callers 1086 * must check for short counts if EINTR/ERESTART are returned. 1087 * Data and control buffers are freed on return. 1088 */ 1089 #define snderr(errno) { error = (errno); goto release; } 1090 int 1091 sosend(so, addr, uio, top, control, flags, td) 1092 struct socket *so; 1093 struct sockaddr *addr; 1094 struct uio *uio; 1095 struct mbuf *top; 1096 struct mbuf *control; 1097 int flags; 1098 struct thread *td; 1099 { 1100 long space, resid; 1101 int clen = 0, error, dontroute; 1102 int atomic = sosendallatonce(so) || top; 1103 1104 if (uio != NULL) 1105 resid = uio->uio_resid; 1106 else 1107 resid = top->m_pkthdr.len; 1108 /* 1109 * In theory resid should be unsigned. 1110 * However, space must be signed, as it might be less than 0 1111 * if we over-committed, and we must use a signed comparison 1112 * of space and resid. On the other hand, a negative resid 1113 * causes us to loop sending 0-length segments to the protocol. 1114 * 1115 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM 1116 * type sockets since that's an error. 1117 */ 1118 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { 1119 error = EINVAL; 1120 goto out; 1121 } 1122 1123 dontroute = 1124 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && 1125 (so->so_proto->pr_flags & PR_ATOMIC); 1126 if (td != NULL) 1127 td->td_proc->p_stats->p_ru.ru_msgsnd++; 1128 if (control != NULL) 1129 clen = control->m_len; 1130 1131 SOCKBUF_LOCK(&so->so_snd); 1132 restart: 1133 SOCKBUF_LOCK_ASSERT(&so->so_snd); 1134 error = sblock(&so->so_snd, SBLOCKWAIT(flags)); 1135 if (error) 1136 goto out_locked; 1137 do { 1138 SOCKBUF_LOCK_ASSERT(&so->so_snd); 1139 if (so->so_snd.sb_state & SBS_CANTSENDMORE) 1140 snderr(EPIPE); 1141 if (so->so_error) { 1142 error = so->so_error; 1143 so->so_error = 0; 1144 goto release; 1145 } 1146 if ((so->so_state & SS_ISCONNECTED) == 0) { 1147 /* 1148 * `sendto' and `sendmsg' is allowed on a connection- 1149 * based socket if it supports implied connect. 1150 * Return ENOTCONN if not connected and no address is 1151 * supplied. 1152 */ 1153 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && 1154 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { 1155 if ((so->so_state & SS_ISCONFIRMING) == 0 && 1156 !(resid == 0 && clen != 0)) 1157 snderr(ENOTCONN); 1158 } else if (addr == NULL) 1159 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 1160 ENOTCONN : EDESTADDRREQ); 1161 } 1162 space = sbspace(&so->so_snd); 1163 if (flags & MSG_OOB) 1164 space += 1024; 1165 if ((atomic && resid > so->so_snd.sb_hiwat) || 1166 clen > so->so_snd.sb_hiwat) 1167 snderr(EMSGSIZE); 1168 if (space < resid + clen && 1169 (atomic || space < so->so_snd.sb_lowat || space < clen)) { 1170 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) 1171 snderr(EWOULDBLOCK); 1172 sbunlock(&so->so_snd); 1173 error = sbwait(&so->so_snd); 1174 if (error) 1175 goto out_locked; 1176 goto restart; 1177 } 1178 SOCKBUF_UNLOCK(&so->so_snd); 1179 space -= clen; 1180 do { 1181 if (uio == NULL) { 1182 resid = 0; 1183 if (flags & MSG_EOR) 1184 top->m_flags |= M_EOR; 1185 } else { 1186 error = sosend_copyin(uio, &top, atomic, 1187 &space, flags); 1188 if (error != 0) { 1189 SOCKBUF_LOCK(&so->so_snd); 1190 goto release; 1191 } 1192 resid = uio->uio_resid; 1193 } 1194 if (dontroute) { 1195 SOCK_LOCK(so); 1196 so->so_options |= SO_DONTROUTE; 1197 SOCK_UNLOCK(so); 1198 } 1199 /* 1200 * XXX all the SBS_CANTSENDMORE checks previously 1201 * done could be out of date. We could have recieved 1202 * a reset packet in an interrupt or maybe we slept 1203 * while doing page faults in uiomove() etc. We could 1204 * probably recheck again inside the locking protection 1205 * here, but there are probably other places that this 1206 * also happens. We must rethink this. 1207 */ 1208 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 1209 (flags & MSG_OOB) ? PRUS_OOB : 1210 /* 1211 * If the user set MSG_EOF, the protocol 1212 * understands this flag and nothing left to 1213 * send then use PRU_SEND_EOF instead of PRU_SEND. 1214 */ 1215 ((flags & MSG_EOF) && 1216 (so->so_proto->pr_flags & PR_IMPLOPCL) && 1217 (resid <= 0)) ? 1218 PRUS_EOF : 1219 /* If there is more to send set PRUS_MORETOCOME */ 1220 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, 1221 top, addr, control, td); 1222 if (dontroute) { 1223 SOCK_LOCK(so); 1224 so->so_options &= ~SO_DONTROUTE; 1225 SOCK_UNLOCK(so); 1226 } 1227 clen = 0; 1228 control = NULL; 1229 top = NULL; 1230 if (error) { 1231 SOCKBUF_LOCK(&so->so_snd); 1232 goto release; 1233 } 1234 } while (resid && space > 0); 1235 SOCKBUF_LOCK(&so->so_snd); 1236 } while (resid); 1237 1238 release: 1239 SOCKBUF_LOCK_ASSERT(&so->so_snd); 1240 sbunlock(&so->so_snd); 1241 out_locked: 1242 SOCKBUF_LOCK_ASSERT(&so->so_snd); 1243 SOCKBUF_UNLOCK(&so->so_snd); 1244 out: 1245 if (top != NULL) 1246 m_freem(top); 1247 if (control != NULL) 1248 m_freem(control); 1249 return (error); 1250 } 1251 #undef snderr 1252 1253 /* 1254 * The part of soreceive() that implements reading non-inline out-of-band 1255 * data from a socket. For more complete comments, see soreceive(), from 1256 * which this code originated. 1257 * 1258 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is 1259 * unable to return an mbuf chain to the caller. 1260 */ 1261 static int 1262 soreceive_rcvoob(so, uio, flags) 1263 struct socket *so; 1264 struct uio *uio; 1265 int flags; 1266 { 1267 struct protosw *pr = so->so_proto; 1268 struct mbuf *m; 1269 int error; 1270 1271 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); 1272 1273 m = m_get(M_TRYWAIT, MT_DATA); 1274 if (m == NULL) 1275 return (ENOBUFS); 1276 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); 1277 if (error) 1278 goto bad; 1279 do { 1280 #ifdef ZERO_COPY_SOCKETS 1281 if (so_zero_copy_receive) { 1282 int disposable; 1283 1284 if ((m->m_flags & M_EXT) 1285 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1286 disposable = 1; 1287 else 1288 disposable = 0; 1289 1290 error = uiomoveco(mtod(m, void *), 1291 min(uio->uio_resid, m->m_len), 1292 uio, disposable); 1293 } else 1294 #endif /* ZERO_COPY_SOCKETS */ 1295 error = uiomove(mtod(m, void *), 1296 (int) min(uio->uio_resid, m->m_len), uio); 1297 m = m_free(m); 1298 } while (uio->uio_resid && error == 0 && m); 1299 bad: 1300 if (m != NULL) 1301 m_freem(m); 1302 return (error); 1303 } 1304 1305 /* 1306 * Following replacement or removal of the first mbuf on the first mbuf chain 1307 * of a socket buffer, push necessary state changes back into the socket 1308 * buffer so that other consumers see the values consistently. 'nextrecord' 1309 * is the callers locally stored value of the original value of 1310 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. 1311 * NOTE: 'nextrecord' may be NULL. 1312 */ 1313 static __inline void 1314 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) 1315 { 1316 1317 SOCKBUF_LOCK_ASSERT(sb); 1318 /* 1319 * First, update for the new value of nextrecord. If necessary, make 1320 * it the first record. 1321 */ 1322 if (sb->sb_mb != NULL) 1323 sb->sb_mb->m_nextpkt = nextrecord; 1324 else 1325 sb->sb_mb = nextrecord; 1326 1327 /* 1328 * Now update any dependent socket buffer fields to reflect the new 1329 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the 1330 * addition of a second clause that takes care of the case where 1331 * sb_mb has been updated, but remains the last record. 1332 */ 1333 if (sb->sb_mb == NULL) { 1334 sb->sb_mbtail = NULL; 1335 sb->sb_lastrecord = NULL; 1336 } else if (sb->sb_mb->m_nextpkt == NULL) 1337 sb->sb_lastrecord = sb->sb_mb; 1338 } 1339 1340 1341 /* 1342 * Implement receive operations on a socket. 1343 * We depend on the way that records are added to the sockbuf 1344 * by sbappend*. In particular, each record (mbufs linked through m_next) 1345 * must begin with an address if the protocol so specifies, 1346 * followed by an optional mbuf or mbufs containing ancillary data, 1347 * and then zero or more mbufs of data. 1348 * In order to avoid blocking network interrupts for the entire time here, 1349 * we splx() while doing the actual copy to user space. 1350 * Although the sockbuf is locked, new data may still be appended, 1351 * and thus we must maintain consistency of the sockbuf during that time. 1352 * 1353 * The caller may receive the data as a single mbuf chain by supplying 1354 * an mbuf **mp0 for use in returning the chain. The uio is then used 1355 * only for the count in uio_resid. 1356 */ 1357 int 1358 soreceive(so, psa, uio, mp0, controlp, flagsp) 1359 struct socket *so; 1360 struct sockaddr **psa; 1361 struct uio *uio; 1362 struct mbuf **mp0; 1363 struct mbuf **controlp; 1364 int *flagsp; 1365 { 1366 struct mbuf *m, **mp; 1367 int flags, len, error, offset; 1368 struct protosw *pr = so->so_proto; 1369 struct mbuf *nextrecord; 1370 int moff, type = 0; 1371 int orig_resid = uio->uio_resid; 1372 1373 mp = mp0; 1374 if (psa != NULL) 1375 *psa = NULL; 1376 if (controlp != NULL) 1377 *controlp = NULL; 1378 if (flagsp != NULL) 1379 flags = *flagsp &~ MSG_EOR; 1380 else 1381 flags = 0; 1382 if (flags & MSG_OOB) 1383 return (soreceive_rcvoob(so, uio, flags)); 1384 if (mp != NULL) 1385 *mp = NULL; 1386 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) 1387 && uio->uio_resid) 1388 (*pr->pr_usrreqs->pru_rcvd)(so, 0); 1389 1390 SOCKBUF_LOCK(&so->so_rcv); 1391 restart: 1392 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1393 error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); 1394 if (error) 1395 goto out; 1396 1397 m = so->so_rcv.sb_mb; 1398 /* 1399 * If we have less data than requested, block awaiting more 1400 * (subject to any timeout) if: 1401 * 1. the current count is less than the low water mark, or 1402 * 2. MSG_WAITALL is set, and it is possible to do the entire 1403 * receive operation at once if we block (resid <= hiwat). 1404 * 3. MSG_DONTWAIT is not set 1405 * If MSG_WAITALL is set but resid is larger than the receive buffer, 1406 * we have to do the receive in sections, and thus risk returning 1407 * a short count if a timeout or signal occurs after we start. 1408 */ 1409 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && 1410 so->so_rcv.sb_cc < uio->uio_resid) && 1411 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || 1412 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && 1413 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { 1414 KASSERT(m != NULL || !so->so_rcv.sb_cc, 1415 ("receive: m == %p so->so_rcv.sb_cc == %u", 1416 m, so->so_rcv.sb_cc)); 1417 if (so->so_error) { 1418 if (m != NULL) 1419 goto dontblock; 1420 error = so->so_error; 1421 if ((flags & MSG_PEEK) == 0) 1422 so->so_error = 0; 1423 goto release; 1424 } 1425 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1426 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1427 if (m) 1428 goto dontblock; 1429 else 1430 goto release; 1431 } 1432 for (; m != NULL; m = m->m_next) 1433 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { 1434 m = so->so_rcv.sb_mb; 1435 goto dontblock; 1436 } 1437 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && 1438 (so->so_proto->pr_flags & PR_CONNREQUIRED)) { 1439 error = ENOTCONN; 1440 goto release; 1441 } 1442 if (uio->uio_resid == 0) 1443 goto release; 1444 if ((so->so_state & SS_NBIO) || 1445 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 1446 error = EWOULDBLOCK; 1447 goto release; 1448 } 1449 SBLASTRECORDCHK(&so->so_rcv); 1450 SBLASTMBUFCHK(&so->so_rcv); 1451 sbunlock(&so->so_rcv); 1452 error = sbwait(&so->so_rcv); 1453 if (error) 1454 goto out; 1455 goto restart; 1456 } 1457 dontblock: 1458 /* 1459 * From this point onward, we maintain 'nextrecord' as a cache of the 1460 * pointer to the next record in the socket buffer. We must keep the 1461 * various socket buffer pointers and local stack versions of the 1462 * pointers in sync, pushing out modifications before dropping the 1463 * socket buffer mutex, and re-reading them when picking it up. 1464 * 1465 * Otherwise, we will race with the network stack appending new data 1466 * or records onto the socket buffer by using inconsistent/stale 1467 * versions of the field, possibly resulting in socket buffer 1468 * corruption. 1469 * 1470 * By holding the high-level sblock(), we prevent simultaneous 1471 * readers from pulling off the front of the socket buffer. 1472 */ 1473 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1474 if (uio->uio_td) 1475 uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; 1476 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); 1477 SBLASTRECORDCHK(&so->so_rcv); 1478 SBLASTMBUFCHK(&so->so_rcv); 1479 nextrecord = m->m_nextpkt; 1480 if (pr->pr_flags & PR_ADDR) { 1481 KASSERT(m->m_type == MT_SONAME, 1482 ("m->m_type == %d", m->m_type)); 1483 orig_resid = 0; 1484 if (psa != NULL) 1485 *psa = sodupsockaddr(mtod(m, struct sockaddr *), 1486 M_NOWAIT); 1487 if (flags & MSG_PEEK) { 1488 m = m->m_next; 1489 } else { 1490 sbfree(&so->so_rcv, m); 1491 so->so_rcv.sb_mb = m_free(m); 1492 m = so->so_rcv.sb_mb; 1493 sockbuf_pushsync(&so->so_rcv, nextrecord); 1494 } 1495 } 1496 1497 /* 1498 * Process one or more MT_CONTROL mbufs present before any data mbufs 1499 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we 1500 * just copy the data; if !MSG_PEEK, we call into the protocol to 1501 * perform externalization (or freeing if controlp == NULL). 1502 */ 1503 if (m != NULL && m->m_type == MT_CONTROL) { 1504 struct mbuf *cm = NULL, *cmn; 1505 struct mbuf **cme = &cm; 1506 1507 do { 1508 if (flags & MSG_PEEK) { 1509 if (controlp != NULL) { 1510 *controlp = m_copy(m, 0, m->m_len); 1511 controlp = &(*controlp)->m_next; 1512 } 1513 m = m->m_next; 1514 } else { 1515 sbfree(&so->so_rcv, m); 1516 so->so_rcv.sb_mb = m->m_next; 1517 m->m_next = NULL; 1518 *cme = m; 1519 cme = &(*cme)->m_next; 1520 m = so->so_rcv.sb_mb; 1521 } 1522 } while (m != NULL && m->m_type == MT_CONTROL); 1523 if ((flags & MSG_PEEK) == 0) 1524 sockbuf_pushsync(&so->so_rcv, nextrecord); 1525 while (cm != NULL) { 1526 cmn = cm->m_next; 1527 cm->m_next = NULL; 1528 if (pr->pr_domain->dom_externalize != NULL) { 1529 SOCKBUF_UNLOCK(&so->so_rcv); 1530 error = (*pr->pr_domain->dom_externalize) 1531 (cm, controlp); 1532 SOCKBUF_LOCK(&so->so_rcv); 1533 } else if (controlp != NULL) 1534 *controlp = cm; 1535 else 1536 m_freem(cm); 1537 if (controlp != NULL) { 1538 orig_resid = 0; 1539 while (*controlp != NULL) 1540 controlp = &(*controlp)->m_next; 1541 } 1542 cm = cmn; 1543 } 1544 if (so->so_rcv.sb_mb) 1545 nextrecord = so->so_rcv.sb_mb->m_nextpkt; 1546 else 1547 nextrecord = NULL; 1548 orig_resid = 0; 1549 } 1550 if (m != NULL) { 1551 if ((flags & MSG_PEEK) == 0) { 1552 KASSERT(m->m_nextpkt == nextrecord, 1553 ("soreceive: post-control, nextrecord !sync")); 1554 if (nextrecord == NULL) { 1555 KASSERT(so->so_rcv.sb_mb == m, 1556 ("soreceive: post-control, sb_mb!=m")); 1557 KASSERT(so->so_rcv.sb_lastrecord == m, 1558 ("soreceive: post-control, lastrecord!=m")); 1559 } 1560 } 1561 type = m->m_type; 1562 if (type == MT_OOBDATA) 1563 flags |= MSG_OOB; 1564 } else { 1565 if ((flags & MSG_PEEK) == 0) { 1566 KASSERT(so->so_rcv.sb_mb == nextrecord, 1567 ("soreceive: sb_mb != nextrecord")); 1568 if (so->so_rcv.sb_mb == NULL) { 1569 KASSERT(so->so_rcv.sb_lastrecord == NULL, 1570 ("soreceive: sb_lastercord != NULL")); 1571 } 1572 } 1573 } 1574 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1575 SBLASTRECORDCHK(&so->so_rcv); 1576 SBLASTMBUFCHK(&so->so_rcv); 1577 1578 /* 1579 * Now continue to read any data mbufs off of the head of the socket 1580 * buffer until the read request is satisfied. Note that 'type' is 1581 * used to store the type of any mbuf reads that have happened so far 1582 * such that soreceive() can stop reading if the type changes, which 1583 * causes soreceive() to return only one of regular data and inline 1584 * out-of-band data in a single socket receive operation. 1585 */ 1586 moff = 0; 1587 offset = 0; 1588 while (m != NULL && uio->uio_resid > 0 && error == 0) { 1589 /* 1590 * If the type of mbuf has changed since the last mbuf 1591 * examined ('type'), end the receive operation. 1592 */ 1593 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1594 if (m->m_type == MT_OOBDATA) { 1595 if (type != MT_OOBDATA) 1596 break; 1597 } else if (type == MT_OOBDATA) 1598 break; 1599 else 1600 KASSERT(m->m_type == MT_DATA, 1601 ("m->m_type == %d", m->m_type)); 1602 so->so_rcv.sb_state &= ~SBS_RCVATMARK; 1603 len = uio->uio_resid; 1604 if (so->so_oobmark && len > so->so_oobmark - offset) 1605 len = so->so_oobmark - offset; 1606 if (len > m->m_len - moff) 1607 len = m->m_len - moff; 1608 /* 1609 * If mp is set, just pass back the mbufs. 1610 * Otherwise copy them out via the uio, then free. 1611 * Sockbuf must be consistent here (points to current mbuf, 1612 * it points to next record) when we drop priority; 1613 * we must note any additions to the sockbuf when we 1614 * block interrupts again. 1615 */ 1616 if (mp == NULL) { 1617 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1618 SBLASTRECORDCHK(&so->so_rcv); 1619 SBLASTMBUFCHK(&so->so_rcv); 1620 SOCKBUF_UNLOCK(&so->so_rcv); 1621 #ifdef ZERO_COPY_SOCKETS 1622 if (so_zero_copy_receive) { 1623 int disposable; 1624 1625 if ((m->m_flags & M_EXT) 1626 && (m->m_ext.ext_type == EXT_DISPOSABLE)) 1627 disposable = 1; 1628 else 1629 disposable = 0; 1630 1631 error = uiomoveco(mtod(m, char *) + moff, 1632 (int)len, uio, 1633 disposable); 1634 } else 1635 #endif /* ZERO_COPY_SOCKETS */ 1636 error = uiomove(mtod(m, char *) + moff, (int)len, uio); 1637 SOCKBUF_LOCK(&so->so_rcv); 1638 if (error) 1639 goto release; 1640 } else 1641 uio->uio_resid -= len; 1642 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1643 if (len == m->m_len - moff) { 1644 if (m->m_flags & M_EOR) 1645 flags |= MSG_EOR; 1646 if (flags & MSG_PEEK) { 1647 m = m->m_next; 1648 moff = 0; 1649 } else { 1650 nextrecord = m->m_nextpkt; 1651 sbfree(&so->so_rcv, m); 1652 if (mp != NULL) { 1653 *mp = m; 1654 mp = &m->m_next; 1655 so->so_rcv.sb_mb = m = m->m_next; 1656 *mp = NULL; 1657 } else { 1658 so->so_rcv.sb_mb = m_free(m); 1659 m = so->so_rcv.sb_mb; 1660 } 1661 sockbuf_pushsync(&so->so_rcv, nextrecord); 1662 SBLASTRECORDCHK(&so->so_rcv); 1663 SBLASTMBUFCHK(&so->so_rcv); 1664 } 1665 } else { 1666 if (flags & MSG_PEEK) 1667 moff += len; 1668 else { 1669 if (mp != NULL) { 1670 int copy_flag; 1671 1672 if (flags & MSG_DONTWAIT) 1673 copy_flag = M_DONTWAIT; 1674 else 1675 copy_flag = M_TRYWAIT; 1676 if (copy_flag == M_TRYWAIT) 1677 SOCKBUF_UNLOCK(&so->so_rcv); 1678 *mp = m_copym(m, 0, len, copy_flag); 1679 if (copy_flag == M_TRYWAIT) 1680 SOCKBUF_LOCK(&so->so_rcv); 1681 if (*mp == NULL) { 1682 /* 1683 * m_copym() couldn't allocate an mbuf. 1684 * Adjust uio_resid back (it was adjusted 1685 * down by len bytes, which we didn't end 1686 * up "copying" over). 1687 */ 1688 uio->uio_resid += len; 1689 break; 1690 } 1691 } 1692 m->m_data += len; 1693 m->m_len -= len; 1694 so->so_rcv.sb_cc -= len; 1695 } 1696 } 1697 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1698 if (so->so_oobmark) { 1699 if ((flags & MSG_PEEK) == 0) { 1700 so->so_oobmark -= len; 1701 if (so->so_oobmark == 0) { 1702 so->so_rcv.sb_state |= SBS_RCVATMARK; 1703 break; 1704 } 1705 } else { 1706 offset += len; 1707 if (offset == so->so_oobmark) 1708 break; 1709 } 1710 } 1711 if (flags & MSG_EOR) 1712 break; 1713 /* 1714 * If the MSG_WAITALL flag is set (for non-atomic socket), 1715 * we must not quit until "uio->uio_resid == 0" or an error 1716 * termination. If a signal/timeout occurs, return 1717 * with a short count but without error. 1718 * Keep sockbuf locked against other readers. 1719 */ 1720 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && 1721 !sosendallatonce(so) && nextrecord == NULL) { 1722 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1723 if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) 1724 break; 1725 /* 1726 * Notify the protocol that some data has been 1727 * drained before blocking. 1728 */ 1729 if (pr->pr_flags & PR_WANTRCVD) { 1730 SOCKBUF_UNLOCK(&so->so_rcv); 1731 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1732 SOCKBUF_LOCK(&so->so_rcv); 1733 } 1734 SBLASTRECORDCHK(&so->so_rcv); 1735 SBLASTMBUFCHK(&so->so_rcv); 1736 error = sbwait(&so->so_rcv); 1737 if (error) 1738 goto release; 1739 m = so->so_rcv.sb_mb; 1740 if (m != NULL) 1741 nextrecord = m->m_nextpkt; 1742 } 1743 } 1744 1745 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1746 if (m != NULL && pr->pr_flags & PR_ATOMIC) { 1747 flags |= MSG_TRUNC; 1748 if ((flags & MSG_PEEK) == 0) 1749 (void) sbdroprecord_locked(&so->so_rcv); 1750 } 1751 if ((flags & MSG_PEEK) == 0) { 1752 if (m == NULL) { 1753 /* 1754 * First part is an inline SB_EMPTY_FIXUP(). Second 1755 * part makes sure sb_lastrecord is up-to-date if 1756 * there is still data in the socket buffer. 1757 */ 1758 so->so_rcv.sb_mb = nextrecord; 1759 if (so->so_rcv.sb_mb == NULL) { 1760 so->so_rcv.sb_mbtail = NULL; 1761 so->so_rcv.sb_lastrecord = NULL; 1762 } else if (nextrecord->m_nextpkt == NULL) 1763 so->so_rcv.sb_lastrecord = nextrecord; 1764 } 1765 SBLASTRECORDCHK(&so->so_rcv); 1766 SBLASTMBUFCHK(&so->so_rcv); 1767 /* 1768 * If soreceive() is being done from the socket callback, then 1769 * don't need to generate ACK to peer to update window, since 1770 * ACK will be generated on return to TCP. 1771 */ 1772 if (!(flags & MSG_SOCALLBCK) && 1773 (pr->pr_flags & PR_WANTRCVD)) { 1774 SOCKBUF_UNLOCK(&so->so_rcv); 1775 (*pr->pr_usrreqs->pru_rcvd)(so, flags); 1776 SOCKBUF_LOCK(&so->so_rcv); 1777 } 1778 } 1779 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1780 if (orig_resid == uio->uio_resid && orig_resid && 1781 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1782 sbunlock(&so->so_rcv); 1783 goto restart; 1784 } 1785 1786 if (flagsp != NULL) 1787 *flagsp |= flags; 1788 release: 1789 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1790 sbunlock(&so->so_rcv); 1791 out: 1792 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1793 SOCKBUF_UNLOCK(&so->so_rcv); 1794 return (error); 1795 } 1796 1797 int 1798 soshutdown(so, how) 1799 struct socket *so; 1800 int how; 1801 { 1802 struct protosw *pr = so->so_proto; 1803 1804 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) 1805 return (EINVAL); 1806 1807 if (how != SHUT_WR) 1808 sorflush(so); 1809 if (how != SHUT_RD) 1810 return ((*pr->pr_usrreqs->pru_shutdown)(so)); 1811 return (0); 1812 } 1813 1814 void 1815 sorflush(so) 1816 struct socket *so; 1817 { 1818 struct sockbuf *sb = &so->so_rcv; 1819 struct protosw *pr = so->so_proto; 1820 struct sockbuf asb; 1821 1822 /* 1823 * XXXRW: This is quite ugly. Previously, this code made a copy of 1824 * the socket buffer, then zero'd the original to clear the buffer 1825 * fields. However, with mutexes in the socket buffer, this causes 1826 * problems. We only clear the zeroable bits of the original; 1827 * however, we have to initialize and destroy the mutex in the copy 1828 * so that dom_dispose() and sbrelease() can lock t as needed. 1829 */ 1830 SOCKBUF_LOCK(sb); 1831 sb->sb_flags |= SB_NOINTR; 1832 (void) sblock(sb, M_WAITOK); 1833 /* 1834 * socantrcvmore_locked() drops the socket buffer mutex so that it 1835 * can safely perform wakeups. Re-acquire the mutex before 1836 * continuing. 1837 */ 1838 socantrcvmore_locked(so); 1839 SOCKBUF_LOCK(sb); 1840 sbunlock(sb); 1841 /* 1842 * Invalidate/clear most of the sockbuf structure, but leave 1843 * selinfo and mutex data unchanged. 1844 */ 1845 bzero(&asb, offsetof(struct sockbuf, sb_startzero)); 1846 bcopy(&sb->sb_startzero, &asb.sb_startzero, 1847 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1848 bzero(&sb->sb_startzero, 1849 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); 1850 SOCKBUF_UNLOCK(sb); 1851 1852 SOCKBUF_LOCK_INIT(&asb, "so_rcv"); 1853 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) 1854 (*pr->pr_domain->dom_dispose)(asb.sb_mb); 1855 sbrelease(&asb, so); 1856 SOCKBUF_LOCK_DESTROY(&asb); 1857 } 1858 1859 /* 1860 * Perhaps this routine, and sooptcopyout(), below, ought to come in 1861 * an additional variant to handle the case where the option value needs 1862 * to be some kind of integer, but not a specific size. 1863 * In addition to their use here, these functions are also called by the 1864 * protocol-level pr_ctloutput() routines. 1865 */ 1866 int 1867 sooptcopyin(sopt, buf, len, minlen) 1868 struct sockopt *sopt; 1869 void *buf; 1870 size_t len; 1871 size_t minlen; 1872 { 1873 size_t valsize; 1874 1875 /* 1876 * If the user gives us more than we wanted, we ignore it, 1877 * but if we don't get the minimum length the caller 1878 * wants, we return EINVAL. On success, sopt->sopt_valsize 1879 * is set to however much we actually retrieved. 1880 */ 1881 if ((valsize = sopt->sopt_valsize) < minlen) 1882 return EINVAL; 1883 if (valsize > len) 1884 sopt->sopt_valsize = valsize = len; 1885 1886 if (sopt->sopt_td != NULL) 1887 return (copyin(sopt->sopt_val, buf, valsize)); 1888 1889 bcopy(sopt->sopt_val, buf, valsize); 1890 return (0); 1891 } 1892 1893 /* 1894 * Kernel version of setsockopt(2)/ 1895 * XXX: optlen is size_t, not socklen_t 1896 */ 1897 int 1898 so_setsockopt(struct socket *so, int level, int optname, void *optval, 1899 size_t optlen) 1900 { 1901 struct sockopt sopt; 1902 1903 sopt.sopt_level = level; 1904 sopt.sopt_name = optname; 1905 sopt.sopt_dir = SOPT_SET; 1906 sopt.sopt_val = optval; 1907 sopt.sopt_valsize = optlen; 1908 sopt.sopt_td = NULL; 1909 return (sosetopt(so, &sopt)); 1910 } 1911 1912 int 1913 sosetopt(so, sopt) 1914 struct socket *so; 1915 struct sockopt *sopt; 1916 { 1917 int error, optval; 1918 struct linger l; 1919 struct timeval tv; 1920 u_long val; 1921 #ifdef MAC 1922 struct mac extmac; 1923 #endif 1924 1925 error = 0; 1926 if (sopt->sopt_level != SOL_SOCKET) { 1927 if (so->so_proto && so->so_proto->pr_ctloutput) 1928 return ((*so->so_proto->pr_ctloutput) 1929 (so, sopt)); 1930 error = ENOPROTOOPT; 1931 } else { 1932 switch (sopt->sopt_name) { 1933 #ifdef INET 1934 case SO_ACCEPTFILTER: 1935 error = do_setopt_accept_filter(so, sopt); 1936 if (error) 1937 goto bad; 1938 break; 1939 #endif 1940 case SO_LINGER: 1941 error = sooptcopyin(sopt, &l, sizeof l, sizeof l); 1942 if (error) 1943 goto bad; 1944 1945 SOCK_LOCK(so); 1946 so->so_linger = l.l_linger; 1947 if (l.l_onoff) 1948 so->so_options |= SO_LINGER; 1949 else 1950 so->so_options &= ~SO_LINGER; 1951 SOCK_UNLOCK(so); 1952 break; 1953 1954 case SO_DEBUG: 1955 case SO_KEEPALIVE: 1956 case SO_DONTROUTE: 1957 case SO_USELOOPBACK: 1958 case SO_BROADCAST: 1959 case SO_REUSEADDR: 1960 case SO_REUSEPORT: 1961 case SO_OOBINLINE: 1962 case SO_TIMESTAMP: 1963 case SO_BINTIME: 1964 case SO_NOSIGPIPE: 1965 error = sooptcopyin(sopt, &optval, sizeof optval, 1966 sizeof optval); 1967 if (error) 1968 goto bad; 1969 SOCK_LOCK(so); 1970 if (optval) 1971 so->so_options |= sopt->sopt_name; 1972 else 1973 so->so_options &= ~sopt->sopt_name; 1974 SOCK_UNLOCK(so); 1975 break; 1976 1977 case SO_SNDBUF: 1978 case SO_RCVBUF: 1979 case SO_SNDLOWAT: 1980 case SO_RCVLOWAT: 1981 error = sooptcopyin(sopt, &optval, sizeof optval, 1982 sizeof optval); 1983 if (error) 1984 goto bad; 1985 1986 /* 1987 * Values < 1 make no sense for any of these 1988 * options, so disallow them. 1989 */ 1990 if (optval < 1) { 1991 error = EINVAL; 1992 goto bad; 1993 } 1994 1995 switch (sopt->sopt_name) { 1996 case SO_SNDBUF: 1997 case SO_RCVBUF: 1998 if (sbreserve(sopt->sopt_name == SO_SNDBUF ? 1999 &so->so_snd : &so->so_rcv, (u_long)optval, 2000 so, curthread) == 0) { 2001 error = ENOBUFS; 2002 goto bad; 2003 } 2004 break; 2005 2006 /* 2007 * Make sure the low-water is never greater than 2008 * the high-water. 2009 */ 2010 case SO_SNDLOWAT: 2011 SOCKBUF_LOCK(&so->so_snd); 2012 so->so_snd.sb_lowat = 2013 (optval > so->so_snd.sb_hiwat) ? 2014 so->so_snd.sb_hiwat : optval; 2015 SOCKBUF_UNLOCK(&so->so_snd); 2016 break; 2017 case SO_RCVLOWAT: 2018 SOCKBUF_LOCK(&so->so_rcv); 2019 so->so_rcv.sb_lowat = 2020 (optval > so->so_rcv.sb_hiwat) ? 2021 so->so_rcv.sb_hiwat : optval; 2022 SOCKBUF_UNLOCK(&so->so_rcv); 2023 break; 2024 } 2025 break; 2026 2027 case SO_SNDTIMEO: 2028 case SO_RCVTIMEO: 2029 #ifdef COMPAT_IA32 2030 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) { 2031 struct timeval32 tv32; 2032 2033 error = sooptcopyin(sopt, &tv32, sizeof tv32, 2034 sizeof tv32); 2035 CP(tv32, tv, tv_sec); 2036 CP(tv32, tv, tv_usec); 2037 } else 2038 #endif 2039 error = sooptcopyin(sopt, &tv, sizeof tv, 2040 sizeof tv); 2041 if (error) 2042 goto bad; 2043 2044 /* assert(hz > 0); */ 2045 if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz || 2046 tv.tv_usec < 0 || tv.tv_usec >= 1000000) { 2047 error = EDOM; 2048 goto bad; 2049 } 2050 /* assert(tick > 0); */ 2051 /* assert(ULONG_MAX - INT_MAX >= 1000000); */ 2052 val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick; 2053 if (val > INT_MAX) { 2054 error = EDOM; 2055 goto bad; 2056 } 2057 if (val == 0 && tv.tv_usec != 0) 2058 val = 1; 2059 2060 switch (sopt->sopt_name) { 2061 case SO_SNDTIMEO: 2062 so->so_snd.sb_timeo = val; 2063 break; 2064 case SO_RCVTIMEO: 2065 so->so_rcv.sb_timeo = val; 2066 break; 2067 } 2068 break; 2069 2070 case SO_LABEL: 2071 #ifdef MAC 2072 error = sooptcopyin(sopt, &extmac, sizeof extmac, 2073 sizeof extmac); 2074 if (error) 2075 goto bad; 2076 error = mac_setsockopt_label(sopt->sopt_td->td_ucred, 2077 so, &extmac); 2078 #else 2079 error = EOPNOTSUPP; 2080 #endif 2081 break; 2082 2083 default: 2084 error = ENOPROTOOPT; 2085 break; 2086 } 2087 if (error == 0 && so->so_proto != NULL && 2088 so->so_proto->pr_ctloutput != NULL) { 2089 (void) ((*so->so_proto->pr_ctloutput) 2090 (so, sopt)); 2091 } 2092 } 2093 bad: 2094 return (error); 2095 } 2096 2097 /* Helper routine for getsockopt */ 2098 int 2099 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) 2100 { 2101 int error; 2102 size_t valsize; 2103 2104 error = 0; 2105 2106 /* 2107 * Documented get behavior is that we always return a value, 2108 * possibly truncated to fit in the user's buffer. 2109 * Traditional behavior is that we always tell the user 2110 * precisely how much we copied, rather than something useful 2111 * like the total amount we had available for her. 2112 * Note that this interface is not idempotent; the entire answer must 2113 * generated ahead of time. 2114 */ 2115 valsize = min(len, sopt->sopt_valsize); 2116 sopt->sopt_valsize = valsize; 2117 if (sopt->sopt_val != NULL) { 2118 if (sopt->sopt_td != NULL) 2119 error = copyout(buf, sopt->sopt_val, valsize); 2120 else 2121 bcopy(buf, sopt->sopt_val, valsize); 2122 } 2123 return (error); 2124 } 2125 2126 int 2127 sogetopt(so, sopt) 2128 struct socket *so; 2129 struct sockopt *sopt; 2130 { 2131 int error, optval; 2132 struct linger l; 2133 struct timeval tv; 2134 #ifdef MAC 2135 struct mac extmac; 2136 #endif 2137 2138 error = 0; 2139 if (sopt->sopt_level != SOL_SOCKET) { 2140 if (so->so_proto && so->so_proto->pr_ctloutput) { 2141 return ((*so->so_proto->pr_ctloutput) 2142 (so, sopt)); 2143 } else 2144 return (ENOPROTOOPT); 2145 } else { 2146 switch (sopt->sopt_name) { 2147 #ifdef INET 2148 case SO_ACCEPTFILTER: 2149 error = do_getopt_accept_filter(so, sopt); 2150 break; 2151 #endif 2152 case SO_LINGER: 2153 SOCK_LOCK(so); 2154 l.l_onoff = so->so_options & SO_LINGER; 2155 l.l_linger = so->so_linger; 2156 SOCK_UNLOCK(so); 2157 error = sooptcopyout(sopt, &l, sizeof l); 2158 break; 2159 2160 case SO_USELOOPBACK: 2161 case SO_DONTROUTE: 2162 case SO_DEBUG: 2163 case SO_KEEPALIVE: 2164 case SO_REUSEADDR: 2165 case SO_REUSEPORT: 2166 case SO_BROADCAST: 2167 case SO_OOBINLINE: 2168 case SO_ACCEPTCONN: 2169 case SO_TIMESTAMP: 2170 case SO_BINTIME: 2171 case SO_NOSIGPIPE: 2172 optval = so->so_options & sopt->sopt_name; 2173 integer: 2174 error = sooptcopyout(sopt, &optval, sizeof optval); 2175 break; 2176 2177 case SO_TYPE: 2178 optval = so->so_type; 2179 goto integer; 2180 2181 case SO_ERROR: 2182 SOCK_LOCK(so); 2183 optval = so->so_error; 2184 so->so_error = 0; 2185 SOCK_UNLOCK(so); 2186 goto integer; 2187 2188 case SO_SNDBUF: 2189 optval = so->so_snd.sb_hiwat; 2190 goto integer; 2191 2192 case SO_RCVBUF: 2193 optval = so->so_rcv.sb_hiwat; 2194 goto integer; 2195 2196 case SO_SNDLOWAT: 2197 optval = so->so_snd.sb_lowat; 2198 goto integer; 2199 2200 case SO_RCVLOWAT: 2201 optval = so->so_rcv.sb_lowat; 2202 goto integer; 2203 2204 case SO_SNDTIMEO: 2205 case SO_RCVTIMEO: 2206 optval = (sopt->sopt_name == SO_SNDTIMEO ? 2207 so->so_snd.sb_timeo : so->so_rcv.sb_timeo); 2208 2209 tv.tv_sec = optval / hz; 2210 tv.tv_usec = (optval % hz) * tick; 2211 #ifdef COMPAT_IA32 2212 if (curthread->td_proc->p_sysent == &ia32_freebsd_sysvec) { 2213 struct timeval32 tv32; 2214 2215 CP(tv, tv32, tv_sec); 2216 CP(tv, tv32, tv_usec); 2217 error = sooptcopyout(sopt, &tv32, sizeof tv32); 2218 } else 2219 #endif 2220 error = sooptcopyout(sopt, &tv, sizeof tv); 2221 break; 2222 2223 case SO_LABEL: 2224 #ifdef MAC 2225 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2226 sizeof(extmac)); 2227 if (error) 2228 return (error); 2229 error = mac_getsockopt_label(sopt->sopt_td->td_ucred, 2230 so, &extmac); 2231 if (error) 2232 return (error); 2233 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2234 #else 2235 error = EOPNOTSUPP; 2236 #endif 2237 break; 2238 2239 case SO_PEERLABEL: 2240 #ifdef MAC 2241 error = sooptcopyin(sopt, &extmac, sizeof(extmac), 2242 sizeof(extmac)); 2243 if (error) 2244 return (error); 2245 error = mac_getsockopt_peerlabel( 2246 sopt->sopt_td->td_ucred, so, &extmac); 2247 if (error) 2248 return (error); 2249 error = sooptcopyout(sopt, &extmac, sizeof extmac); 2250 #else 2251 error = EOPNOTSUPP; 2252 #endif 2253 break; 2254 2255 case SO_LISTENQLIMIT: 2256 optval = so->so_qlimit; 2257 goto integer; 2258 2259 case SO_LISTENQLEN: 2260 optval = so->so_qlen; 2261 goto integer; 2262 2263 case SO_LISTENINCQLEN: 2264 optval = so->so_incqlen; 2265 goto integer; 2266 2267 default: 2268 error = ENOPROTOOPT; 2269 break; 2270 } 2271 return (error); 2272 } 2273 } 2274 2275 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ 2276 int 2277 soopt_getm(struct sockopt *sopt, struct mbuf **mp) 2278 { 2279 struct mbuf *m, *m_prev; 2280 int sopt_size = sopt->sopt_valsize; 2281 2282 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 2283 if (m == NULL) 2284 return ENOBUFS; 2285 if (sopt_size > MLEN) { 2286 MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT); 2287 if ((m->m_flags & M_EXT) == 0) { 2288 m_free(m); 2289 return ENOBUFS; 2290 } 2291 m->m_len = min(MCLBYTES, sopt_size); 2292 } else { 2293 m->m_len = min(MLEN, sopt_size); 2294 } 2295 sopt_size -= m->m_len; 2296 *mp = m; 2297 m_prev = m; 2298 2299 while (sopt_size) { 2300 MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA); 2301 if (m == NULL) { 2302 m_freem(*mp); 2303 return ENOBUFS; 2304 } 2305 if (sopt_size > MLEN) { 2306 MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT : 2307 M_DONTWAIT); 2308 if ((m->m_flags & M_EXT) == 0) { 2309 m_freem(m); 2310 m_freem(*mp); 2311 return ENOBUFS; 2312 } 2313 m->m_len = min(MCLBYTES, sopt_size); 2314 } else { 2315 m->m_len = min(MLEN, sopt_size); 2316 } 2317 sopt_size -= m->m_len; 2318 m_prev->m_next = m; 2319 m_prev = m; 2320 } 2321 return (0); 2322 } 2323 2324 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ 2325 int 2326 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 2327 { 2328 struct mbuf *m0 = m; 2329 2330 if (sopt->sopt_val == NULL) 2331 return (0); 2332 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2333 if (sopt->sopt_td != NULL) { 2334 int error; 2335 2336 error = copyin(sopt->sopt_val, mtod(m, char *), 2337 m->m_len); 2338 if (error != 0) { 2339 m_freem(m0); 2340 return(error); 2341 } 2342 } else 2343 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 2344 sopt->sopt_valsize -= m->m_len; 2345 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2346 m = m->m_next; 2347 } 2348 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 2349 panic("ip6_sooptmcopyin"); 2350 return (0); 2351 } 2352 2353 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */ 2354 int 2355 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 2356 { 2357 struct mbuf *m0 = m; 2358 size_t valsize = 0; 2359 2360 if (sopt->sopt_val == NULL) 2361 return (0); 2362 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 2363 if (sopt->sopt_td != NULL) { 2364 int error; 2365 2366 error = copyout(mtod(m, char *), sopt->sopt_val, 2367 m->m_len); 2368 if (error != 0) { 2369 m_freem(m0); 2370 return(error); 2371 } 2372 } else 2373 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 2374 sopt->sopt_valsize -= m->m_len; 2375 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; 2376 valsize += m->m_len; 2377 m = m->m_next; 2378 } 2379 if (m != NULL) { 2380 /* enough soopt buffer should be given from user-land */ 2381 m_freem(m0); 2382 return(EINVAL); 2383 } 2384 sopt->sopt_valsize = valsize; 2385 return (0); 2386 } 2387 2388 void 2389 sohasoutofband(so) 2390 struct socket *so; 2391 { 2392 if (so->so_sigio != NULL) 2393 pgsigio(&so->so_sigio, SIGURG, 0); 2394 selwakeuppri(&so->so_rcv.sb_sel, PSOCK); 2395 } 2396 2397 int 2398 sopoll(struct socket *so, int events, struct ucred *active_cred, 2399 struct thread *td) 2400 { 2401 int revents = 0; 2402 2403 SOCKBUF_LOCK(&so->so_snd); 2404 SOCKBUF_LOCK(&so->so_rcv); 2405 if (events & (POLLIN | POLLRDNORM)) 2406 if (soreadable(so)) 2407 revents |= events & (POLLIN | POLLRDNORM); 2408 2409 if (events & POLLINIGNEOF) 2410 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || 2411 !TAILQ_EMPTY(&so->so_comp) || so->so_error) 2412 revents |= POLLINIGNEOF; 2413 2414 if (events & (POLLOUT | POLLWRNORM)) 2415 if (sowriteable(so)) 2416 revents |= events & (POLLOUT | POLLWRNORM); 2417 2418 if (events & (POLLPRI | POLLRDBAND)) 2419 if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) 2420 revents |= events & (POLLPRI | POLLRDBAND); 2421 2422 if (revents == 0) { 2423 if (events & 2424 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | 2425 POLLRDBAND)) { 2426 selrecord(td, &so->so_rcv.sb_sel); 2427 so->so_rcv.sb_flags |= SB_SEL; 2428 } 2429 2430 if (events & (POLLOUT | POLLWRNORM)) { 2431 selrecord(td, &so->so_snd.sb_sel); 2432 so->so_snd.sb_flags |= SB_SEL; 2433 } 2434 } 2435 2436 SOCKBUF_UNLOCK(&so->so_rcv); 2437 SOCKBUF_UNLOCK(&so->so_snd); 2438 return (revents); 2439 } 2440 2441 int 2442 soo_kqfilter(struct file *fp, struct knote *kn) 2443 { 2444 struct socket *so = kn->kn_fp->f_data; 2445 struct sockbuf *sb; 2446 2447 switch (kn->kn_filter) { 2448 case EVFILT_READ: 2449 if (so->so_options & SO_ACCEPTCONN) 2450 kn->kn_fop = &solisten_filtops; 2451 else 2452 kn->kn_fop = &soread_filtops; 2453 sb = &so->so_rcv; 2454 break; 2455 case EVFILT_WRITE: 2456 kn->kn_fop = &sowrite_filtops; 2457 sb = &so->so_snd; 2458 break; 2459 default: 2460 return (EINVAL); 2461 } 2462 2463 SOCKBUF_LOCK(sb); 2464 knlist_add(&sb->sb_sel.si_note, kn, 1); 2465 sb->sb_flags |= SB_KNOTE; 2466 SOCKBUF_UNLOCK(sb); 2467 return (0); 2468 } 2469 2470 static void 2471 filt_sordetach(struct knote *kn) 2472 { 2473 struct socket *so = kn->kn_fp->f_data; 2474 2475 SOCKBUF_LOCK(&so->so_rcv); 2476 knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); 2477 if (knlist_empty(&so->so_rcv.sb_sel.si_note)) 2478 so->so_rcv.sb_flags &= ~SB_KNOTE; 2479 SOCKBUF_UNLOCK(&so->so_rcv); 2480 } 2481 2482 /*ARGSUSED*/ 2483 static int 2484 filt_soread(struct knote *kn, long hint) 2485 { 2486 struct socket *so; 2487 2488 so = kn->kn_fp->f_data; 2489 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 2490 2491 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; 2492 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 2493 kn->kn_flags |= EV_EOF; 2494 kn->kn_fflags = so->so_error; 2495 return (1); 2496 } else if (so->so_error) /* temporary udp error */ 2497 return (1); 2498 else if (kn->kn_sfflags & NOTE_LOWAT) 2499 return (kn->kn_data >= kn->kn_sdata); 2500 else 2501 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat); 2502 } 2503 2504 static void 2505 filt_sowdetach(struct knote *kn) 2506 { 2507 struct socket *so = kn->kn_fp->f_data; 2508 2509 SOCKBUF_LOCK(&so->so_snd); 2510 knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); 2511 if (knlist_empty(&so->so_snd.sb_sel.si_note)) 2512 so->so_snd.sb_flags &= ~SB_KNOTE; 2513 SOCKBUF_UNLOCK(&so->so_snd); 2514 } 2515 2516 /*ARGSUSED*/ 2517 static int 2518 filt_sowrite(struct knote *kn, long hint) 2519 { 2520 struct socket *so; 2521 2522 so = kn->kn_fp->f_data; 2523 SOCKBUF_LOCK_ASSERT(&so->so_snd); 2524 kn->kn_data = sbspace(&so->so_snd); 2525 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2526 kn->kn_flags |= EV_EOF; 2527 kn->kn_fflags = so->so_error; 2528 return (1); 2529 } else if (so->so_error) /* temporary udp error */ 2530 return (1); 2531 else if (((so->so_state & SS_ISCONNECTED) == 0) && 2532 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 2533 return (0); 2534 else if (kn->kn_sfflags & NOTE_LOWAT) 2535 return (kn->kn_data >= kn->kn_sdata); 2536 else 2537 return (kn->kn_data >= so->so_snd.sb_lowat); 2538 } 2539 2540 /*ARGSUSED*/ 2541 static int 2542 filt_solisten(struct knote *kn, long hint) 2543 { 2544 struct socket *so = kn->kn_fp->f_data; 2545 2546 kn->kn_data = so->so_qlen; 2547 return (! TAILQ_EMPTY(&so->so_comp)); 2548 } 2549 2550 int 2551 socheckuid(struct socket *so, uid_t uid) 2552 { 2553 2554 if (so == NULL) 2555 return (EPERM); 2556 if (so->so_cred->cr_uid != uid) 2557 return (EPERM); 2558 return (0); 2559 } 2560 2561 static int 2562 somaxconn_sysctl(SYSCTL_HANDLER_ARGS) 2563 { 2564 int error; 2565 int val; 2566 2567 val = somaxconn; 2568 error = sysctl_handle_int(oidp, &val, sizeof(int), req); 2569 if (error || !req->newptr ) 2570 return (error); 2571 2572 if (val < 1 || val > USHRT_MAX) 2573 return (EINVAL); 2574 2575 somaxconn = val; 2576 return (0); 2577 } 2578