1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1991, 1993 3 * The Regents of the University of California. 4 * Copyright (c) 2004-2006 Robert N. M. Watson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 32 */ 33 34 /* 35 * UNIX Domain (Local) Sockets 36 * 37 * This is an implementation of UNIX (local) domain sockets. Each socket has 38 * an associated struct unpcb (UNIX protocol control block). Stream sockets 39 * may be connected to 0 or 1 other socket. Datagram sockets may be 40 * connected to 0, 1, or many other sockets. Sockets may be created and 41 * connected in pairs (socketpair(2)), or bound/connected to using the file 42 * system name space. For most purposes, only the receive socket buffer is 43 * used, as sending on one socket delivers directly to the receive socket 44 * buffer of a second socket. The implementation is substantially 45 * complicated by the fact that "ancillary data", such as file descriptors or 46 * credentials, may be passed across UNIX domain sockets. The potential for 47 * passing UNIX domain sockets over other UNIX domain sockets requires the 48 * implementation of a simple garbage collector to find and tear down cycles 49 * of disconnected sockets. 50 */ 51 52 #include <sys/cdefs.h> 53 __FBSDID("$FreeBSD$"); 54 55 #include "opt_mac.h" 56 57 #include <sys/param.h> 58 #include <sys/domain.h> 59 #include <sys/fcntl.h> 60 #include <sys/malloc.h> /* XXX must be before <sys/file.h> */ 61 #include <sys/eventhandler.h> 62 #include <sys/file.h> 63 #include <sys/filedesc.h> 64 #include <sys/jail.h> 65 #include <sys/kernel.h> 66 #include <sys/lock.h> 67 #include <sys/mbuf.h> 68 #include <sys/mount.h> 69 #include <sys/mutex.h> 70 #include <sys/namei.h> 71 #include <sys/proc.h> 72 #include <sys/protosw.h> 73 #include <sys/resourcevar.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/signalvar.h> 77 #include <sys/stat.h> 78 #include <sys/sx.h> 79 #include <sys/sysctl.h> 80 #include <sys/systm.h> 81 #include <sys/taskqueue.h> 82 #include <sys/un.h> 83 #include <sys/unpcb.h> 84 #include <sys/vnode.h> 85 86 #include <security/mac/mac_framework.h> 87 88 #include <vm/uma.h> 89 90 static uma_zone_t unp_zone; 91 static unp_gen_t unp_gencnt; 92 static u_int unp_count; 93 94 static struct unp_head unp_shead, unp_dhead; 95 96 /* 97 * Unix communications domain. 98 * 99 * TODO: 100 * SEQPACKET, RDM 101 * rethink name space problems 102 * need a proper out-of-band 103 * lock pushdown 104 */ 105 static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; 106 static ino_t unp_ino; /* prototype for fake inode numbers */ 107 struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); 108 109 /* 110 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for 111 * stream sockets, although the total for sender and receiver is actually 112 * only PIPSIZ. 113 * 114 * Datagram sockets really use the sendspace as the maximum datagram size, 115 * and don't really want to reserve the sendspace. Their recvspace should be 116 * large enough for at least one max-size datagram plus address. 117 */ 118 #ifndef PIPSIZ 119 #define PIPSIZ 8192 120 #endif 121 static u_long unpst_sendspace = PIPSIZ; 122 static u_long unpst_recvspace = PIPSIZ; 123 static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ 124 static u_long unpdg_recvspace = 4*1024; 125 126 static int unp_rights; /* file descriptors in flight */ 127 128 SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); 129 SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); 130 SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); 131 132 SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, 133 &unpst_sendspace, 0, ""); 134 SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, 135 &unpst_recvspace, 0, ""); 136 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, 137 &unpdg_sendspace, 0, ""); 138 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, 139 &unpdg_recvspace, 0, ""); 140 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); 141 142 /* 143 * Currently, UNIX domain sockets are protected by a single subsystem lock, 144 * which covers global data structures and variables, the contents of each 145 * per-socket unpcb structure, and the so_pcb field in sockets attached to 146 * the UNIX domain. This provides for a moderate degree of paralellism, as 147 * receive operations on UNIX domain sockets do not need to acquire the 148 * subsystem lock. Finer grained locking to permit send() without acquiring 149 * a global lock would be a logical next step. 150 * 151 * The UNIX domain socket lock preceds all socket layer locks, including the 152 * socket lock and socket buffer lock, permitting UNIX domain socket code to 153 * call into socket support routines without releasing its locks. 154 * 155 * Some caution is required in areas where the UNIX domain socket code enters 156 * VFS in order to create or find rendezvous points. This results in 157 * dropping of the UNIX domain socket subsystem lock, acquisition of the 158 * Giant lock, and potential sleeping. This increases the chances of races, 159 * and exposes weaknesses in the socket->protocol API by offering poor 160 * failure modes. 161 */ 162 static struct mtx unp_mtx; 163 #define UNP_LOCK_INIT() \ 164 mtx_init(&unp_mtx, "unp", NULL, MTX_DEF) 165 #define UNP_LOCK() mtx_lock(&unp_mtx) 166 #define UNP_UNLOCK() mtx_unlock(&unp_mtx) 167 #define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED) 168 #define UNP_UNLOCK_ASSERT() mtx_assert(&unp_mtx, MA_NOTOWNED) 169 170 /* 171 * Garbage collection of cyclic file descriptor/socket references occurs 172 * asynchronously in a taskqueue context in order to avoid recursion and 173 * reentrance in the UNIX domain socket, file descriptor, and socket layer 174 * code. See unp_gc() for a full description. 175 */ 176 static struct task unp_gc_task; 177 178 static int unp_connect(struct socket *,struct sockaddr *, struct thread *); 179 static int unp_connect2(struct socket *so, struct socket *so2, int); 180 static void unp_disconnect(struct unpcb *); 181 static void unp_shutdown(struct unpcb *); 182 static void unp_drop(struct unpcb *, int); 183 static void unp_gc(__unused void *, int); 184 static void unp_scan(struct mbuf *, void (*)(struct file *)); 185 static void unp_mark(struct file *); 186 static void unp_discard(struct file *); 187 static void unp_freerights(struct file **, int); 188 static int unp_internalize(struct mbuf **, struct thread *); 189 static int unp_listen(struct socket *, struct unpcb *, int, 190 struct thread *); 191 192 /* 193 * Definitions of protocols supported in the LOCAL domain. 194 */ 195 static struct domain localdomain; 196 static struct protosw localsw[] = { 197 { 198 .pr_type = SOCK_STREAM, 199 .pr_domain = &localdomain, 200 .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, 201 .pr_ctloutput = &uipc_ctloutput, 202 .pr_usrreqs = &uipc_usrreqs 203 }, 204 { 205 .pr_type = SOCK_DGRAM, 206 .pr_domain = &localdomain, 207 .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS, 208 .pr_usrreqs = &uipc_usrreqs 209 }, 210 }; 211 212 static struct domain localdomain = { 213 .dom_family = AF_LOCAL, 214 .dom_name = "local", 215 .dom_init = unp_init, 216 .dom_externalize = unp_externalize, 217 .dom_dispose = unp_dispose, 218 .dom_protosw = localsw, 219 .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])] 220 }; 221 DOMAIN_SET(local); 222 223 static void 224 uipc_abort(struct socket *so) 225 { 226 struct unpcb *unp; 227 228 unp = sotounpcb(so); 229 KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); 230 UNP_LOCK(); 231 unp_drop(unp, ECONNABORTED); 232 UNP_UNLOCK(); 233 } 234 235 static int 236 uipc_accept(struct socket *so, struct sockaddr **nam) 237 { 238 struct unpcb *unp; 239 const struct sockaddr *sa; 240 241 /* 242 * Pass back name of connected socket, if it was bound and we are 243 * still connected (our peer may have closed already!). 244 */ 245 unp = sotounpcb(so); 246 KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); 247 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 248 UNP_LOCK(); 249 if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) 250 sa = (struct sockaddr *) unp->unp_conn->unp_addr; 251 else 252 sa = &sun_noname; 253 bcopy(sa, *nam, sa->sa_len); 254 UNP_UNLOCK(); 255 return (0); 256 } 257 258 static int 259 uipc_attach(struct socket *so, int proto, struct thread *td) 260 { 261 struct unpcb *unp; 262 int error; 263 264 KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); 265 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 266 switch (so->so_type) { 267 case SOCK_STREAM: 268 error = soreserve(so, unpst_sendspace, unpst_recvspace); 269 break; 270 271 case SOCK_DGRAM: 272 error = soreserve(so, unpdg_sendspace, unpdg_recvspace); 273 break; 274 275 default: 276 panic("unp_attach"); 277 } 278 if (error) 279 return (error); 280 } 281 unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO); 282 if (unp == NULL) 283 return (ENOBUFS); 284 LIST_INIT(&unp->unp_refs); 285 unp->unp_socket = so; 286 so->so_pcb = unp; 287 288 unp->unp_refcount = 1; 289 UNP_LOCK(); 290 unp->unp_gencnt = ++unp_gencnt; 291 unp_count++; 292 LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, 293 unp, unp_link); 294 UNP_UNLOCK(); 295 296 return (0); 297 } 298 299 static int 300 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 301 { 302 struct sockaddr_un *soun = (struct sockaddr_un *)nam; 303 struct vattr vattr; 304 int error, namelen; 305 struct nameidata nd; 306 struct unpcb *unp; 307 struct vnode *vp; 308 struct mount *mp; 309 char *buf; 310 311 unp = sotounpcb(so); 312 KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); 313 314 namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); 315 if (namelen <= 0) 316 return (EINVAL); 317 318 /* 319 * We don't allow simultaneous bind() calls on a single UNIX domain 320 * socket, so flag in-progress operations, and return an error if an 321 * operation is already in progress. 322 * 323 * Historically, we have not allowed a socket to be rebound, so this 324 * also returns an error. Not allowing re-binding certainly 325 * simplifies the implementation and avoids a great many possible 326 * failure modes. 327 */ 328 UNP_LOCK(); 329 if (unp->unp_vnode != NULL) { 330 UNP_UNLOCK(); 331 return (EINVAL); 332 } 333 if (unp->unp_flags & UNP_BINDING) { 334 UNP_UNLOCK(); 335 return (EALREADY); 336 } 337 unp->unp_flags |= UNP_BINDING; 338 UNP_UNLOCK(); 339 340 buf = malloc(namelen + 1, M_TEMP, M_WAITOK); 341 strlcpy(buf, soun->sun_path, namelen + 1); 342 343 mtx_lock(&Giant); 344 restart: 345 mtx_assert(&Giant, MA_OWNED); 346 NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, 347 buf, td); 348 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ 349 error = namei(&nd); 350 if (error) 351 goto error; 352 vp = nd.ni_vp; 353 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { 354 NDFREE(&nd, NDF_ONLY_PNBUF); 355 if (nd.ni_dvp == vp) 356 vrele(nd.ni_dvp); 357 else 358 vput(nd.ni_dvp); 359 if (vp != NULL) { 360 vrele(vp); 361 error = EADDRINUSE; 362 goto error; 363 } 364 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); 365 if (error) 366 goto error; 367 goto restart; 368 } 369 VATTR_NULL(&vattr); 370 vattr.va_type = VSOCK; 371 vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); 372 #ifdef MAC 373 error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, 374 &vattr); 375 #endif 376 if (error == 0) { 377 VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); 378 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); 379 } 380 NDFREE(&nd, NDF_ONLY_PNBUF); 381 vput(nd.ni_dvp); 382 if (error) { 383 vn_finished_write(mp); 384 goto error; 385 } 386 vp = nd.ni_vp; 387 ASSERT_VOP_LOCKED(vp, "uipc_bind"); 388 soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); 389 UNP_LOCK(); 390 vp->v_socket = unp->unp_socket; 391 unp->unp_vnode = vp; 392 unp->unp_addr = soun; 393 unp->unp_flags &= ~UNP_BINDING; 394 UNP_UNLOCK(); 395 VOP_UNLOCK(vp, 0, td); 396 vn_finished_write(mp); 397 mtx_unlock(&Giant); 398 free(buf, M_TEMP); 399 return (0); 400 error: 401 UNP_LOCK(); 402 unp->unp_flags &= ~UNP_BINDING; 403 UNP_UNLOCK(); 404 mtx_unlock(&Giant); 405 free(buf, M_TEMP); 406 return (error); 407 } 408 409 static int 410 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 411 { 412 int error; 413 414 KASSERT(td == curthread, ("uipc_connect: td != curthread")); 415 UNP_LOCK(); 416 error = unp_connect(so, nam, td); 417 UNP_UNLOCK(); 418 return (error); 419 } 420 421 /* 422 * XXXRW: Should also unbind? 423 */ 424 static void 425 uipc_close(struct socket *so) 426 { 427 struct unpcb *unp; 428 429 unp = sotounpcb(so); 430 KASSERT(unp != NULL, ("uipc_close: unp == NULL")); 431 UNP_LOCK(); 432 unp_disconnect(unp); 433 UNP_UNLOCK(); 434 } 435 436 int 437 uipc_connect2(struct socket *so1, struct socket *so2) 438 { 439 struct unpcb *unp; 440 int error; 441 442 unp = sotounpcb(so1); 443 KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); 444 UNP_LOCK(); 445 error = unp_connect2(so1, so2, PRU_CONNECT2); 446 UNP_UNLOCK(); 447 return (error); 448 } 449 450 /* control is EOPNOTSUPP */ 451 452 static void 453 uipc_detach(struct socket *so) 454 { 455 struct sockaddr_un *saved_unp_addr; 456 struct unpcb *unp; 457 struct vnode *vp; 458 int freeunp, local_unp_rights; 459 460 unp = sotounpcb(so); 461 KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); 462 UNP_LOCK(); 463 LIST_REMOVE(unp, unp_link); 464 unp->unp_gencnt = ++unp_gencnt; 465 --unp_count; 466 if ((vp = unp->unp_vnode) != NULL) { 467 /* 468 * XXXRW: should v_socket be frobbed only while holding 469 * Giant? 470 */ 471 unp->unp_vnode->v_socket = NULL; 472 unp->unp_vnode = NULL; 473 } 474 if (unp->unp_conn != NULL) 475 unp_disconnect(unp); 476 while (!LIST_EMPTY(&unp->unp_refs)) { 477 struct unpcb *ref = LIST_FIRST(&unp->unp_refs); 478 unp_drop(ref, ECONNRESET); 479 } 480 unp->unp_socket->so_pcb = NULL; 481 local_unp_rights = unp_rights; 482 saved_unp_addr = unp->unp_addr; 483 unp->unp_addr = NULL; 484 unp->unp_refcount--; 485 freeunp = (unp->unp_refcount == 0); 486 UNP_UNLOCK(); 487 if (saved_unp_addr != NULL) 488 FREE(saved_unp_addr, M_SONAME); 489 if (freeunp) 490 uma_zfree(unp_zone, unp); 491 if (vp) { 492 int vfslocked; 493 494 vfslocked = VFS_LOCK_GIANT(vp->v_mount); 495 vrele(vp); 496 VFS_UNLOCK_GIANT(vfslocked); 497 } 498 if (local_unp_rights) 499 taskqueue_enqueue(taskqueue_thread, &unp_gc_task); 500 } 501 502 static int 503 uipc_disconnect(struct socket *so) 504 { 505 struct unpcb *unp; 506 507 unp = sotounpcb(so); 508 KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); 509 UNP_LOCK(); 510 unp_disconnect(unp); 511 UNP_UNLOCK(); 512 return (0); 513 } 514 515 static int 516 uipc_listen(struct socket *so, int backlog, struct thread *td) 517 { 518 struct unpcb *unp; 519 int error; 520 521 unp = sotounpcb(so); 522 KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); 523 UNP_LOCK(); 524 if (unp->unp_vnode == NULL) { 525 UNP_UNLOCK(); 526 return (EINVAL); 527 } 528 error = unp_listen(so, unp, backlog, td); 529 UNP_UNLOCK(); 530 return (error); 531 } 532 533 static int 534 uipc_peeraddr(struct socket *so, struct sockaddr **nam) 535 { 536 struct unpcb *unp; 537 const struct sockaddr *sa; 538 539 unp = sotounpcb(so); 540 KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); 541 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 542 UNP_LOCK(); 543 if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL) 544 sa = (struct sockaddr *) unp->unp_conn->unp_addr; 545 else { 546 /* 547 * XXX: It seems that this test always fails even when 548 * connection is established. So, this else clause is 549 * added as workaround to return PF_LOCAL sockaddr. 550 */ 551 sa = &sun_noname; 552 } 553 bcopy(sa, *nam, sa->sa_len); 554 UNP_UNLOCK(); 555 return (0); 556 } 557 558 static int 559 uipc_rcvd(struct socket *so, int flags) 560 { 561 struct unpcb *unp; 562 struct socket *so2; 563 u_int mbcnt, sbcc; 564 u_long newhiwat; 565 566 unp = sotounpcb(so); 567 KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL")); 568 switch (so->so_type) { 569 case SOCK_DGRAM: 570 panic("uipc_rcvd DGRAM?"); 571 /*NOTREACHED*/ 572 573 case SOCK_STREAM: 574 /* 575 * Adjust backpressure on sender and wakeup any waiting to 576 * write. 577 */ 578 SOCKBUF_LOCK(&so->so_rcv); 579 mbcnt = so->so_rcv.sb_mbcnt; 580 sbcc = so->so_rcv.sb_cc; 581 SOCKBUF_UNLOCK(&so->so_rcv); 582 UNP_LOCK(); 583 if (unp->unp_conn == NULL) { 584 UNP_UNLOCK(); 585 break; 586 } 587 so2 = unp->unp_conn->unp_socket; 588 SOCKBUF_LOCK(&so2->so_snd); 589 so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt; 590 newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc; 591 (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, 592 newhiwat, RLIM_INFINITY); 593 sowwakeup_locked(so2); 594 unp->unp_mbcnt = mbcnt; 595 unp->unp_cc = sbcc; 596 UNP_UNLOCK(); 597 break; 598 599 default: 600 panic("uipc_rcvd unknown socktype"); 601 } 602 return (0); 603 } 604 605 /* pru_rcvoob is EOPNOTSUPP */ 606 607 static int 608 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 609 struct mbuf *control, struct thread *td) 610 { 611 struct unpcb *unp, *unp2; 612 struct socket *so2; 613 u_int mbcnt, sbcc; 614 u_long newhiwat; 615 int error = 0; 616 617 unp = sotounpcb(so); 618 KASSERT(unp != NULL, ("uipc_send: unp == NULL")); 619 if (flags & PRUS_OOB) { 620 error = EOPNOTSUPP; 621 goto release; 622 } 623 624 if (control != NULL && (error = unp_internalize(&control, td))) 625 goto release; 626 627 UNP_LOCK(); 628 switch (so->so_type) { 629 case SOCK_DGRAM: 630 { 631 const struct sockaddr *from; 632 633 if (nam != NULL) { 634 if (unp->unp_conn != NULL) { 635 error = EISCONN; 636 break; 637 } 638 error = unp_connect(so, nam, td); 639 if (error) 640 break; 641 } 642 /* 643 * Because connect() and send() are non-atomic in a sendto() 644 * with a target address, it's possible that the socket will 645 * have disconnected before the send() can run. In that case 646 * return the slightly counter-intuitive but otherwise 647 * correct error that the socket is not connected. 648 */ 649 unp2 = unp->unp_conn; 650 if (unp2 == NULL) { 651 error = ENOTCONN; 652 break; 653 } 654 so2 = unp2->unp_socket; 655 if (unp->unp_addr != NULL) 656 from = (struct sockaddr *)unp->unp_addr; 657 else 658 from = &sun_noname; 659 if (unp2->unp_flags & UNP_WANTCRED) 660 control = unp_addsockcred(td, control); 661 SOCKBUF_LOCK(&so2->so_rcv); 662 if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { 663 sorwakeup_locked(so2); 664 m = NULL; 665 control = NULL; 666 } else { 667 SOCKBUF_UNLOCK(&so2->so_rcv); 668 error = ENOBUFS; 669 } 670 if (nam != NULL) 671 unp_disconnect(unp); 672 break; 673 } 674 675 case SOCK_STREAM: 676 /* 677 * Connect if not connected yet. 678 * 679 * Note: A better implementation would complain if not equal 680 * to the peer's address. 681 */ 682 if ((so->so_state & SS_ISCONNECTED) == 0) { 683 if (nam != NULL) { 684 error = unp_connect(so, nam, td); 685 if (error) 686 break; /* XXX */ 687 } else { 688 error = ENOTCONN; 689 break; 690 } 691 } 692 693 /* Lockless read. */ 694 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 695 error = EPIPE; 696 break; 697 } 698 /* 699 * Because connect() and send() are non-atomic in a sendto() 700 * with a target address, it's possible that the socket will 701 * have disconnected before the send() can run. In that case 702 * return the slightly counter-intuitive but otherwise 703 * correct error that the socket is not connected. 704 */ 705 unp2 = unp->unp_conn; 706 if (unp2 == NULL) { 707 error = ENOTCONN; 708 break; 709 } 710 so2 = unp2->unp_socket; 711 SOCKBUF_LOCK(&so2->so_rcv); 712 if (unp2->unp_flags & UNP_WANTCRED) { 713 /* 714 * Credentials are passed only once on 715 * SOCK_STREAM. 716 */ 717 unp2->unp_flags &= ~UNP_WANTCRED; 718 control = unp_addsockcred(td, control); 719 } 720 /* 721 * Send to paired receive port, and then reduce send buffer 722 * hiwater marks to maintain backpressure. Wake up readers. 723 */ 724 if (control != NULL) { 725 if (sbappendcontrol_locked(&so2->so_rcv, m, control)) 726 control = NULL; 727 } else { 728 sbappend_locked(&so2->so_rcv, m); 729 } 730 mbcnt = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt; 731 unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt; 732 sbcc = so2->so_rcv.sb_cc; 733 sorwakeup_locked(so2); 734 735 SOCKBUF_LOCK(&so->so_snd); 736 newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc); 737 (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 738 newhiwat, RLIM_INFINITY); 739 so->so_snd.sb_mbmax -= mbcnt; 740 SOCKBUF_UNLOCK(&so->so_snd); 741 742 unp2->unp_cc = sbcc; 743 m = NULL; 744 break; 745 746 default: 747 panic("uipc_send unknown socktype"); 748 } 749 750 /* 751 * SEND_EOF is equivalent to a SEND followed by 752 * a SHUTDOWN. 753 */ 754 if (flags & PRUS_EOF) { 755 socantsendmore(so); 756 unp_shutdown(unp); 757 } 758 UNP_UNLOCK(); 759 760 if (control != NULL && error != 0) 761 unp_dispose(control); 762 763 release: 764 if (control != NULL) 765 m_freem(control); 766 if (m != NULL) 767 m_freem(m); 768 return (error); 769 } 770 771 static int 772 uipc_sense(struct socket *so, struct stat *sb) 773 { 774 struct unpcb *unp; 775 struct socket *so2; 776 777 unp = sotounpcb(so); 778 KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); 779 UNP_LOCK(); 780 sb->st_blksize = so->so_snd.sb_hiwat; 781 if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { 782 so2 = unp->unp_conn->unp_socket; 783 sb->st_blksize += so2->so_rcv.sb_cc; 784 } 785 sb->st_dev = NODEV; 786 if (unp->unp_ino == 0) 787 unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; 788 sb->st_ino = unp->unp_ino; 789 UNP_UNLOCK(); 790 return (0); 791 } 792 793 static int 794 uipc_shutdown(struct socket *so) 795 { 796 struct unpcb *unp; 797 798 unp = sotounpcb(so); 799 KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); 800 UNP_LOCK(); 801 socantsendmore(so); 802 unp_shutdown(unp); 803 UNP_UNLOCK(); 804 return (0); 805 } 806 807 static int 808 uipc_sockaddr(struct socket *so, struct sockaddr **nam) 809 { 810 struct unpcb *unp; 811 const struct sockaddr *sa; 812 813 unp = sotounpcb(so); 814 KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); 815 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 816 UNP_LOCK(); 817 if (unp->unp_addr != NULL) 818 sa = (struct sockaddr *) unp->unp_addr; 819 else 820 sa = &sun_noname; 821 bcopy(sa, *nam, sa->sa_len); 822 UNP_UNLOCK(); 823 return (0); 824 } 825 826 struct pr_usrreqs uipc_usrreqs = { 827 .pru_abort = uipc_abort, 828 .pru_accept = uipc_accept, 829 .pru_attach = uipc_attach, 830 .pru_bind = uipc_bind, 831 .pru_connect = uipc_connect, 832 .pru_connect2 = uipc_connect2, 833 .pru_detach = uipc_detach, 834 .pru_disconnect = uipc_disconnect, 835 .pru_listen = uipc_listen, 836 .pru_peeraddr = uipc_peeraddr, 837 .pru_rcvd = uipc_rcvd, 838 .pru_send = uipc_send, 839 .pru_sense = uipc_sense, 840 .pru_shutdown = uipc_shutdown, 841 .pru_sockaddr = uipc_sockaddr, 842 .pru_close = uipc_close, 843 }; 844 845 int 846 uipc_ctloutput(struct socket *so, struct sockopt *sopt) 847 { 848 struct unpcb *unp; 849 struct xucred xu; 850 int error, optval; 851 852 if (sopt->sopt_level != 0) 853 return (EINVAL); 854 855 unp = sotounpcb(so); 856 KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); 857 UNP_LOCK(); 858 error = 0; 859 switch (sopt->sopt_dir) { 860 case SOPT_GET: 861 switch (sopt->sopt_name) { 862 case LOCAL_PEERCRED: 863 if (unp->unp_flags & UNP_HAVEPC) 864 xu = unp->unp_peercred; 865 else { 866 if (so->so_type == SOCK_STREAM) 867 error = ENOTCONN; 868 else 869 error = EINVAL; 870 } 871 if (error == 0) 872 error = sooptcopyout(sopt, &xu, sizeof(xu)); 873 break; 874 case LOCAL_CREDS: 875 optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0; 876 error = sooptcopyout(sopt, &optval, sizeof(optval)); 877 break; 878 case LOCAL_CONNWAIT: 879 optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; 880 error = sooptcopyout(sopt, &optval, sizeof(optval)); 881 break; 882 default: 883 error = EOPNOTSUPP; 884 break; 885 } 886 break; 887 case SOPT_SET: 888 switch (sopt->sopt_name) { 889 case LOCAL_CREDS: 890 case LOCAL_CONNWAIT: 891 error = sooptcopyin(sopt, &optval, sizeof(optval), 892 sizeof(optval)); 893 if (error) 894 break; 895 896 #define OPTSET(bit) \ 897 if (optval) \ 898 unp->unp_flags |= bit; \ 899 else \ 900 unp->unp_flags &= ~bit; 901 902 switch (sopt->sopt_name) { 903 case LOCAL_CREDS: 904 OPTSET(UNP_WANTCRED); 905 break; 906 case LOCAL_CONNWAIT: 907 OPTSET(UNP_CONNWAIT); 908 break; 909 default: 910 break; 911 } 912 break; 913 #undef OPTSET 914 default: 915 error = ENOPROTOOPT; 916 break; 917 } 918 break; 919 default: 920 error = EOPNOTSUPP; 921 break; 922 } 923 UNP_UNLOCK(); 924 return (error); 925 } 926 927 static int 928 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 929 { 930 struct sockaddr_un *soun = (struct sockaddr_un *)nam; 931 struct vnode *vp; 932 struct socket *so2, *so3; 933 struct unpcb *unp, *unp2, *unp3; 934 int error, len; 935 struct nameidata nd; 936 char buf[SOCK_MAXADDRLEN]; 937 struct sockaddr *sa; 938 939 UNP_LOCK_ASSERT(); 940 941 unp = sotounpcb(so); 942 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 943 len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); 944 if (len <= 0) 945 return (EINVAL); 946 strlcpy(buf, soun->sun_path, len + 1); 947 if (unp->unp_flags & UNP_CONNECTING) { 948 UNP_UNLOCK(); 949 return (EALREADY); 950 } 951 UNP_UNLOCK(); 952 sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 953 mtx_lock(&Giant); 954 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); 955 error = namei(&nd); 956 if (error) 957 vp = NULL; 958 else 959 vp = nd.ni_vp; 960 ASSERT_VOP_LOCKED(vp, "unp_connect"); 961 NDFREE(&nd, NDF_ONLY_PNBUF); 962 if (error) 963 goto bad; 964 965 if (vp->v_type != VSOCK) { 966 error = ENOTSOCK; 967 goto bad; 968 } 969 error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); 970 if (error) 971 goto bad; 972 mtx_unlock(&Giant); 973 UNP_LOCK(); 974 unp = sotounpcb(so); 975 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 976 so2 = vp->v_socket; 977 if (so2 == NULL) { 978 error = ECONNREFUSED; 979 goto bad2; 980 } 981 if (so->so_type != so2->so_type) { 982 error = EPROTOTYPE; 983 goto bad2; 984 } 985 if (so->so_proto->pr_flags & PR_CONNREQUIRED) { 986 if (so2->so_options & SO_ACCEPTCONN) { 987 /* 988 * NB: drop locks here so unp_attach is entered w/o 989 * locks; this avoids a recursive lock of the head 990 * and holding sleep locks across a (potentially) 991 * blocking malloc. 992 */ 993 UNP_UNLOCK(); 994 so3 = sonewconn(so2, 0); 995 UNP_LOCK(); 996 } else 997 so3 = NULL; 998 if (so3 == NULL) { 999 error = ECONNREFUSED; 1000 goto bad2; 1001 } 1002 unp = sotounpcb(so); 1003 unp2 = sotounpcb(so2); 1004 unp3 = sotounpcb(so3); 1005 if (unp2->unp_addr != NULL) { 1006 bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); 1007 unp3->unp_addr = (struct sockaddr_un *) sa; 1008 sa = NULL; 1009 } 1010 /* 1011 * unp_peercred management: 1012 * 1013 * The connecter's (client's) credentials are copied from its 1014 * process structure at the time of connect() (which is now). 1015 */ 1016 cru2x(td->td_ucred, &unp3->unp_peercred); 1017 unp3->unp_flags |= UNP_HAVEPC; 1018 /* 1019 * The receiver's (server's) credentials are copied from the 1020 * unp_peercred member of socket on which the former called 1021 * listen(); unp_listen() cached that process's credentials 1022 * at that time so we can use them now. 1023 */ 1024 KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, 1025 ("unp_connect: listener without cached peercred")); 1026 memcpy(&unp->unp_peercred, &unp2->unp_peercred, 1027 sizeof(unp->unp_peercred)); 1028 unp->unp_flags |= UNP_HAVEPC; 1029 if (unp2->unp_flags & UNP_WANTCRED) 1030 unp3->unp_flags |= UNP_WANTCRED; 1031 #ifdef MAC 1032 SOCK_LOCK(so); 1033 mac_set_socket_peer_from_socket(so, so3); 1034 mac_set_socket_peer_from_socket(so3, so); 1035 SOCK_UNLOCK(so); 1036 #endif 1037 1038 so2 = so3; 1039 } 1040 error = unp_connect2(so, so2, PRU_CONNECT); 1041 bad2: 1042 UNP_UNLOCK(); 1043 mtx_lock(&Giant); 1044 bad: 1045 mtx_assert(&Giant, MA_OWNED); 1046 if (vp != NULL) 1047 vput(vp); 1048 mtx_unlock(&Giant); 1049 free(sa, M_SONAME); 1050 UNP_LOCK(); 1051 unp->unp_flags &= ~UNP_CONNECTING; 1052 return (error); 1053 } 1054 1055 static int 1056 unp_connect2(struct socket *so, struct socket *so2, int req) 1057 { 1058 struct unpcb *unp = sotounpcb(so); 1059 struct unpcb *unp2; 1060 1061 UNP_LOCK_ASSERT(); 1062 1063 if (so2->so_type != so->so_type) 1064 return (EPROTOTYPE); 1065 unp2 = sotounpcb(so2); 1066 KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); 1067 unp->unp_conn = unp2; 1068 switch (so->so_type) { 1069 case SOCK_DGRAM: 1070 LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); 1071 soisconnected(so); 1072 break; 1073 1074 case SOCK_STREAM: 1075 unp2->unp_conn = unp; 1076 if (req == PRU_CONNECT && 1077 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) 1078 soisconnecting(so); 1079 else 1080 soisconnected(so); 1081 soisconnected(so2); 1082 break; 1083 1084 default: 1085 panic("unp_connect2"); 1086 } 1087 return (0); 1088 } 1089 1090 static void 1091 unp_disconnect(struct unpcb *unp) 1092 { 1093 struct unpcb *unp2 = unp->unp_conn; 1094 struct socket *so; 1095 1096 UNP_LOCK_ASSERT(); 1097 1098 if (unp2 == NULL) 1099 return; 1100 unp->unp_conn = NULL; 1101 switch (unp->unp_socket->so_type) { 1102 case SOCK_DGRAM: 1103 LIST_REMOVE(unp, unp_reflink); 1104 so = unp->unp_socket; 1105 SOCK_LOCK(so); 1106 so->so_state &= ~SS_ISCONNECTED; 1107 SOCK_UNLOCK(so); 1108 break; 1109 1110 case SOCK_STREAM: 1111 soisdisconnected(unp->unp_socket); 1112 unp2->unp_conn = NULL; 1113 soisdisconnected(unp2->unp_socket); 1114 break; 1115 } 1116 } 1117 1118 /* 1119 * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed by 1120 * the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers are 1121 * safe to reference. It first scans the list of struct unpcb's to generate 1122 * a pointer list, then it rescans its list one entry at a time to 1123 * externalize and copyout. It checks the generation number to see if a 1124 * struct unpcb has been reused, and will skip it if so. 1125 */ 1126 static int 1127 unp_pcblist(SYSCTL_HANDLER_ARGS) 1128 { 1129 int error, i, n; 1130 int freeunp; 1131 struct unpcb *unp, **unp_list; 1132 unp_gen_t gencnt; 1133 struct xunpgen *xug; 1134 struct unp_head *head; 1135 struct xunpcb *xu; 1136 1137 head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); 1138 1139 /* 1140 * The process of preparing the PCB list is too time-consuming and 1141 * resource-intensive to repeat twice on every request. 1142 */ 1143 if (req->oldptr == NULL) { 1144 n = unp_count; 1145 req->oldidx = 2 * (sizeof *xug) 1146 + (n + n/8) * sizeof(struct xunpcb); 1147 return (0); 1148 } 1149 1150 if (req->newptr != NULL) 1151 return (EPERM); 1152 1153 /* 1154 * OK, now we're committed to doing something. 1155 */ 1156 xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); 1157 UNP_LOCK(); 1158 gencnt = unp_gencnt; 1159 n = unp_count; 1160 UNP_UNLOCK(); 1161 1162 xug->xug_len = sizeof *xug; 1163 xug->xug_count = n; 1164 xug->xug_gen = gencnt; 1165 xug->xug_sogen = so_gencnt; 1166 error = SYSCTL_OUT(req, xug, sizeof *xug); 1167 if (error) { 1168 free(xug, M_TEMP); 1169 return (error); 1170 } 1171 1172 unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); 1173 1174 UNP_LOCK(); 1175 for (unp = LIST_FIRST(head), i = 0; unp && i < n; 1176 unp = LIST_NEXT(unp, unp_link)) { 1177 if (unp->unp_gencnt <= gencnt) { 1178 if (cr_cansee(req->td->td_ucred, 1179 unp->unp_socket->so_cred)) 1180 continue; 1181 unp_list[i++] = unp; 1182 unp->unp_refcount++; 1183 } 1184 } 1185 UNP_UNLOCK(); 1186 n = i; /* In case we lost some during malloc. */ 1187 1188 error = 0; 1189 xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); 1190 for (i = 0; i < n; i++) { 1191 unp = unp_list[i]; 1192 UNP_LOCK(); 1193 unp->unp_refcount--; 1194 if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) { 1195 xu->xu_len = sizeof *xu; 1196 xu->xu_unpp = unp; 1197 /* 1198 * XXX - need more locking here to protect against 1199 * connect/disconnect races for SMP. 1200 */ 1201 if (unp->unp_addr != NULL) 1202 bcopy(unp->unp_addr, &xu->xu_addr, 1203 unp->unp_addr->sun_len); 1204 if (unp->unp_conn != NULL && 1205 unp->unp_conn->unp_addr != NULL) 1206 bcopy(unp->unp_conn->unp_addr, 1207 &xu->xu_caddr, 1208 unp->unp_conn->unp_addr->sun_len); 1209 bcopy(unp, &xu->xu_unp, sizeof *unp); 1210 sotoxsocket(unp->unp_socket, &xu->xu_socket); 1211 UNP_UNLOCK(); 1212 error = SYSCTL_OUT(req, xu, sizeof *xu); 1213 } else { 1214 freeunp = (unp->unp_refcount == 0); 1215 UNP_UNLOCK(); 1216 if (freeunp) 1217 uma_zfree(unp_zone, unp); 1218 } 1219 } 1220 free(xu, M_TEMP); 1221 if (!error) { 1222 /* 1223 * Give the user an updated idea of our state. If the 1224 * generation differs from what we told her before, she knows 1225 * that something happened while we were processing this 1226 * request, and it might be necessary to retry. 1227 */ 1228 xug->xug_gen = unp_gencnt; 1229 xug->xug_sogen = so_gencnt; 1230 xug->xug_count = unp_count; 1231 error = SYSCTL_OUT(req, xug, sizeof *xug); 1232 } 1233 free(unp_list, M_TEMP); 1234 free(xug, M_TEMP); 1235 return (error); 1236 } 1237 1238 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, 1239 (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", 1240 "List of active local datagram sockets"); 1241 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, 1242 (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", 1243 "List of active local stream sockets"); 1244 1245 static void 1246 unp_shutdown(struct unpcb *unp) 1247 { 1248 struct socket *so; 1249 1250 UNP_LOCK_ASSERT(); 1251 1252 if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && 1253 (so = unp->unp_conn->unp_socket)) 1254 socantrcvmore(so); 1255 } 1256 1257 static void 1258 unp_drop(struct unpcb *unp, int errno) 1259 { 1260 struct socket *so = unp->unp_socket; 1261 1262 UNP_LOCK_ASSERT(); 1263 1264 so->so_error = errno; 1265 unp_disconnect(unp); 1266 } 1267 1268 static void 1269 unp_freerights(struct file **rp, int fdcount) 1270 { 1271 int i; 1272 struct file *fp; 1273 1274 for (i = 0; i < fdcount; i++) { 1275 fp = *rp; 1276 /* 1277 * Zero the pointer before calling unp_discard since it may 1278 * end up in unp_gc().. 1279 * 1280 * XXXRW: This is less true than it used to be. 1281 */ 1282 *rp++ = 0; 1283 unp_discard(fp); 1284 } 1285 } 1286 1287 int 1288 unp_externalize(struct mbuf *control, struct mbuf **controlp) 1289 { 1290 struct thread *td = curthread; /* XXX */ 1291 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1292 int i; 1293 int *fdp; 1294 struct file **rp; 1295 struct file *fp; 1296 void *data; 1297 socklen_t clen = control->m_len, datalen; 1298 int error, newfds; 1299 int f; 1300 u_int newlen; 1301 1302 UNP_UNLOCK_ASSERT(); 1303 1304 error = 0; 1305 if (controlp != NULL) /* controlp == NULL => free control messages */ 1306 *controlp = NULL; 1307 1308 while (cm != NULL) { 1309 if (sizeof(*cm) > clen || cm->cmsg_len > clen) { 1310 error = EINVAL; 1311 break; 1312 } 1313 1314 data = CMSG_DATA(cm); 1315 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 1316 1317 if (cm->cmsg_level == SOL_SOCKET 1318 && cm->cmsg_type == SCM_RIGHTS) { 1319 newfds = datalen / sizeof(struct file *); 1320 rp = data; 1321 1322 /* If we're not outputting the descriptors free them. */ 1323 if (error || controlp == NULL) { 1324 unp_freerights(rp, newfds); 1325 goto next; 1326 } 1327 FILEDESC_LOCK(td->td_proc->p_fd); 1328 /* if the new FD's will not fit free them. */ 1329 if (!fdavail(td, newfds)) { 1330 FILEDESC_UNLOCK(td->td_proc->p_fd); 1331 error = EMSGSIZE; 1332 unp_freerights(rp, newfds); 1333 goto next; 1334 } 1335 /* 1336 * Now change each pointer to an fd in the global 1337 * table to an integer that is the index to the local 1338 * fd table entry that we set up to point to the 1339 * global one we are transferring. 1340 */ 1341 newlen = newfds * sizeof(int); 1342 *controlp = sbcreatecontrol(NULL, newlen, 1343 SCM_RIGHTS, SOL_SOCKET); 1344 if (*controlp == NULL) { 1345 FILEDESC_UNLOCK(td->td_proc->p_fd); 1346 error = E2BIG; 1347 unp_freerights(rp, newfds); 1348 goto next; 1349 } 1350 1351 fdp = (int *) 1352 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1353 for (i = 0; i < newfds; i++) { 1354 if (fdalloc(td, 0, &f)) 1355 panic("unp_externalize fdalloc failed"); 1356 fp = *rp++; 1357 td->td_proc->p_fd->fd_ofiles[f] = fp; 1358 FILE_LOCK(fp); 1359 fp->f_msgcount--; 1360 FILE_UNLOCK(fp); 1361 unp_rights--; 1362 *fdp++ = f; 1363 } 1364 FILEDESC_UNLOCK(td->td_proc->p_fd); 1365 } else { 1366 /* We can just copy anything else across. */ 1367 if (error || controlp == NULL) 1368 goto next; 1369 *controlp = sbcreatecontrol(NULL, datalen, 1370 cm->cmsg_type, cm->cmsg_level); 1371 if (*controlp == NULL) { 1372 error = ENOBUFS; 1373 goto next; 1374 } 1375 bcopy(data, 1376 CMSG_DATA(mtod(*controlp, struct cmsghdr *)), 1377 datalen); 1378 } 1379 1380 controlp = &(*controlp)->m_next; 1381 1382 next: 1383 if (CMSG_SPACE(datalen) < clen) { 1384 clen -= CMSG_SPACE(datalen); 1385 cm = (struct cmsghdr *) 1386 ((caddr_t)cm + CMSG_SPACE(datalen)); 1387 } else { 1388 clen = 0; 1389 cm = NULL; 1390 } 1391 } 1392 1393 m_freem(control); 1394 1395 return (error); 1396 } 1397 1398 static void 1399 unp_zone_change(void *tag) 1400 { 1401 1402 uma_zone_set_max(unp_zone, maxsockets); 1403 } 1404 1405 void 1406 unp_init(void) 1407 { 1408 1409 unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, 1410 NULL, NULL, UMA_ALIGN_PTR, 0); 1411 if (unp_zone == NULL) 1412 panic("unp_init"); 1413 uma_zone_set_max(unp_zone, maxsockets); 1414 EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, 1415 NULL, EVENTHANDLER_PRI_ANY); 1416 LIST_INIT(&unp_dhead); 1417 LIST_INIT(&unp_shead); 1418 TASK_INIT(&unp_gc_task, 0, unp_gc, NULL); 1419 UNP_LOCK_INIT(); 1420 } 1421 1422 static int 1423 unp_internalize(struct mbuf **controlp, struct thread *td) 1424 { 1425 struct mbuf *control = *controlp; 1426 struct proc *p = td->td_proc; 1427 struct filedesc *fdescp = p->p_fd; 1428 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 1429 struct cmsgcred *cmcred; 1430 struct file **rp; 1431 struct file *fp; 1432 struct timeval *tv; 1433 int i, fd, *fdp; 1434 void *data; 1435 socklen_t clen = control->m_len, datalen; 1436 int error, oldfds; 1437 u_int newlen; 1438 1439 UNP_UNLOCK_ASSERT(); 1440 1441 error = 0; 1442 *controlp = NULL; 1443 1444 while (cm != NULL) { 1445 if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET 1446 || cm->cmsg_len > clen) { 1447 error = EINVAL; 1448 goto out; 1449 } 1450 1451 data = CMSG_DATA(cm); 1452 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 1453 1454 switch (cm->cmsg_type) { 1455 /* 1456 * Fill in credential information. 1457 */ 1458 case SCM_CREDS: 1459 *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), 1460 SCM_CREDS, SOL_SOCKET); 1461 if (*controlp == NULL) { 1462 error = ENOBUFS; 1463 goto out; 1464 } 1465 1466 cmcred = (struct cmsgcred *) 1467 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1468 cmcred->cmcred_pid = p->p_pid; 1469 cmcred->cmcred_uid = td->td_ucred->cr_ruid; 1470 cmcred->cmcred_gid = td->td_ucred->cr_rgid; 1471 cmcred->cmcred_euid = td->td_ucred->cr_uid; 1472 cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, 1473 CMGROUP_MAX); 1474 for (i = 0; i < cmcred->cmcred_ngroups; i++) 1475 cmcred->cmcred_groups[i] = 1476 td->td_ucred->cr_groups[i]; 1477 break; 1478 1479 case SCM_RIGHTS: 1480 oldfds = datalen / sizeof (int); 1481 /* 1482 * Check that all the FDs passed in refer to legal 1483 * files. If not, reject the entire operation. 1484 */ 1485 fdp = data; 1486 FILEDESC_LOCK(fdescp); 1487 for (i = 0; i < oldfds; i++) { 1488 fd = *fdp++; 1489 if ((unsigned)fd >= fdescp->fd_nfiles || 1490 fdescp->fd_ofiles[fd] == NULL) { 1491 FILEDESC_UNLOCK(fdescp); 1492 error = EBADF; 1493 goto out; 1494 } 1495 fp = fdescp->fd_ofiles[fd]; 1496 if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { 1497 FILEDESC_UNLOCK(fdescp); 1498 error = EOPNOTSUPP; 1499 goto out; 1500 } 1501 1502 } 1503 /* 1504 * Now replace the integer FDs with pointers to the 1505 * associated global file table entry.. 1506 */ 1507 newlen = oldfds * sizeof(struct file *); 1508 *controlp = sbcreatecontrol(NULL, newlen, 1509 SCM_RIGHTS, SOL_SOCKET); 1510 if (*controlp == NULL) { 1511 FILEDESC_UNLOCK(fdescp); 1512 error = E2BIG; 1513 goto out; 1514 } 1515 1516 fdp = data; 1517 rp = (struct file **) 1518 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1519 for (i = 0; i < oldfds; i++) { 1520 fp = fdescp->fd_ofiles[*fdp++]; 1521 *rp++ = fp; 1522 FILE_LOCK(fp); 1523 fp->f_count++; 1524 fp->f_msgcount++; 1525 FILE_UNLOCK(fp); 1526 unp_rights++; 1527 } 1528 FILEDESC_UNLOCK(fdescp); 1529 break; 1530 1531 case SCM_TIMESTAMP: 1532 *controlp = sbcreatecontrol(NULL, sizeof(*tv), 1533 SCM_TIMESTAMP, SOL_SOCKET); 1534 if (*controlp == NULL) { 1535 error = ENOBUFS; 1536 goto out; 1537 } 1538 tv = (struct timeval *) 1539 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 1540 microtime(tv); 1541 break; 1542 1543 default: 1544 error = EINVAL; 1545 goto out; 1546 } 1547 1548 controlp = &(*controlp)->m_next; 1549 1550 if (CMSG_SPACE(datalen) < clen) { 1551 clen -= CMSG_SPACE(datalen); 1552 cm = (struct cmsghdr *) 1553 ((caddr_t)cm + CMSG_SPACE(datalen)); 1554 } else { 1555 clen = 0; 1556 cm = NULL; 1557 } 1558 } 1559 1560 out: 1561 m_freem(control); 1562 1563 return (error); 1564 } 1565 1566 struct mbuf * 1567 unp_addsockcred(struct thread *td, struct mbuf *control) 1568 { 1569 struct mbuf *m, *n, *n_prev; 1570 struct sockcred *sc; 1571 const struct cmsghdr *cm; 1572 int ngroups; 1573 int i; 1574 1575 ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); 1576 1577 m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET); 1578 if (m == NULL) 1579 return (control); 1580 1581 sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *)); 1582 sc->sc_uid = td->td_ucred->cr_ruid; 1583 sc->sc_euid = td->td_ucred->cr_uid; 1584 sc->sc_gid = td->td_ucred->cr_rgid; 1585 sc->sc_egid = td->td_ucred->cr_gid; 1586 sc->sc_ngroups = ngroups; 1587 for (i = 0; i < sc->sc_ngroups; i++) 1588 sc->sc_groups[i] = td->td_ucred->cr_groups[i]; 1589 1590 /* 1591 * Unlink SCM_CREDS control messages (struct cmsgcred), since just 1592 * created SCM_CREDS control message (struct sockcred) has another 1593 * format. 1594 */ 1595 if (control != NULL) 1596 for (n = control, n_prev = NULL; n != NULL;) { 1597 cm = mtod(n, struct cmsghdr *); 1598 if (cm->cmsg_level == SOL_SOCKET && 1599 cm->cmsg_type == SCM_CREDS) { 1600 if (n_prev == NULL) 1601 control = n->m_next; 1602 else 1603 n_prev->m_next = n->m_next; 1604 n = m_free(n); 1605 } else { 1606 n_prev = n; 1607 n = n->m_next; 1608 } 1609 } 1610 1611 /* Prepend it to the head. */ 1612 m->m_next = control; 1613 1614 return (m); 1615 } 1616 1617 /* 1618 * unp_defer indicates whether additional work has been defered for a future 1619 * pass through unp_gc(). It is thread local and does not require explicit 1620 * synchronization. 1621 */ 1622 static int unp_defer; 1623 1624 static int unp_taskcount; 1625 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); 1626 1627 static int unp_recycled; 1628 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, ""); 1629 1630 static void 1631 unp_gc(__unused void *arg, int pending) 1632 { 1633 struct file *fp, *nextfp; 1634 struct socket *so; 1635 struct file **extra_ref, **fpp; 1636 int nunref, i; 1637 int nfiles_snap; 1638 int nfiles_slack = 20; 1639 1640 unp_taskcount++; 1641 unp_defer = 0; 1642 /* 1643 * Before going through all this, set all FDs to be NOT deferred and 1644 * NOT externally accessible. 1645 */ 1646 sx_slock(&filelist_lock); 1647 LIST_FOREACH(fp, &filehead, f_list) 1648 fp->f_gcflag &= ~(FMARK|FDEFER); 1649 do { 1650 KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer)); 1651 LIST_FOREACH(fp, &filehead, f_list) { 1652 FILE_LOCK(fp); 1653 /* 1654 * If the file is not open, skip it -- could be a 1655 * file in the process of being opened, or in the 1656 * process of being closed. If the file is 1657 * "closing", it may have been marked for deferred 1658 * consideration. Clear the flag now if so. 1659 */ 1660 if (fp->f_count == 0) { 1661 if (fp->f_gcflag & FDEFER) 1662 unp_defer--; 1663 fp->f_gcflag &= ~(FMARK|FDEFER); 1664 FILE_UNLOCK(fp); 1665 continue; 1666 } 1667 /* 1668 * If we already marked it as 'defer' in a 1669 * previous pass, then try to process it this 1670 * time and un-mark it. 1671 */ 1672 if (fp->f_gcflag & FDEFER) { 1673 fp->f_gcflag &= ~FDEFER; 1674 unp_defer--; 1675 } else { 1676 /* 1677 * if it's not deferred, then check if it's 1678 * already marked.. if so skip it 1679 */ 1680 if (fp->f_gcflag & FMARK) { 1681 FILE_UNLOCK(fp); 1682 continue; 1683 } 1684 /* 1685 * If all references are from messages in 1686 * transit, then skip it. it's not externally 1687 * accessible. 1688 */ 1689 if (fp->f_count == fp->f_msgcount) { 1690 FILE_UNLOCK(fp); 1691 continue; 1692 } 1693 /* 1694 * If it got this far then it must be 1695 * externally accessible. 1696 */ 1697 fp->f_gcflag |= FMARK; 1698 } 1699 /* 1700 * Either it was deferred, or it is externally 1701 * accessible and not already marked so. Now check 1702 * if it is possibly one of OUR sockets. 1703 */ 1704 if (fp->f_type != DTYPE_SOCKET || 1705 (so = fp->f_data) == NULL) { 1706 FILE_UNLOCK(fp); 1707 continue; 1708 } 1709 if (so->so_proto->pr_domain != &localdomain || 1710 (so->so_proto->pr_flags & PR_RIGHTS) == 0) { 1711 FILE_UNLOCK(fp); 1712 continue; 1713 } 1714 1715 /* 1716 * Tell any other threads that do a subsequent 1717 * fdrop() that we are scanning the message 1718 * buffers. 1719 */ 1720 fp->f_gcflag |= FWAIT; 1721 FILE_UNLOCK(fp); 1722 1723 /* 1724 * So, Ok, it's one of our sockets and it IS 1725 * externally accessible (or was deferred). Now we 1726 * look to see if we hold any file descriptors in its 1727 * message buffers. Follow those links and mark them 1728 * as accessible too. 1729 */ 1730 SOCKBUF_LOCK(&so->so_rcv); 1731 unp_scan(so->so_rcv.sb_mb, unp_mark); 1732 SOCKBUF_UNLOCK(&so->so_rcv); 1733 1734 /* 1735 * Wake up any threads waiting in fdrop(). 1736 */ 1737 FILE_LOCK(fp); 1738 fp->f_gcflag &= ~FWAIT; 1739 wakeup(&fp->f_gcflag); 1740 FILE_UNLOCK(fp); 1741 } 1742 } while (unp_defer); 1743 sx_sunlock(&filelist_lock); 1744 /* 1745 * XXXRW: The following comments need updating for a post-SMPng and 1746 * deferred unp_gc() world, but are still generally accurate. 1747 * 1748 * We grab an extra reference to each of the file table entries that 1749 * are not otherwise accessible and then free the rights that are 1750 * stored in messages on them. 1751 * 1752 * The bug in the orginal code is a little tricky, so I'll describe 1753 * what's wrong with it here. 1754 * 1755 * It is incorrect to simply unp_discard each entry for f_msgcount 1756 * times -- consider the case of sockets A and B that contain 1757 * references to each other. On a last close of some other socket, 1758 * we trigger a gc since the number of outstanding rights (unp_rights) 1759 * is non-zero. If during the sweep phase the gc code unp_discards, 1760 * we end up doing a (full) closef on the descriptor. A closef on A 1761 * results in the following chain. Closef calls soo_close, which 1762 * calls soclose. Soclose calls first (through the switch 1763 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply 1764 * returns because the previous instance had set unp_gcing, and we 1765 * return all the way back to soclose, which marks the socket with 1766 * SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free 1767 * up the rights that are queued in messages on the socket A, i.e., 1768 * the reference on B. The sorflush calls via the dom_dispose switch 1769 * unp_dispose, which unp_scans with unp_discard. This second 1770 * instance of unp_discard just calls closef on B. 1771 * 1772 * Well, a similar chain occurs on B, resulting in a sorflush on B, 1773 * which results in another closef on A. Unfortunately, A is already 1774 * being closed, and the descriptor has already been marked with 1775 * SS_NOFDREF, and soclose panics at this point. 1776 * 1777 * Here, we first take an extra reference to each inaccessible 1778 * descriptor. Then, we call sorflush ourself, since we know it is a 1779 * Unix domain socket anyhow. After we destroy all the rights 1780 * carried in messages, we do a last closef to get rid of our extra 1781 * reference. This is the last close, and the unp_detach etc will 1782 * shut down the socket. 1783 * 1784 * 91/09/19, bsy@cs.cmu.edu 1785 */ 1786 again: 1787 nfiles_snap = openfiles + nfiles_slack; /* some slack */ 1788 extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP, 1789 M_WAITOK); 1790 sx_slock(&filelist_lock); 1791 if (nfiles_snap < openfiles) { 1792 sx_sunlock(&filelist_lock); 1793 free(extra_ref, M_TEMP); 1794 nfiles_slack += 20; 1795 goto again; 1796 } 1797 for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; 1798 fp != NULL; fp = nextfp) { 1799 nextfp = LIST_NEXT(fp, f_list); 1800 FILE_LOCK(fp); 1801 /* 1802 * If it's not open, skip it 1803 */ 1804 if (fp->f_count == 0) { 1805 FILE_UNLOCK(fp); 1806 continue; 1807 } 1808 /* 1809 * If all refs are from msgs, and it's not marked accessible 1810 * then it must be referenced from some unreachable cycle of 1811 * (shut-down) FDs, so include it in our list of FDs to 1812 * remove. 1813 */ 1814 if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { 1815 *fpp++ = fp; 1816 nunref++; 1817 fp->f_count++; 1818 } 1819 FILE_UNLOCK(fp); 1820 } 1821 sx_sunlock(&filelist_lock); 1822 /* 1823 * For each FD on our hit list, do the following two things: 1824 */ 1825 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1826 struct file *tfp = *fpp; 1827 FILE_LOCK(tfp); 1828 if (tfp->f_type == DTYPE_SOCKET && 1829 tfp->f_data != NULL) { 1830 FILE_UNLOCK(tfp); 1831 sorflush(tfp->f_data); 1832 } else { 1833 FILE_UNLOCK(tfp); 1834 } 1835 } 1836 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { 1837 closef(*fpp, (struct thread *) NULL); 1838 unp_recycled++; 1839 } 1840 free(extra_ref, M_TEMP); 1841 } 1842 1843 void 1844 unp_dispose(struct mbuf *m) 1845 { 1846 1847 if (m) 1848 unp_scan(m, unp_discard); 1849 } 1850 1851 static int 1852 unp_listen(struct socket *so, struct unpcb *unp, int backlog, 1853 struct thread *td) 1854 { 1855 int error; 1856 1857 UNP_LOCK_ASSERT(); 1858 1859 SOCK_LOCK(so); 1860 error = solisten_proto_check(so); 1861 if (error == 0) { 1862 cru2x(td->td_ucred, &unp->unp_peercred); 1863 unp->unp_flags |= UNP_HAVEPCCACHED; 1864 solisten_proto(so, backlog); 1865 } 1866 SOCK_UNLOCK(so); 1867 return (error); 1868 } 1869 1870 static void 1871 unp_scan(struct mbuf *m0, void (*op)(struct file *)) 1872 { 1873 struct mbuf *m; 1874 struct file **rp; 1875 struct cmsghdr *cm; 1876 void *data; 1877 int i; 1878 socklen_t clen, datalen; 1879 int qfds; 1880 1881 while (m0 != NULL) { 1882 for (m = m0; m; m = m->m_next) { 1883 if (m->m_type != MT_CONTROL) 1884 continue; 1885 1886 cm = mtod(m, struct cmsghdr *); 1887 clen = m->m_len; 1888 1889 while (cm != NULL) { 1890 if (sizeof(*cm) > clen || cm->cmsg_len > clen) 1891 break; 1892 1893 data = CMSG_DATA(cm); 1894 datalen = (caddr_t)cm + cm->cmsg_len 1895 - (caddr_t)data; 1896 1897 if (cm->cmsg_level == SOL_SOCKET && 1898 cm->cmsg_type == SCM_RIGHTS) { 1899 qfds = datalen / sizeof (struct file *); 1900 rp = data; 1901 for (i = 0; i < qfds; i++) 1902 (*op)(*rp++); 1903 } 1904 1905 if (CMSG_SPACE(datalen) < clen) { 1906 clen -= CMSG_SPACE(datalen); 1907 cm = (struct cmsghdr *) 1908 ((caddr_t)cm + CMSG_SPACE(datalen)); 1909 } else { 1910 clen = 0; 1911 cm = NULL; 1912 } 1913 } 1914 } 1915 m0 = m0->m_act; 1916 } 1917 } 1918 1919 static void 1920 unp_mark(struct file *fp) 1921 { 1922 if (fp->f_gcflag & FMARK) 1923 return; 1924 unp_defer++; 1925 fp->f_gcflag |= (FMARK|FDEFER); 1926 } 1927 1928 static void 1929 unp_discard(struct file *fp) 1930 { 1931 UNP_LOCK(); 1932 FILE_LOCK(fp); 1933 fp->f_msgcount--; 1934 unp_rights--; 1935 FILE_UNLOCK(fp); 1936 UNP_UNLOCK(); 1937 (void) closef(fp, (struct thread *)NULL); 1938 } 1939