1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1989, 1991, 1993 5 * The Regents of the University of California. All Rights Reserved. 6 * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved. 7 * Copyright (c) 2018 Matthew Macy 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 34 */ 35 36 /* 37 * UNIX Domain (Local) Sockets 38 * 39 * This is an implementation of UNIX (local) domain sockets. 
Each socket has 40 * an associated struct unpcb (UNIX protocol control block). Stream sockets 41 * may be connected to 0 or 1 other socket. Datagram sockets may be 42 * connected to 0, 1, or many other sockets. Sockets may be created and 43 * connected in pairs (socketpair(2)), or bound/connected to using the file 44 * system name space. For most purposes, only the receive socket buffer is 45 * used, as sending on one socket delivers directly to the receive socket 46 * buffer of a second socket. 47 * 48 * The implementation is substantially complicated by the fact that 49 * "ancillary data", such as file descriptors or credentials, may be passed 50 * across UNIX domain sockets. The potential for passing UNIX domain sockets 51 * over other UNIX domain sockets requires the implementation of a simple 52 * garbage collector to find and tear down cycles of disconnected sockets. 53 * 54 * TODO: 55 * RDM 56 * rethink name space problems 57 * need a proper out-of-band 58 */ 59 60 #include <sys/cdefs.h> 61 __FBSDID("$FreeBSD$"); 62 63 #include "opt_ddb.h" 64 65 #include <sys/param.h> 66 #include <sys/capsicum.h> 67 #include <sys/domain.h> 68 #include <sys/eventhandler.h> 69 #include <sys/fcntl.h> 70 #include <sys/file.h> 71 #include <sys/filedesc.h> 72 #include <sys/kernel.h> 73 #include <sys/lock.h> 74 #include <sys/malloc.h> 75 #include <sys/mbuf.h> 76 #include <sys/mount.h> 77 #include <sys/mutex.h> 78 #include <sys/namei.h> 79 #include <sys/proc.h> 80 #include <sys/protosw.h> 81 #include <sys/queue.h> 82 #include <sys/resourcevar.h> 83 #include <sys/rwlock.h> 84 #include <sys/socket.h> 85 #include <sys/socketvar.h> 86 #include <sys/signalvar.h> 87 #include <sys/stat.h> 88 #include <sys/sx.h> 89 #include <sys/sysctl.h> 90 #include <sys/systm.h> 91 #include <sys/taskqueue.h> 92 #include <sys/un.h> 93 #include <sys/unpcb.h> 94 #include <sys/vnode.h> 95 96 #include <net/vnet.h> 97 98 #ifdef DDB 99 #include <ddb/ddb.h> 100 #endif 101 102 #include 
<security/mac/mac_framework.h>

#include <vm/uma.h>

MALLOC_DECLARE(M_FILECAPS);

/*
 * See unpcb.h for the locking key.  The parenthesised letters in the
 * per-variable comments below refer to that key.
 */

static uma_zone_t	unp_zone;
static unp_gen_t	unp_gencnt;	/* (l) */
static u_int		unp_count;	/* (l) Count of local sockets. */
static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
static int		unp_rights;	/* (g) File descriptors in flight. */
static struct unp_head	unp_shead;	/* (l) List of stream sockets. */
static struct unp_head	unp_dhead;	/* (l) List of datagram sockets. */
static struct unp_head	unp_sphead;	/* (l) List of seqpacket sockets. */

/*
 * One entry on the unp_defers list.  The list and its counter are reported
 * through the net.local.deferred sysctl below as "File descriptors deferred
 * to taskqueue for close."
 */
struct unp_defer {
	SLIST_ENTRY(unp_defer) ud_link;	/* Linkage on unp_defers. */
	struct file *ud_fp;		/* File the entry refers to. */
};
static SLIST_HEAD(, unp_defer) unp_defers;
static int unp_defers_count;

/* Address handed out when a socket has no bound name. */
static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };

/*
 * Garbage collection of cyclic file descriptor/socket references occurs
 * asynchronously in a taskqueue context in order to avoid recursion and
 * reentrance in the UNIX domain socket, file descriptor, and socket layer
 * code.  See unp_gc() for a full description.
 */
static struct timeout_task unp_gc_task;

/*
 * The close of UNIX domain sockets attached as SCM_RIGHTS is postponed to
 * the taskqueue, to avoid arbitrary recursion depth.  The attached sockets
 * might themselves have other sockets attached.
 */
static struct task	unp_defer_task;

/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
 * stream sockets, although the total for sender and receiver is actually
 * only PIPSIZ.
 *
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should be
 * large enough for at least one max-size datagram plus address.
 */
#ifndef PIPSIZ
#define	PIPSIZ	8192
#endif
static u_long	unpst_sendspace = PIPSIZ;
static u_long	unpst_recvspace = PIPSIZ;
static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
static u_long	unpdg_recvspace = 16*1024;	/* support 8KB syslog msgs */
static u_long	unpsp_sendspace = PIPSIZ;	/* really max datagram size */
static u_long	unpsp_recvspace = PIPSIZ;

/* sysctl tree: net.local and one child node per socket type. */
static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Local domain");
static SYSCTL_NODE(_net_local, SOCK_STREAM, stream,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_STREAM");
static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_DGRAM");
static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_SEQPACKET");

/* Writable defaults for the buffer reservations made in uipc_attach(). */
SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
	   &unpst_sendspace, 0, "Default stream send space.");
SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpst_recvspace, 0, "Default stream receive space.");
SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
	   &unpdg_sendspace, 0, "Default datagram send space.");
SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpdg_recvspace, 0, "Default datagram receive space.");
SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
	   &unpsp_sendspace, 0, "Default seqpacket send space.");
SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpsp_recvspace, 0, "Default seqpacket receive space.");
/* Read-only counters. */
SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
    "File descriptors in flight.");
SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
    &unp_defers_count, 0,
    "File descriptors deferred to taskqueue for close.");

/*
 * Locking and synchronization:
 *
 * Several types of locks exist in the local domain socket implementation:
 * - a global linkage lock
 * - a global connection list lock
 * - the mtxpool lock
 * - per-unpcb mutexes
 *
 * The linkage lock protects the global socket lists, the generation number
 * counter and garbage collector state.
 *
 * The connection list lock protects the list of referring sockets in a
 * datagram socket PCB.  This lock is also overloaded to protect a global
 * list of sockets whose buffers contain socket references in the form of
 * SCM_RIGHTS messages.  To avoid recursion, such references are released by
 * a dedicated thread.
 *
 * The mtxpool lock protects the vnode from being modified while referenced.
 * Lock ordering rules require that it be acquired before any PCB locks.
 *
 * The unpcb lock (unp_mtx) protects the most commonly referenced fields in
 * the unpcb.  This includes the unp_conn field, which either links two
 * connected PCBs together (for connected socket types) or points at the
 * destination socket (for connectionless socket types).  The operations of
 * creating or destroying a connection therefore involve locking multiple
 * PCBs.  To avoid lock order reversals, in some cases this involves dropping
 * a PCB lock and using a reference counter to maintain liveness.
 *
 * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
 * allocated in pru_attach() and freed in pru_detach().  The validity of that
 * pointer is an invariant, so no lock is required to dereference the so_pcb
 * pointer if a valid socket reference is held by the caller.  In practice,
 * this is always true during operations performed on a socket.  Each unpcb
 * has a back-pointer to its socket, unp_socket, which will be stable under
 * the same circumstances.
 *
 * This pointer may only be safely dereferenced as long as a valid reference
 * to the unpcb is held.  Typically, this reference will be from the socket,
 * or from another unpcb when the referring unpcb's lock is held (in order
 * that the reference not be invalidated during use).  For example, to follow
 * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
 * that detach is not run clearing unp_socket.
 *
 * Blocking with UNIX domain sockets is a tricky issue: unlike most network
 * protocols, bind() is a non-atomic operation, and connect() requires
 * potential sleeping in the protocol, due to potentially waiting on local or
 * distributed file systems.  We try to separate "lookup" operations, which
 * may sleep, and the IPC operations themselves, which typically can occur
 * with relative atomicity as locks can be held over the entire operation.
 *
 * Another tricky issue is simultaneous multi-threaded or multi-process
 * access to a single UNIX domain socket.  These are handled by the flags
 * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
 * binding, both of which involve dropping UNIX domain socket locks in order
 * to perform namei() and other file system operations.
250 */ 251 static struct rwlock unp_link_rwlock; 252 static struct mtx unp_defers_lock; 253 254 #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \ 255 "unp_link_rwlock") 256 257 #define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \ 258 RA_LOCKED) 259 #define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ 260 RA_UNLOCKED) 261 262 #define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock) 263 #define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock) 264 #define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock) 265 #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock) 266 #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ 267 RA_WLOCKED) 268 #define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock) 269 270 #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \ 271 "unp_defer", NULL, MTX_DEF) 272 #define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock) 273 #define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock) 274 275 #define UNP_REF_LIST_LOCK() UNP_DEFERRED_LOCK(); 276 #define UNP_REF_LIST_UNLOCK() UNP_DEFERRED_UNLOCK(); 277 278 #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \ 279 "unp", "unp", \ 280 MTX_DUPOK|MTX_DEF) 281 #define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx) 282 #define UNP_PCB_LOCKPTR(unp) (&(unp)->unp_mtx) 283 #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx) 284 #define UNP_PCB_TRYLOCK(unp) mtx_trylock(&(unp)->unp_mtx) 285 #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx) 286 #define UNP_PCB_OWNED(unp) mtx_owned(&(unp)->unp_mtx) 287 #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED) 288 #define UNP_PCB_UNLOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED) 289 290 static int uipc_connect2(struct socket *, struct socket *); 291 static int uipc_ctloutput(struct socket *, struct sockopt *); 292 static int unp_connect(struct socket *, struct sockaddr *, 293 struct thread *); 294 static int unp_connectat(int, struct socket *, struct sockaddr *, 295 struct thread *); 296 static 
void unp_connect2(struct socket *so, struct socket *so2, int); 297 static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2); 298 static void unp_dispose(struct socket *so); 299 static void unp_dispose_mbuf(struct mbuf *); 300 static void unp_shutdown(struct unpcb *); 301 static void unp_drop(struct unpcb *); 302 static void unp_gc(__unused void *, int); 303 static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int)); 304 static void unp_discard(struct file *); 305 static void unp_freerights(struct filedescent **, int); 306 static int unp_internalize(struct mbuf **, struct thread *); 307 static void unp_internalize_fp(struct file *); 308 static int unp_externalize(struct mbuf *, struct mbuf **, int); 309 static int unp_externalize_fp(struct file *); 310 static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *, int); 311 static void unp_process_defers(void * __unused, int); 312 313 static void 314 unp_pcb_hold(struct unpcb *unp) 315 { 316 u_int old __unused; 317 318 old = refcount_acquire(&unp->unp_refcount); 319 KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp)); 320 } 321 322 static __result_use_check bool 323 unp_pcb_rele(struct unpcb *unp) 324 { 325 bool ret; 326 327 UNP_PCB_LOCK_ASSERT(unp); 328 329 if ((ret = refcount_release(&unp->unp_refcount))) { 330 UNP_PCB_UNLOCK(unp); 331 UNP_PCB_LOCK_DESTROY(unp); 332 uma_zfree(unp_zone, unp); 333 } 334 return (ret); 335 } 336 337 static void 338 unp_pcb_rele_notlast(struct unpcb *unp) 339 { 340 bool ret __unused; 341 342 ret = refcount_release(&unp->unp_refcount); 343 KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp)); 344 } 345 346 static void 347 unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2) 348 { 349 UNP_PCB_UNLOCK_ASSERT(unp); 350 UNP_PCB_UNLOCK_ASSERT(unp2); 351 352 if (unp == unp2) { 353 UNP_PCB_LOCK(unp); 354 } else if ((uintptr_t)unp2 > (uintptr_t)unp) { 355 UNP_PCB_LOCK(unp); 356 UNP_PCB_LOCK(unp2); 357 } else { 358 UNP_PCB_LOCK(unp2); 
359 UNP_PCB_LOCK(unp); 360 } 361 } 362 363 static void 364 unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2) 365 { 366 UNP_PCB_UNLOCK(unp); 367 if (unp != unp2) 368 UNP_PCB_UNLOCK(unp2); 369 } 370 371 /* 372 * Try to lock the connected peer of an already locked socket. In some cases 373 * this requires that we unlock the current socket. The pairbusy counter is 374 * used to block concurrent connection attempts while the lock is dropped. The 375 * caller must be careful to revalidate PCB state. 376 */ 377 static struct unpcb * 378 unp_pcb_lock_peer(struct unpcb *unp) 379 { 380 struct unpcb *unp2; 381 382 UNP_PCB_LOCK_ASSERT(unp); 383 unp2 = unp->unp_conn; 384 if (unp2 == NULL) 385 return (NULL); 386 if (__predict_false(unp == unp2)) 387 return (unp); 388 389 UNP_PCB_UNLOCK_ASSERT(unp2); 390 391 if (__predict_true(UNP_PCB_TRYLOCK(unp2))) 392 return (unp2); 393 if ((uintptr_t)unp2 > (uintptr_t)unp) { 394 UNP_PCB_LOCK(unp2); 395 return (unp2); 396 } 397 unp->unp_pairbusy++; 398 unp_pcb_hold(unp2); 399 UNP_PCB_UNLOCK(unp); 400 401 UNP_PCB_LOCK(unp2); 402 UNP_PCB_LOCK(unp); 403 KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL, 404 ("%s: socket %p was reconnected", __func__, unp)); 405 if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) { 406 unp->unp_flags &= ~UNP_WAITING; 407 wakeup(unp); 408 } 409 if (unp_pcb_rele(unp2)) { 410 /* unp2 is unlocked. */ 411 return (NULL); 412 } 413 if (unp->unp_conn == NULL) { 414 UNP_PCB_UNLOCK(unp2); 415 return (NULL); 416 } 417 return (unp2); 418 } 419 420 /* 421 * Definitions of protocols supported in the LOCAL domain. 
422 */ 423 static struct domain localdomain; 424 static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream; 425 static struct pr_usrreqs uipc_usrreqs_seqpacket; 426 static struct protosw localsw[] = { 427 { 428 .pr_type = SOCK_STREAM, 429 .pr_domain = &localdomain, 430 .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS| 431 PR_CAPATTACH, 432 .pr_ctloutput = &uipc_ctloutput, 433 .pr_usrreqs = &uipc_usrreqs_stream 434 }, 435 { 436 .pr_type = SOCK_DGRAM, 437 .pr_domain = &localdomain, 438 .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS|PR_CAPATTACH, 439 .pr_ctloutput = &uipc_ctloutput, 440 .pr_usrreqs = &uipc_usrreqs_dgram 441 }, 442 { 443 .pr_type = SOCK_SEQPACKET, 444 .pr_domain = &localdomain, 445 446 /* 447 * XXXRW: For now, PR_ADDR because soreceive will bump into them 448 * due to our use of sbappendaddr. A new sbappend variants is needed 449 * that supports both atomic record writes and control data. 450 */ 451 .pr_flags = PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED| 452 PR_WANTRCVD|PR_RIGHTS|PR_CAPATTACH, 453 .pr_ctloutput = &uipc_ctloutput, 454 .pr_usrreqs = &uipc_usrreqs_seqpacket, 455 }, 456 }; 457 458 static struct domain localdomain = { 459 .dom_family = AF_LOCAL, 460 .dom_name = "local", 461 .dom_externalize = unp_externalize, 462 .dom_dispose = unp_dispose, 463 .dom_protosw = localsw, 464 .dom_protoswNPROTOSW = &localsw[nitems(localsw)] 465 }; 466 DOMAIN_SET(local); 467 468 static void 469 uipc_abort(struct socket *so) 470 { 471 struct unpcb *unp, *unp2; 472 473 unp = sotounpcb(so); 474 KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); 475 UNP_PCB_UNLOCK_ASSERT(unp); 476 477 UNP_PCB_LOCK(unp); 478 unp2 = unp->unp_conn; 479 if (unp2 != NULL) { 480 unp_pcb_hold(unp2); 481 UNP_PCB_UNLOCK(unp); 482 unp_drop(unp2); 483 } else 484 UNP_PCB_UNLOCK(unp); 485 } 486 487 static int 488 uipc_accept(struct socket *so, struct sockaddr **nam) 489 { 490 struct unpcb *unp, *unp2; 491 const struct sockaddr *sa; 492 493 /* 494 * Pass back name of connected socket, if it was bound and 
we are 495 * still connected (our peer may have closed already!). 496 */ 497 unp = sotounpcb(so); 498 KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); 499 500 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 501 UNP_PCB_LOCK(unp); 502 unp2 = unp_pcb_lock_peer(unp); 503 if (unp2 != NULL && unp2->unp_addr != NULL) 504 sa = (struct sockaddr *)unp2->unp_addr; 505 else 506 sa = &sun_noname; 507 bcopy(sa, *nam, sa->sa_len); 508 if (unp2 != NULL) 509 unp_pcb_unlock_pair(unp, unp2); 510 else 511 UNP_PCB_UNLOCK(unp); 512 return (0); 513 } 514 515 static int 516 uipc_attach(struct socket *so, int proto, struct thread *td) 517 { 518 u_long sendspace, recvspace; 519 struct unpcb *unp; 520 int error; 521 bool locked; 522 523 KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); 524 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 525 switch (so->so_type) { 526 case SOCK_STREAM: 527 sendspace = unpst_sendspace; 528 recvspace = unpst_recvspace; 529 break; 530 531 case SOCK_DGRAM: 532 sendspace = unpdg_sendspace; 533 recvspace = unpdg_recvspace; 534 break; 535 536 case SOCK_SEQPACKET: 537 sendspace = unpsp_sendspace; 538 recvspace = unpsp_recvspace; 539 break; 540 541 default: 542 panic("uipc_attach"); 543 } 544 error = soreserve(so, sendspace, recvspace); 545 if (error) 546 return (error); 547 } 548 unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO); 549 if (unp == NULL) 550 return (ENOBUFS); 551 LIST_INIT(&unp->unp_refs); 552 UNP_PCB_LOCK_INIT(unp); 553 unp->unp_socket = so; 554 so->so_pcb = unp; 555 refcount_init(&unp->unp_refcount, 1); 556 557 if ((locked = UNP_LINK_WOWNED()) == false) 558 UNP_LINK_WLOCK(); 559 560 unp->unp_gencnt = ++unp_gencnt; 561 unp->unp_ino = ++unp_ino; 562 unp_count++; 563 switch (so->so_type) { 564 case SOCK_STREAM: 565 LIST_INSERT_HEAD(&unp_shead, unp, unp_link); 566 break; 567 568 case SOCK_DGRAM: 569 LIST_INSERT_HEAD(&unp_dhead, unp, unp_link); 570 break; 571 572 case SOCK_SEQPACKET: 573 LIST_INSERT_HEAD(&unp_sphead, 
unp, unp_link); 574 break; 575 576 default: 577 panic("uipc_attach"); 578 } 579 580 if (locked == false) 581 UNP_LINK_WUNLOCK(); 582 583 return (0); 584 } 585 586 static int 587 uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) 588 { 589 struct sockaddr_un *soun = (struct sockaddr_un *)nam; 590 struct vattr vattr; 591 int error, namelen; 592 struct nameidata nd; 593 struct unpcb *unp; 594 struct vnode *vp; 595 struct mount *mp; 596 cap_rights_t rights; 597 char *buf; 598 599 if (nam->sa_family != AF_UNIX) 600 return (EAFNOSUPPORT); 601 602 unp = sotounpcb(so); 603 KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); 604 605 if (soun->sun_len > sizeof(struct sockaddr_un)) 606 return (EINVAL); 607 namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); 608 if (namelen <= 0) 609 return (EINVAL); 610 611 /* 612 * We don't allow simultaneous bind() calls on a single UNIX domain 613 * socket, so flag in-progress operations, and return an error if an 614 * operation is already in progress. 615 * 616 * Historically, we have not allowed a socket to be rebound, so this 617 * also returns an error. Not allowing re-binding simplifies the 618 * implementation and avoids a great many possible failure modes. 
619 */ 620 UNP_PCB_LOCK(unp); 621 if (unp->unp_vnode != NULL) { 622 UNP_PCB_UNLOCK(unp); 623 return (EINVAL); 624 } 625 if (unp->unp_flags & UNP_BINDING) { 626 UNP_PCB_UNLOCK(unp); 627 return (EALREADY); 628 } 629 unp->unp_flags |= UNP_BINDING; 630 UNP_PCB_UNLOCK(unp); 631 632 buf = malloc(namelen + 1, M_TEMP, M_WAITOK); 633 bcopy(soun->sun_path, buf, namelen); 634 buf[namelen] = 0; 635 636 restart: 637 NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE, 638 UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT)); 639 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ 640 error = namei(&nd); 641 if (error) 642 goto error; 643 vp = nd.ni_vp; 644 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { 645 NDFREE_PNBUF(&nd); 646 if (nd.ni_dvp == vp) 647 vrele(nd.ni_dvp); 648 else 649 vput(nd.ni_dvp); 650 if (vp != NULL) { 651 vrele(vp); 652 error = EADDRINUSE; 653 goto error; 654 } 655 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); 656 if (error) 657 goto error; 658 goto restart; 659 } 660 VATTR_NULL(&vattr); 661 vattr.va_type = VSOCK; 662 vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_pd->pd_cmask); 663 #ifdef MAC 664 error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, 665 &vattr); 666 #endif 667 if (error == 0) 668 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); 669 NDFREE_PNBUF(&nd); 670 if (error) { 671 VOP_VPUT_PAIR(nd.ni_dvp, NULL, true); 672 vn_finished_write(mp); 673 if (error == ERELOOKUP) 674 goto restart; 675 goto error; 676 } 677 vp = nd.ni_vp; 678 ASSERT_VOP_ELOCKED(vp, "uipc_bind"); 679 soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); 680 681 UNP_PCB_LOCK(unp); 682 VOP_UNP_BIND(vp, unp); 683 unp->unp_vnode = vp; 684 unp->unp_addr = soun; 685 unp->unp_flags &= ~UNP_BINDING; 686 UNP_PCB_UNLOCK(unp); 687 vref(vp); 688 VOP_VPUT_PAIR(nd.ni_dvp, &vp, true); 689 vn_finished_write(mp); 690 free(buf, M_TEMP); 691 return (0); 692 693 error: 694 UNP_PCB_LOCK(unp); 
695 unp->unp_flags &= ~UNP_BINDING; 696 UNP_PCB_UNLOCK(unp); 697 free(buf, M_TEMP); 698 return (error); 699 } 700 701 static int 702 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 703 { 704 705 return (uipc_bindat(AT_FDCWD, so, nam, td)); 706 } 707 708 static int 709 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 710 { 711 int error; 712 713 KASSERT(td == curthread, ("uipc_connect: td != curthread")); 714 error = unp_connect(so, nam, td); 715 return (error); 716 } 717 718 static int 719 uipc_connectat(int fd, struct socket *so, struct sockaddr *nam, 720 struct thread *td) 721 { 722 int error; 723 724 KASSERT(td == curthread, ("uipc_connectat: td != curthread")); 725 error = unp_connectat(fd, so, nam, td); 726 return (error); 727 } 728 729 static void 730 uipc_close(struct socket *so) 731 { 732 struct unpcb *unp, *unp2; 733 struct vnode *vp = NULL; 734 struct mtx *vplock; 735 736 unp = sotounpcb(so); 737 KASSERT(unp != NULL, ("uipc_close: unp == NULL")); 738 739 vplock = NULL; 740 if ((vp = unp->unp_vnode) != NULL) { 741 vplock = mtx_pool_find(mtxpool_sleep, vp); 742 mtx_lock(vplock); 743 } 744 UNP_PCB_LOCK(unp); 745 if (vp && unp->unp_vnode == NULL) { 746 mtx_unlock(vplock); 747 vp = NULL; 748 } 749 if (vp != NULL) { 750 VOP_UNP_DETACH(vp); 751 unp->unp_vnode = NULL; 752 } 753 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) 754 unp_disconnect(unp, unp2); 755 else 756 UNP_PCB_UNLOCK(unp); 757 if (vp) { 758 mtx_unlock(vplock); 759 vrele(vp); 760 } 761 } 762 763 static int 764 uipc_connect2(struct socket *so1, struct socket *so2) 765 { 766 struct unpcb *unp, *unp2; 767 768 if (so1->so_type != so2->so_type) 769 return (EPROTOTYPE); 770 771 unp = so1->so_pcb; 772 KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); 773 unp2 = so2->so_pcb; 774 KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL")); 775 unp_pcb_lock_pair(unp, unp2); 776 unp_connect2(so1, so2, PRU_CONNECT2); 777 unp_pcb_unlock_pair(unp, unp2); 778 779 return 
(0); 780 } 781 782 static void 783 uipc_detach(struct socket *so) 784 { 785 struct unpcb *unp, *unp2; 786 struct mtx *vplock; 787 struct vnode *vp; 788 int local_unp_rights; 789 790 unp = sotounpcb(so); 791 KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); 792 793 vp = NULL; 794 vplock = NULL; 795 796 UNP_LINK_WLOCK(); 797 LIST_REMOVE(unp, unp_link); 798 if (unp->unp_gcflag & UNPGC_DEAD) 799 LIST_REMOVE(unp, unp_dead); 800 unp->unp_gencnt = ++unp_gencnt; 801 --unp_count; 802 UNP_LINK_WUNLOCK(); 803 804 UNP_PCB_UNLOCK_ASSERT(unp); 805 restart: 806 if ((vp = unp->unp_vnode) != NULL) { 807 vplock = mtx_pool_find(mtxpool_sleep, vp); 808 mtx_lock(vplock); 809 } 810 UNP_PCB_LOCK(unp); 811 if (unp->unp_vnode != vp && unp->unp_vnode != NULL) { 812 if (vplock) 813 mtx_unlock(vplock); 814 UNP_PCB_UNLOCK(unp); 815 goto restart; 816 } 817 if ((vp = unp->unp_vnode) != NULL) { 818 VOP_UNP_DETACH(vp); 819 unp->unp_vnode = NULL; 820 } 821 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) 822 unp_disconnect(unp, unp2); 823 else 824 UNP_PCB_UNLOCK(unp); 825 826 UNP_REF_LIST_LOCK(); 827 while (!LIST_EMPTY(&unp->unp_refs)) { 828 struct unpcb *ref = LIST_FIRST(&unp->unp_refs); 829 830 unp_pcb_hold(ref); 831 UNP_REF_LIST_UNLOCK(); 832 833 MPASS(ref != unp); 834 UNP_PCB_UNLOCK_ASSERT(ref); 835 unp_drop(ref); 836 UNP_REF_LIST_LOCK(); 837 } 838 UNP_REF_LIST_UNLOCK(); 839 840 UNP_PCB_LOCK(unp); 841 local_unp_rights = unp_rights; 842 unp->unp_socket->so_pcb = NULL; 843 unp->unp_socket = NULL; 844 free(unp->unp_addr, M_SONAME); 845 unp->unp_addr = NULL; 846 if (!unp_pcb_rele(unp)) 847 UNP_PCB_UNLOCK(unp); 848 if (vp) { 849 mtx_unlock(vplock); 850 vrele(vp); 851 } 852 if (local_unp_rights) 853 taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1); 854 } 855 856 static int 857 uipc_disconnect(struct socket *so) 858 { 859 struct unpcb *unp, *unp2; 860 861 unp = sotounpcb(so); 862 KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); 863 864 UNP_PCB_LOCK(unp); 865 if ((unp2 = 
unp_pcb_lock_peer(unp)) != NULL) 866 unp_disconnect(unp, unp2); 867 else 868 UNP_PCB_UNLOCK(unp); 869 return (0); 870 } 871 872 static int 873 uipc_listen(struct socket *so, int backlog, struct thread *td) 874 { 875 struct unpcb *unp; 876 int error; 877 878 MPASS(so->so_type != SOCK_DGRAM); 879 880 /* 881 * Synchronize with concurrent connection attempts. 882 */ 883 error = 0; 884 unp = sotounpcb(so); 885 UNP_PCB_LOCK(unp); 886 if (unp->unp_conn != NULL || (unp->unp_flags & UNP_CONNECTING) != 0) 887 error = EINVAL; 888 else if (unp->unp_vnode == NULL) 889 error = EDESTADDRREQ; 890 if (error != 0) { 891 UNP_PCB_UNLOCK(unp); 892 return (error); 893 } 894 895 SOCK_LOCK(so); 896 error = solisten_proto_check(so); 897 if (error == 0) { 898 cru2xt(td, &unp->unp_peercred); 899 solisten_proto(so, backlog); 900 } 901 SOCK_UNLOCK(so); 902 UNP_PCB_UNLOCK(unp); 903 return (error); 904 } 905 906 static int 907 uipc_peeraddr(struct socket *so, struct sockaddr **nam) 908 { 909 struct unpcb *unp, *unp2; 910 const struct sockaddr *sa; 911 912 unp = sotounpcb(so); 913 KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); 914 915 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 916 UNP_LINK_RLOCK(); 917 /* 918 * XXX: It seems that this test always fails even when connection is 919 * established. So, this else clause is added as workaround to 920 * return PF_LOCAL sockaddr. 
921 */ 922 unp2 = unp->unp_conn; 923 if (unp2 != NULL) { 924 UNP_PCB_LOCK(unp2); 925 if (unp2->unp_addr != NULL) 926 sa = (struct sockaddr *) unp2->unp_addr; 927 else 928 sa = &sun_noname; 929 bcopy(sa, *nam, sa->sa_len); 930 UNP_PCB_UNLOCK(unp2); 931 } else { 932 sa = &sun_noname; 933 bcopy(sa, *nam, sa->sa_len); 934 } 935 UNP_LINK_RUNLOCK(); 936 return (0); 937 } 938 939 static int 940 uipc_rcvd(struct socket *so, int flags) 941 { 942 struct unpcb *unp, *unp2; 943 struct socket *so2; 944 u_int mbcnt, sbcc; 945 946 unp = sotounpcb(so); 947 KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); 948 KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET, 949 ("%s: socktype %d", __func__, so->so_type)); 950 951 /* 952 * Adjust backpressure on sender and wakeup any waiting to write. 953 * 954 * The unp lock is acquired to maintain the validity of the unp_conn 955 * pointer; no lock on unp2 is required as unp2->unp_socket will be 956 * static as long as we don't permit unp2 to disconnect from unp, 957 * which is prevented by the lock on unp. We cache values from 958 * so_rcv to avoid holding the so_rcv lock over the entire 959 * transaction on the remote so_snd. 960 */ 961 SOCKBUF_LOCK(&so->so_rcv); 962 mbcnt = so->so_rcv.sb_mbcnt; 963 sbcc = sbavail(&so->so_rcv); 964 SOCKBUF_UNLOCK(&so->so_rcv); 965 /* 966 * There is a benign race condition at this point. If we're planning to 967 * clear SB_STOP, but uipc_send is called on the connected socket at 968 * this instant, it might add data to the sockbuf and set SB_STOP. Then 969 * we would erroneously clear SB_STOP below, even though the sockbuf is 970 * full. The race is benign because the only ill effect is to allow the 971 * sockbuf to exceed its size limit, and the size limits are not 972 * strictly guaranteed anyway. 
973 */ 974 UNP_PCB_LOCK(unp); 975 unp2 = unp->unp_conn; 976 if (unp2 == NULL) { 977 UNP_PCB_UNLOCK(unp); 978 return (0); 979 } 980 so2 = unp2->unp_socket; 981 SOCKBUF_LOCK(&so2->so_snd); 982 if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax) 983 so2->so_snd.sb_flags &= ~SB_STOP; 984 sowwakeup_locked(so2); 985 UNP_PCB_UNLOCK(unp); 986 return (0); 987 } 988 989 static int 990 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 991 struct mbuf *control, struct thread *td) 992 { 993 struct unpcb *unp, *unp2; 994 struct socket *so2; 995 u_int mbcnt, sbcc; 996 int error; 997 998 unp = sotounpcb(so); 999 KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); 1000 KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM || 1001 so->so_type == SOCK_SEQPACKET, 1002 ("%s: socktype %d", __func__, so->so_type)); 1003 1004 error = 0; 1005 if (flags & PRUS_OOB) { 1006 error = EOPNOTSUPP; 1007 goto release; 1008 } 1009 if (control != NULL && (error = unp_internalize(&control, td))) 1010 goto release; 1011 1012 unp2 = NULL; 1013 switch (so->so_type) { 1014 case SOCK_DGRAM: 1015 { 1016 const struct sockaddr *from; 1017 1018 if (nam != NULL) { 1019 error = unp_connect(so, nam, td); 1020 if (error != 0) 1021 break; 1022 } 1023 UNP_PCB_LOCK(unp); 1024 1025 /* 1026 * Because connect() and send() are non-atomic in a sendto() 1027 * with a target address, it's possible that the socket will 1028 * have disconnected before the send() can run. In that case 1029 * return the slightly counter-intuitive but otherwise 1030 * correct error that the socket is not connected. 
1031 */ 1032 unp2 = unp_pcb_lock_peer(unp); 1033 if (unp2 == NULL) { 1034 UNP_PCB_UNLOCK(unp); 1035 error = ENOTCONN; 1036 break; 1037 } 1038 1039 if (unp2->unp_flags & UNP_WANTCRED_MASK) 1040 control = unp_addsockcred(td, control, 1041 unp2->unp_flags); 1042 if (unp->unp_addr != NULL) 1043 from = (struct sockaddr *)unp->unp_addr; 1044 else 1045 from = &sun_noname; 1046 so2 = unp2->unp_socket; 1047 SOCKBUF_LOCK(&so2->so_rcv); 1048 if (sbappendaddr_locked(&so2->so_rcv, from, m, 1049 control)) { 1050 sorwakeup_locked(so2); 1051 m = NULL; 1052 control = NULL; 1053 } else { 1054 soroverflow_locked(so2); 1055 error = (so->so_state & SS_NBIO) ? EAGAIN : ENOBUFS; 1056 } 1057 if (nam != NULL) 1058 unp_disconnect(unp, unp2); 1059 else 1060 unp_pcb_unlock_pair(unp, unp2); 1061 break; 1062 } 1063 1064 case SOCK_SEQPACKET: 1065 case SOCK_STREAM: 1066 if ((so->so_state & SS_ISCONNECTED) == 0) { 1067 if (nam != NULL) { 1068 error = unp_connect(so, nam, td); 1069 if (error != 0) 1070 break; 1071 } else { 1072 error = ENOTCONN; 1073 break; 1074 } 1075 } 1076 1077 UNP_PCB_LOCK(unp); 1078 if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) { 1079 UNP_PCB_UNLOCK(unp); 1080 error = ENOTCONN; 1081 break; 1082 } else if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 1083 unp_pcb_unlock_pair(unp, unp2); 1084 error = EPIPE; 1085 break; 1086 } 1087 UNP_PCB_UNLOCK(unp); 1088 if ((so2 = unp2->unp_socket) == NULL) { 1089 UNP_PCB_UNLOCK(unp2); 1090 error = ENOTCONN; 1091 break; 1092 } 1093 SOCKBUF_LOCK(&so2->so_rcv); 1094 if (unp2->unp_flags & UNP_WANTCRED_MASK) { 1095 /* 1096 * Credentials are passed only once on SOCK_STREAM and 1097 * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or 1098 * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS). 1099 */ 1100 control = unp_addsockcred(td, control, unp2->unp_flags); 1101 unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT; 1102 } 1103 1104 /* 1105 * Send to paired receive port and wake up readers. 
Don't 1106 * check for space available in the receive buffer if we're 1107 * attaching ancillary data; Unix domain sockets only check 1108 * for space in the sending sockbuf, and that check is 1109 * performed one level up the stack. At that level we cannot 1110 * precisely account for the amount of buffer space used 1111 * (e.g., because control messages are not yet internalized). 1112 */ 1113 switch (so->so_type) { 1114 case SOCK_STREAM: 1115 if (control != NULL) { 1116 sbappendcontrol_locked(&so2->so_rcv, m, 1117 control, flags); 1118 control = NULL; 1119 } else 1120 sbappend_locked(&so2->so_rcv, m, flags); 1121 break; 1122 1123 case SOCK_SEQPACKET: 1124 if (sbappendaddr_nospacecheck_locked(&so2->so_rcv, 1125 &sun_noname, m, control)) 1126 control = NULL; 1127 break; 1128 } 1129 1130 mbcnt = so2->so_rcv.sb_mbcnt; 1131 sbcc = sbavail(&so2->so_rcv); 1132 if (sbcc) 1133 sorwakeup_locked(so2); 1134 else 1135 SOCKBUF_UNLOCK(&so2->so_rcv); 1136 1137 /* 1138 * The PCB lock on unp2 protects the SB_STOP flag. Without it, 1139 * it would be possible for uipc_rcvd to be called at this 1140 * point, drain the receiving sockbuf, clear SB_STOP, and then 1141 * we would set SB_STOP below. That could lead to an empty 1142 * sockbuf having SB_STOP set 1143 */ 1144 SOCKBUF_LOCK(&so->so_snd); 1145 if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax) 1146 so->so_snd.sb_flags |= SB_STOP; 1147 SOCKBUF_UNLOCK(&so->so_snd); 1148 UNP_PCB_UNLOCK(unp2); 1149 m = NULL; 1150 break; 1151 } 1152 1153 /* 1154 * PRUS_EOF is equivalent to pru_send followed by pru_shutdown. 1155 */ 1156 if (flags & PRUS_EOF) { 1157 UNP_PCB_LOCK(unp); 1158 socantsendmore(so); 1159 unp_shutdown(unp); 1160 UNP_PCB_UNLOCK(unp); 1161 } 1162 if (control != NULL && error != 0) 1163 unp_dispose_mbuf(control); 1164 1165 release: 1166 if (control != NULL) 1167 m_freem(control); 1168 /* 1169 * In case of PRUS_NOTREADY, uipc_ready() is responsible 1170 * for freeing memory. 
1171 */ 1172 if (m != NULL && (flags & PRUS_NOTREADY) == 0) 1173 m_freem(m); 1174 return (error); 1175 } 1176 1177 static bool 1178 uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp) 1179 { 1180 struct mbuf *mb, *n; 1181 struct sockbuf *sb; 1182 1183 SOCK_LOCK(so); 1184 if (SOLISTENING(so)) { 1185 SOCK_UNLOCK(so); 1186 return (false); 1187 } 1188 mb = NULL; 1189 sb = &so->so_rcv; 1190 SOCKBUF_LOCK(sb); 1191 if (sb->sb_fnrdy != NULL) { 1192 for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) { 1193 if (mb == m) { 1194 *errorp = sbready(sb, m, count); 1195 break; 1196 } 1197 mb = mb->m_next; 1198 if (mb == NULL) { 1199 mb = n; 1200 if (mb != NULL) 1201 n = mb->m_nextpkt; 1202 } 1203 } 1204 } 1205 SOCKBUF_UNLOCK(sb); 1206 SOCK_UNLOCK(so); 1207 return (mb != NULL); 1208 } 1209 1210 static int 1211 uipc_ready(struct socket *so, struct mbuf *m, int count) 1212 { 1213 struct unpcb *unp, *unp2; 1214 struct socket *so2; 1215 int error, i; 1216 1217 unp = sotounpcb(so); 1218 1219 KASSERT(so->so_type == SOCK_STREAM, 1220 ("%s: unexpected socket type for %p", __func__, so)); 1221 1222 UNP_PCB_LOCK(unp); 1223 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) { 1224 UNP_PCB_UNLOCK(unp); 1225 so2 = unp2->unp_socket; 1226 SOCKBUF_LOCK(&so2->so_rcv); 1227 if ((error = sbready(&so2->so_rcv, m, count)) == 0) 1228 sorwakeup_locked(so2); 1229 else 1230 SOCKBUF_UNLOCK(&so2->so_rcv); 1231 UNP_PCB_UNLOCK(unp2); 1232 return (error); 1233 } 1234 UNP_PCB_UNLOCK(unp); 1235 1236 /* 1237 * The receiving socket has been disconnected, but may still be valid. 1238 * In this case, the now-ready mbufs are still present in its socket 1239 * buffer, so perform an exhaustive search before giving up and freeing 1240 * the mbufs. 
1241 */ 1242 UNP_LINK_RLOCK(); 1243 LIST_FOREACH(unp, &unp_shead, unp_link) { 1244 if (uipc_ready_scan(unp->unp_socket, m, count, &error)) 1245 break; 1246 } 1247 UNP_LINK_RUNLOCK(); 1248 1249 if (unp == NULL) { 1250 for (i = 0; i < count; i++) 1251 m = m_free(m); 1252 error = ECONNRESET; 1253 } 1254 return (error); 1255 } 1256 1257 static int 1258 uipc_sense(struct socket *so, struct stat *sb) 1259 { 1260 struct unpcb *unp; 1261 1262 unp = sotounpcb(so); 1263 KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); 1264 1265 sb->st_blksize = so->so_snd.sb_hiwat; 1266 sb->st_dev = NODEV; 1267 sb->st_ino = unp->unp_ino; 1268 return (0); 1269 } 1270 1271 static int 1272 uipc_shutdown(struct socket *so) 1273 { 1274 struct unpcb *unp; 1275 1276 unp = sotounpcb(so); 1277 KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); 1278 1279 UNP_PCB_LOCK(unp); 1280 socantsendmore(so); 1281 unp_shutdown(unp); 1282 UNP_PCB_UNLOCK(unp); 1283 return (0); 1284 } 1285 1286 static int 1287 uipc_sockaddr(struct socket *so, struct sockaddr **nam) 1288 { 1289 struct unpcb *unp; 1290 const struct sockaddr *sa; 1291 1292 unp = sotounpcb(so); 1293 KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); 1294 1295 *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 1296 UNP_PCB_LOCK(unp); 1297 if (unp->unp_addr != NULL) 1298 sa = (struct sockaddr *) unp->unp_addr; 1299 else 1300 sa = &sun_noname; 1301 bcopy(sa, *nam, sa->sa_len); 1302 UNP_PCB_UNLOCK(unp); 1303 return (0); 1304 } 1305 1306 static struct pr_usrreqs uipc_usrreqs_dgram = { 1307 .pru_abort = uipc_abort, 1308 .pru_accept = uipc_accept, 1309 .pru_attach = uipc_attach, 1310 .pru_bind = uipc_bind, 1311 .pru_bindat = uipc_bindat, 1312 .pru_connect = uipc_connect, 1313 .pru_connectat = uipc_connectat, 1314 .pru_connect2 = uipc_connect2, 1315 .pru_detach = uipc_detach, 1316 .pru_disconnect = uipc_disconnect, 1317 .pru_peeraddr = uipc_peeraddr, 1318 .pru_send = uipc_send, 1319 .pru_sense = uipc_sense, 1320 .pru_shutdown = 
uipc_shutdown, 1321 .pru_sockaddr = uipc_sockaddr, 1322 .pru_soreceive = soreceive_dgram, 1323 .pru_close = uipc_close, 1324 }; 1325 1326 static struct pr_usrreqs uipc_usrreqs_seqpacket = { 1327 .pru_abort = uipc_abort, 1328 .pru_accept = uipc_accept, 1329 .pru_attach = uipc_attach, 1330 .pru_bind = uipc_bind, 1331 .pru_bindat = uipc_bindat, 1332 .pru_connect = uipc_connect, 1333 .pru_connectat = uipc_connectat, 1334 .pru_connect2 = uipc_connect2, 1335 .pru_detach = uipc_detach, 1336 .pru_disconnect = uipc_disconnect, 1337 .pru_listen = uipc_listen, 1338 .pru_peeraddr = uipc_peeraddr, 1339 .pru_rcvd = uipc_rcvd, 1340 .pru_send = uipc_send, 1341 .pru_sense = uipc_sense, 1342 .pru_shutdown = uipc_shutdown, 1343 .pru_sockaddr = uipc_sockaddr, 1344 .pru_soreceive = soreceive_generic, /* XXX: or...? */ 1345 .pru_close = uipc_close, 1346 }; 1347 1348 static struct pr_usrreqs uipc_usrreqs_stream = { 1349 .pru_abort = uipc_abort, 1350 .pru_accept = uipc_accept, 1351 .pru_attach = uipc_attach, 1352 .pru_bind = uipc_bind, 1353 .pru_bindat = uipc_bindat, 1354 .pru_connect = uipc_connect, 1355 .pru_connectat = uipc_connectat, 1356 .pru_connect2 = uipc_connect2, 1357 .pru_detach = uipc_detach, 1358 .pru_disconnect = uipc_disconnect, 1359 .pru_listen = uipc_listen, 1360 .pru_peeraddr = uipc_peeraddr, 1361 .pru_rcvd = uipc_rcvd, 1362 .pru_send = uipc_send, 1363 .pru_ready = uipc_ready, 1364 .pru_sense = uipc_sense, 1365 .pru_shutdown = uipc_shutdown, 1366 .pru_sockaddr = uipc_sockaddr, 1367 .pru_soreceive = soreceive_generic, 1368 .pru_close = uipc_close, 1369 }; 1370 1371 static int 1372 uipc_ctloutput(struct socket *so, struct sockopt *sopt) 1373 { 1374 struct unpcb *unp; 1375 struct xucred xu; 1376 int error, optval; 1377 1378 if (sopt->sopt_level != SOL_LOCAL) 1379 return (EINVAL); 1380 1381 unp = sotounpcb(so); 1382 KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); 1383 error = 0; 1384 switch (sopt->sopt_dir) { 1385 case SOPT_GET: 1386 switch (sopt->sopt_name) { 1387 
case LOCAL_PEERCRED: 1388 UNP_PCB_LOCK(unp); 1389 if (unp->unp_flags & UNP_HAVEPC) 1390 xu = unp->unp_peercred; 1391 else { 1392 if (so->so_type == SOCK_STREAM) 1393 error = ENOTCONN; 1394 else 1395 error = EINVAL; 1396 } 1397 UNP_PCB_UNLOCK(unp); 1398 if (error == 0) 1399 error = sooptcopyout(sopt, &xu, sizeof(xu)); 1400 break; 1401 1402 case LOCAL_CREDS: 1403 /* Unlocked read. */ 1404 optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0; 1405 error = sooptcopyout(sopt, &optval, sizeof(optval)); 1406 break; 1407 1408 case LOCAL_CREDS_PERSISTENT: 1409 /* Unlocked read. */ 1410 optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0; 1411 error = sooptcopyout(sopt, &optval, sizeof(optval)); 1412 break; 1413 1414 case LOCAL_CONNWAIT: 1415 /* Unlocked read. */ 1416 optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; 1417 error = sooptcopyout(sopt, &optval, sizeof(optval)); 1418 break; 1419 1420 default: 1421 error = EOPNOTSUPP; 1422 break; 1423 } 1424 break; 1425 1426 case SOPT_SET: 1427 switch (sopt->sopt_name) { 1428 case LOCAL_CREDS: 1429 case LOCAL_CREDS_PERSISTENT: 1430 case LOCAL_CONNWAIT: 1431 error = sooptcopyin(sopt, &optval, sizeof(optval), 1432 sizeof(optval)); 1433 if (error) 1434 break; 1435 1436 #define OPTSET(bit, exclusive) do { \ 1437 UNP_PCB_LOCK(unp); \ 1438 if (optval) { \ 1439 if ((unp->unp_flags & (exclusive)) != 0) { \ 1440 UNP_PCB_UNLOCK(unp); \ 1441 error = EINVAL; \ 1442 break; \ 1443 } \ 1444 unp->unp_flags |= (bit); \ 1445 } else \ 1446 unp->unp_flags &= ~(bit); \ 1447 UNP_PCB_UNLOCK(unp); \ 1448 } while (0) 1449 1450 switch (sopt->sopt_name) { 1451 case LOCAL_CREDS: 1452 OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS); 1453 break; 1454 1455 case LOCAL_CREDS_PERSISTENT: 1456 OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT); 1457 break; 1458 1459 case LOCAL_CONNWAIT: 1460 OPTSET(UNP_CONNWAIT, 0); 1461 break; 1462 1463 default: 1464 break; 1465 } 1466 break; 1467 #undef OPTSET 1468 default: 1469 error = ENOPROTOOPT; 1470 break; 1471 } 1472 
break; 1473 1474 default: 1475 error = EOPNOTSUPP; 1476 break; 1477 } 1478 return (error); 1479 } 1480 1481 static int 1482 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 1483 { 1484 1485 return (unp_connectat(AT_FDCWD, so, nam, td)); 1486 } 1487 1488 static int 1489 unp_connectat(int fd, struct socket *so, struct sockaddr *nam, 1490 struct thread *td) 1491 { 1492 struct mtx *vplock; 1493 struct sockaddr_un *soun; 1494 struct vnode *vp; 1495 struct socket *so2; 1496 struct unpcb *unp, *unp2, *unp3; 1497 struct nameidata nd; 1498 char buf[SOCK_MAXADDRLEN]; 1499 struct sockaddr *sa; 1500 cap_rights_t rights; 1501 int error, len; 1502 bool connreq; 1503 1504 if (nam->sa_family != AF_UNIX) 1505 return (EAFNOSUPPORT); 1506 if (nam->sa_len > sizeof(struct sockaddr_un)) 1507 return (EINVAL); 1508 len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); 1509 if (len <= 0) 1510 return (EINVAL); 1511 soun = (struct sockaddr_un *)nam; 1512 bcopy(soun->sun_path, buf, len); 1513 buf[len] = 0; 1514 1515 error = 0; 1516 unp = sotounpcb(so); 1517 UNP_PCB_LOCK(unp); 1518 for (;;) { 1519 /* 1520 * Wait for connection state to stabilize. If a connection 1521 * already exists, give up. For datagram sockets, which permit 1522 * multiple consecutive connect(2) calls, upper layers are 1523 * responsible for disconnecting in advance of a subsequent 1524 * connect(2), but this is not synchronized with PCB connection 1525 * state. 1526 * 1527 * Also make sure that no threads are currently attempting to 1528 * lock the peer socket, to ensure that unp_conn cannot 1529 * transition between two valid sockets while locks are dropped. 
1530 */ 1531 if (SOLISTENING(so)) 1532 error = EOPNOTSUPP; 1533 else if (unp->unp_conn != NULL) 1534 error = EISCONN; 1535 else if ((unp->unp_flags & UNP_CONNECTING) != 0) { 1536 error = EALREADY; 1537 } 1538 if (error != 0) { 1539 UNP_PCB_UNLOCK(unp); 1540 return (error); 1541 } 1542 if (unp->unp_pairbusy > 0) { 1543 unp->unp_flags |= UNP_WAITING; 1544 mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0); 1545 continue; 1546 } 1547 break; 1548 } 1549 unp->unp_flags |= UNP_CONNECTING; 1550 UNP_PCB_UNLOCK(unp); 1551 1552 connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0; 1553 if (connreq) 1554 sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); 1555 else 1556 sa = NULL; 1557 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, 1558 UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT)); 1559 error = namei(&nd); 1560 if (error) 1561 vp = NULL; 1562 else 1563 vp = nd.ni_vp; 1564 ASSERT_VOP_LOCKED(vp, "unp_connect"); 1565 NDFREE_NOTHING(&nd); 1566 if (error) 1567 goto bad; 1568 1569 if (vp->v_type != VSOCK) { 1570 error = ENOTSOCK; 1571 goto bad; 1572 } 1573 #ifdef MAC 1574 error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD); 1575 if (error) 1576 goto bad; 1577 #endif 1578 error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); 1579 if (error) 1580 goto bad; 1581 1582 unp = sotounpcb(so); 1583 KASSERT(unp != NULL, ("unp_connect: unp == NULL")); 1584 1585 vplock = mtx_pool_find(mtxpool_sleep, vp); 1586 mtx_lock(vplock); 1587 VOP_UNP_CONNECT(vp, &unp2); 1588 if (unp2 == NULL) { 1589 error = ECONNREFUSED; 1590 goto bad2; 1591 } 1592 so2 = unp2->unp_socket; 1593 if (so->so_type != so2->so_type) { 1594 error = EPROTOTYPE; 1595 goto bad2; 1596 } 1597 if (connreq) { 1598 if (SOLISTENING(so2)) { 1599 CURVNET_SET(so2->so_vnet); 1600 so2 = sonewconn(so2, 0); 1601 CURVNET_RESTORE(); 1602 } else 1603 so2 = NULL; 1604 if (so2 == NULL) { 1605 error = ECONNREFUSED; 1606 goto bad2; 1607 } 1608 unp3 = sotounpcb(so2); 1609 
unp_pcb_lock_pair(unp2, unp3); 1610 if (unp2->unp_addr != NULL) { 1611 bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); 1612 unp3->unp_addr = (struct sockaddr_un *) sa; 1613 sa = NULL; 1614 } 1615 1616 unp_copy_peercred(td, unp3, unp, unp2); 1617 1618 UNP_PCB_UNLOCK(unp2); 1619 unp2 = unp3; 1620 1621 /* 1622 * It is safe to block on the PCB lock here since unp2 is 1623 * nascent and cannot be connected to any other sockets. 1624 */ 1625 UNP_PCB_LOCK(unp); 1626 #ifdef MAC 1627 mac_socketpeer_set_from_socket(so, so2); 1628 mac_socketpeer_set_from_socket(so2, so); 1629 #endif 1630 } else { 1631 unp_pcb_lock_pair(unp, unp2); 1632 } 1633 KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && 1634 sotounpcb(so2) == unp2, 1635 ("%s: unp2 %p so2 %p", __func__, unp2, so2)); 1636 unp_connect2(so, so2, PRU_CONNECT); 1637 KASSERT((unp->unp_flags & UNP_CONNECTING) != 0, 1638 ("%s: unp %p has UNP_CONNECTING clear", __func__, unp)); 1639 unp->unp_flags &= ~UNP_CONNECTING; 1640 unp_pcb_unlock_pair(unp, unp2); 1641 bad2: 1642 mtx_unlock(vplock); 1643 bad: 1644 if (vp != NULL) { 1645 vput(vp); 1646 } 1647 free(sa, M_SONAME); 1648 if (__predict_false(error)) { 1649 UNP_PCB_LOCK(unp); 1650 KASSERT((unp->unp_flags & UNP_CONNECTING) != 0, 1651 ("%s: unp %p has UNP_CONNECTING clear", __func__, unp)); 1652 unp->unp_flags &= ~UNP_CONNECTING; 1653 UNP_PCB_UNLOCK(unp); 1654 } 1655 return (error); 1656 } 1657 1658 /* 1659 * Set socket peer credentials at connection time. 1660 * 1661 * The client's PCB credentials are copied from its process structure. The 1662 * server's PCB credentials are copied from the socket on which it called 1663 * listen(2). uipc_listen cached that process's credentials at the time. 
1664 */ 1665 void 1666 unp_copy_peercred(struct thread *td, struct unpcb *client_unp, 1667 struct unpcb *server_unp, struct unpcb *listen_unp) 1668 { 1669 cru2xt(td, &client_unp->unp_peercred); 1670 client_unp->unp_flags |= UNP_HAVEPC; 1671 1672 memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred, 1673 sizeof(server_unp->unp_peercred)); 1674 server_unp->unp_flags |= UNP_HAVEPC; 1675 client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK); 1676 } 1677 1678 static void 1679 unp_connect2(struct socket *so, struct socket *so2, int req) 1680 { 1681 struct unpcb *unp; 1682 struct unpcb *unp2; 1683 1684 MPASS(so2->so_type == so->so_type); 1685 unp = sotounpcb(so); 1686 KASSERT(unp != NULL, ("unp_connect2: unp == NULL")); 1687 unp2 = sotounpcb(so2); 1688 KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); 1689 1690 UNP_PCB_LOCK_ASSERT(unp); 1691 UNP_PCB_LOCK_ASSERT(unp2); 1692 KASSERT(unp->unp_conn == NULL, 1693 ("%s: socket %p is already connected", __func__, unp)); 1694 1695 unp->unp_conn = unp2; 1696 unp_pcb_hold(unp2); 1697 unp_pcb_hold(unp); 1698 switch (so->so_type) { 1699 case SOCK_DGRAM: 1700 UNP_REF_LIST_LOCK(); 1701 LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); 1702 UNP_REF_LIST_UNLOCK(); 1703 soisconnected(so); 1704 break; 1705 1706 case SOCK_STREAM: 1707 case SOCK_SEQPACKET: 1708 KASSERT(unp2->unp_conn == NULL, 1709 ("%s: socket %p is already connected", __func__, unp2)); 1710 unp2->unp_conn = unp; 1711 if (req == PRU_CONNECT && 1712 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) 1713 soisconnecting(so); 1714 else 1715 soisconnected(so); 1716 soisconnected(so2); 1717 break; 1718 1719 default: 1720 panic("unp_connect2"); 1721 } 1722 } 1723 1724 static void 1725 unp_disconnect(struct unpcb *unp, struct unpcb *unp2) 1726 { 1727 struct socket *so, *so2; 1728 #ifdef INVARIANTS 1729 struct unpcb *unptmp; 1730 #endif 1731 1732 UNP_PCB_LOCK_ASSERT(unp); 1733 UNP_PCB_LOCK_ASSERT(unp2); 1734 KASSERT(unp->unp_conn == unp2, 1735 ("%s: 
unpcb %p is not connected to %p", __func__, unp, unp2)); 1736 1737 unp->unp_conn = NULL; 1738 so = unp->unp_socket; 1739 so2 = unp2->unp_socket; 1740 switch (unp->unp_socket->so_type) { 1741 case SOCK_DGRAM: 1742 UNP_REF_LIST_LOCK(); 1743 #ifdef INVARIANTS 1744 LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) { 1745 if (unptmp == unp) 1746 break; 1747 } 1748 KASSERT(unptmp != NULL, 1749 ("%s: %p not found in reflist of %p", __func__, unp, unp2)); 1750 #endif 1751 LIST_REMOVE(unp, unp_reflink); 1752 UNP_REF_LIST_UNLOCK(); 1753 if (so) { 1754 SOCK_LOCK(so); 1755 so->so_state &= ~SS_ISCONNECTED; 1756 SOCK_UNLOCK(so); 1757 } 1758 break; 1759 1760 case SOCK_STREAM: 1761 case SOCK_SEQPACKET: 1762 if (so) 1763 soisdisconnected(so); 1764 MPASS(unp2->unp_conn == unp); 1765 unp2->unp_conn = NULL; 1766 if (so2) 1767 soisdisconnected(so2); 1768 break; 1769 } 1770 1771 if (unp == unp2) { 1772 unp_pcb_rele_notlast(unp); 1773 if (!unp_pcb_rele(unp)) 1774 UNP_PCB_UNLOCK(unp); 1775 } else { 1776 if (!unp_pcb_rele(unp)) 1777 UNP_PCB_UNLOCK(unp); 1778 if (!unp_pcb_rele(unp2)) 1779 UNP_PCB_UNLOCK(unp2); 1780 } 1781 } 1782 1783 /* 1784 * unp_pcblist() walks the global list of struct unpcb's to generate a 1785 * pointer list, bumping the refcount on each unpcb. It then copies them out 1786 * sequentially, validating the generation number on each to see if it has 1787 * been detached. All of this is necessary because copyout() may sleep on 1788 * disk I/O. 
1789 */ 1790 static int 1791 unp_pcblist(SYSCTL_HANDLER_ARGS) 1792 { 1793 struct unpcb *unp, **unp_list; 1794 unp_gen_t gencnt; 1795 struct xunpgen *xug; 1796 struct unp_head *head; 1797 struct xunpcb *xu; 1798 u_int i; 1799 int error, n; 1800 1801 switch ((intptr_t)arg1) { 1802 case SOCK_STREAM: 1803 head = &unp_shead; 1804 break; 1805 1806 case SOCK_DGRAM: 1807 head = &unp_dhead; 1808 break; 1809 1810 case SOCK_SEQPACKET: 1811 head = &unp_sphead; 1812 break; 1813 1814 default: 1815 panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1); 1816 } 1817 1818 /* 1819 * The process of preparing the PCB list is too time-consuming and 1820 * resource-intensive to repeat twice on every request. 1821 */ 1822 if (req->oldptr == NULL) { 1823 n = unp_count; 1824 req->oldidx = 2 * (sizeof *xug) 1825 + (n + n/8) * sizeof(struct xunpcb); 1826 return (0); 1827 } 1828 1829 if (req->newptr != NULL) 1830 return (EPERM); 1831 1832 /* 1833 * OK, now we're committed to doing something. 1834 */ 1835 xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO); 1836 UNP_LINK_RLOCK(); 1837 gencnt = unp_gencnt; 1838 n = unp_count; 1839 UNP_LINK_RUNLOCK(); 1840 1841 xug->xug_len = sizeof *xug; 1842 xug->xug_count = n; 1843 xug->xug_gen = gencnt; 1844 xug->xug_sogen = so_gencnt; 1845 error = SYSCTL_OUT(req, xug, sizeof *xug); 1846 if (error) { 1847 free(xug, M_TEMP); 1848 return (error); 1849 } 1850 1851 unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); 1852 1853 UNP_LINK_RLOCK(); 1854 for (unp = LIST_FIRST(head), i = 0; unp && i < n; 1855 unp = LIST_NEXT(unp, unp_link)) { 1856 UNP_PCB_LOCK(unp); 1857 if (unp->unp_gencnt <= gencnt) { 1858 if (cr_cansee(req->td->td_ucred, 1859 unp->unp_socket->so_cred)) { 1860 UNP_PCB_UNLOCK(unp); 1861 continue; 1862 } 1863 unp_list[i++] = unp; 1864 unp_pcb_hold(unp); 1865 } 1866 UNP_PCB_UNLOCK(unp); 1867 } 1868 UNP_LINK_RUNLOCK(); 1869 n = i; /* In case we lost some during malloc. 
*/ 1870 1871 error = 0; 1872 xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); 1873 for (i = 0; i < n; i++) { 1874 unp = unp_list[i]; 1875 UNP_PCB_LOCK(unp); 1876 if (unp_pcb_rele(unp)) 1877 continue; 1878 1879 if (unp->unp_gencnt <= gencnt) { 1880 xu->xu_len = sizeof *xu; 1881 xu->xu_unpp = (uintptr_t)unp; 1882 /* 1883 * XXX - need more locking here to protect against 1884 * connect/disconnect races for SMP. 1885 */ 1886 if (unp->unp_addr != NULL) 1887 bcopy(unp->unp_addr, &xu->xu_addr, 1888 unp->unp_addr->sun_len); 1889 else 1890 bzero(&xu->xu_addr, sizeof(xu->xu_addr)); 1891 if (unp->unp_conn != NULL && 1892 unp->unp_conn->unp_addr != NULL) 1893 bcopy(unp->unp_conn->unp_addr, 1894 &xu->xu_caddr, 1895 unp->unp_conn->unp_addr->sun_len); 1896 else 1897 bzero(&xu->xu_caddr, sizeof(xu->xu_caddr)); 1898 xu->unp_vnode = (uintptr_t)unp->unp_vnode; 1899 xu->unp_conn = (uintptr_t)unp->unp_conn; 1900 xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs); 1901 xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink); 1902 xu->unp_gencnt = unp->unp_gencnt; 1903 sotoxsocket(unp->unp_socket, &xu->xu_socket); 1904 UNP_PCB_UNLOCK(unp); 1905 error = SYSCTL_OUT(req, xu, sizeof *xu); 1906 } else { 1907 UNP_PCB_UNLOCK(unp); 1908 } 1909 } 1910 free(xu, M_TEMP); 1911 if (!error) { 1912 /* 1913 * Give the user an updated idea of our state. If the 1914 * generation differs from what we told her before, she knows 1915 * that something happened while we were processing this 1916 * request, and it might be necessary to retry. 
1917 */ 1918 xug->xug_gen = unp_gencnt; 1919 xug->xug_sogen = so_gencnt; 1920 xug->xug_count = unp_count; 1921 error = SYSCTL_OUT(req, xug, sizeof *xug); 1922 } 1923 free(unp_list, M_TEMP); 1924 free(xug, M_TEMP); 1925 return (error); 1926 } 1927 1928 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, 1929 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 1930 (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", 1931 "List of active local datagram sockets"); 1932 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, 1933 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 1934 (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", 1935 "List of active local stream sockets"); 1936 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist, 1937 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 1938 (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb", 1939 "List of active local seqpacket sockets"); 1940 1941 static void 1942 unp_shutdown(struct unpcb *unp) 1943 { 1944 struct unpcb *unp2; 1945 struct socket *so; 1946 1947 UNP_PCB_LOCK_ASSERT(unp); 1948 1949 unp2 = unp->unp_conn; 1950 if ((unp->unp_socket->so_type == SOCK_STREAM || 1951 (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) { 1952 so = unp2->unp_socket; 1953 if (so != NULL) 1954 socantrcvmore(so); 1955 } 1956 } 1957 1958 static void 1959 unp_drop(struct unpcb *unp) 1960 { 1961 struct socket *so; 1962 struct unpcb *unp2; 1963 1964 /* 1965 * Regardless of whether the socket's peer dropped the connection 1966 * with this socket by aborting or disconnecting, POSIX requires 1967 * that ECONNRESET is returned. 1968 */ 1969 1970 UNP_PCB_LOCK(unp); 1971 so = unp->unp_socket; 1972 if (so) 1973 so->so_error = ECONNRESET; 1974 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) { 1975 /* Last reference dropped in unp_disconnect(). 
*/ 1976 unp_pcb_rele_notlast(unp); 1977 unp_disconnect(unp, unp2); 1978 } else if (!unp_pcb_rele(unp)) { 1979 UNP_PCB_UNLOCK(unp); 1980 } 1981 } 1982 1983 static void 1984 unp_freerights(struct filedescent **fdep, int fdcount) 1985 { 1986 struct file *fp; 1987 int i; 1988 1989 KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount)); 1990 1991 for (i = 0; i < fdcount; i++) { 1992 fp = fdep[i]->fde_file; 1993 filecaps_free(&fdep[i]->fde_caps); 1994 unp_discard(fp); 1995 } 1996 free(fdep[0], M_FILECAPS); 1997 } 1998 1999 static int 2000 unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags) 2001 { 2002 struct thread *td = curthread; /* XXX */ 2003 struct cmsghdr *cm = mtod(control, struct cmsghdr *); 2004 int i; 2005 int *fdp; 2006 struct filedesc *fdesc = td->td_proc->p_fd; 2007 struct filedescent **fdep; 2008 void *data; 2009 socklen_t clen = control->m_len, datalen; 2010 int error, newfds; 2011 u_int newlen; 2012 2013 UNP_LINK_UNLOCK_ASSERT(); 2014 2015 error = 0; 2016 if (controlp != NULL) /* controlp == NULL => free control messages */ 2017 *controlp = NULL; 2018 while (cm != NULL) { 2019 if (sizeof(*cm) > clen || cm->cmsg_len > clen) { 2020 error = EINVAL; 2021 break; 2022 } 2023 data = CMSG_DATA(cm); 2024 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 2025 if (cm->cmsg_level == SOL_SOCKET 2026 && cm->cmsg_type == SCM_RIGHTS) { 2027 newfds = datalen / sizeof(*fdep); 2028 if (newfds == 0) 2029 goto next; 2030 fdep = data; 2031 2032 /* If we're not outputting the descriptors free them. */ 2033 if (error || controlp == NULL) { 2034 unp_freerights(fdep, newfds); 2035 goto next; 2036 } 2037 FILEDESC_XLOCK(fdesc); 2038 2039 /* 2040 * Now change each pointer to an fd in the global 2041 * table to an integer that is the index to the local 2042 * fd table entry that we set up to point to the 2043 * global one we are transferring. 
2044 */ 2045 newlen = newfds * sizeof(int); 2046 *controlp = sbcreatecontrol(NULL, newlen, 2047 SCM_RIGHTS, SOL_SOCKET); 2048 if (*controlp == NULL) { 2049 FILEDESC_XUNLOCK(fdesc); 2050 error = E2BIG; 2051 unp_freerights(fdep, newfds); 2052 goto next; 2053 } 2054 2055 fdp = (int *) 2056 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 2057 if (fdallocn(td, 0, fdp, newfds) != 0) { 2058 FILEDESC_XUNLOCK(fdesc); 2059 error = EMSGSIZE; 2060 unp_freerights(fdep, newfds); 2061 m_freem(*controlp); 2062 *controlp = NULL; 2063 goto next; 2064 } 2065 for (i = 0; i < newfds; i++, fdp++) { 2066 _finstall(fdesc, fdep[i]->fde_file, *fdp, 2067 (flags & MSG_CMSG_CLOEXEC) != 0 ? O_CLOEXEC : 0, 2068 &fdep[i]->fde_caps); 2069 unp_externalize_fp(fdep[i]->fde_file); 2070 } 2071 2072 /* 2073 * The new type indicates that the mbuf data refers to 2074 * kernel resources that may need to be released before 2075 * the mbuf is freed. 2076 */ 2077 m_chtype(*controlp, MT_EXTCONTROL); 2078 FILEDESC_XUNLOCK(fdesc); 2079 free(fdep[0], M_FILECAPS); 2080 } else { 2081 /* We can just copy anything else across. 
#ifdef INVARIANTS
/*
 * UMA destructor for the unpcb zone, compiled only with INVARIANTS.
 *
 * Verifies that a PCB being returned to the zone has been fully torn
 * down: no sockets still reference it, it has no socket or vnode
 * back-pointer, it is not connected, and its bound address was released.
 * 'size' and 'arg' are unused.
 */
static void
unp_zdtor(void *mem, int size __unused, void *arg __unused)
{
	struct unpcb *const unp = mem;

	KASSERT(LIST_EMPTY(&unp->unp_refs),
	    ("%s: unpcb %p has lingering refs", __func__, unp));
	KASSERT(unp->unp_socket == NULL,
	    ("%s: unpcb %p has socket backpointer", __func__, unp));
	KASSERT(unp->unp_vnode == NULL,
	    ("%s: unpcb %p has vnode references", __func__, unp));
	KASSERT(unp->unp_conn == NULL,
	    ("%s: unpcb %p is still connected", __func__, unp));
	KASSERT(unp->unp_addr == NULL,
	    ("%s: unpcb %p has leaked addr", __func__, unp));
}
#endif
SLIST_INIT(&unp_defers); 2159 TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL); 2160 TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL); 2161 UNP_LINK_LOCK_INIT(); 2162 UNP_DEFERRED_LOCK_INIT(); 2163 } 2164 SYSINIT(unp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, unp_init, NULL); 2165 2166 static void 2167 unp_internalize_cleanup_rights(struct mbuf *control) 2168 { 2169 struct cmsghdr *cp; 2170 struct mbuf *m; 2171 void *data; 2172 socklen_t datalen; 2173 2174 for (m = control; m != NULL; m = m->m_next) { 2175 cp = mtod(m, struct cmsghdr *); 2176 if (cp->cmsg_level != SOL_SOCKET || 2177 cp->cmsg_type != SCM_RIGHTS) 2178 continue; 2179 data = CMSG_DATA(cp); 2180 datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data; 2181 unp_freerights(data, datalen / sizeof(struct filedesc *)); 2182 } 2183 } 2184 2185 static int 2186 unp_internalize(struct mbuf **controlp, struct thread *td) 2187 { 2188 struct mbuf *control, **initial_controlp; 2189 struct proc *p; 2190 struct filedesc *fdesc; 2191 struct bintime *bt; 2192 struct cmsghdr *cm; 2193 struct cmsgcred *cmcred; 2194 struct filedescent *fde, **fdep, *fdev; 2195 struct file *fp; 2196 struct timeval *tv; 2197 struct timespec *ts; 2198 void *data; 2199 socklen_t clen, datalen; 2200 int i, j, error, *fdp, oldfds; 2201 u_int newlen; 2202 2203 UNP_LINK_UNLOCK_ASSERT(); 2204 2205 p = td->td_proc; 2206 fdesc = p->p_fd; 2207 error = 0; 2208 control = *controlp; 2209 clen = control->m_len; 2210 *controlp = NULL; 2211 initial_controlp = controlp; 2212 for (cm = mtod(control, struct cmsghdr *); cm != NULL;) { 2213 if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET 2214 || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) { 2215 error = EINVAL; 2216 goto out; 2217 } 2218 data = CMSG_DATA(cm); 2219 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; 2220 2221 switch (cm->cmsg_type) { 2222 /* 2223 * Fill in credential information. 
2224 */ 2225 case SCM_CREDS: 2226 *controlp = sbcreatecontrol_how(NULL, sizeof(*cmcred), 2227 SCM_CREDS, SOL_SOCKET, M_WAITOK); 2228 cmcred = (struct cmsgcred *) 2229 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 2230 cmcred->cmcred_pid = p->p_pid; 2231 cmcred->cmcred_uid = td->td_ucred->cr_ruid; 2232 cmcred->cmcred_gid = td->td_ucred->cr_rgid; 2233 cmcred->cmcred_euid = td->td_ucred->cr_uid; 2234 cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, 2235 CMGROUP_MAX); 2236 for (i = 0; i < cmcred->cmcred_ngroups; i++) 2237 cmcred->cmcred_groups[i] = 2238 td->td_ucred->cr_groups[i]; 2239 break; 2240 2241 case SCM_RIGHTS: 2242 oldfds = datalen / sizeof (int); 2243 if (oldfds == 0) 2244 break; 2245 /* 2246 * Check that all the FDs passed in refer to legal 2247 * files. If not, reject the entire operation. 2248 */ 2249 fdp = data; 2250 FILEDESC_SLOCK(fdesc); 2251 for (i = 0; i < oldfds; i++, fdp++) { 2252 fp = fget_noref(fdesc, *fdp); 2253 if (fp == NULL) { 2254 FILEDESC_SUNLOCK(fdesc); 2255 error = EBADF; 2256 goto out; 2257 } 2258 if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { 2259 FILEDESC_SUNLOCK(fdesc); 2260 error = EOPNOTSUPP; 2261 goto out; 2262 } 2263 } 2264 2265 /* 2266 * Now replace the integer FDs with pointers to the 2267 * file structure and capability rights. 2268 */ 2269 newlen = oldfds * sizeof(fdep[0]); 2270 *controlp = sbcreatecontrol_how(NULL, newlen, 2271 SCM_RIGHTS, SOL_SOCKET, M_WAITOK); 2272 fdp = data; 2273 for (i = 0; i < oldfds; i++, fdp++) { 2274 if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) { 2275 fdp = data; 2276 for (j = 0; j < i; j++, fdp++) { 2277 fdrop(fdesc->fd_ofiles[*fdp]. 
2278 fde_file, td); 2279 } 2280 FILEDESC_SUNLOCK(fdesc); 2281 error = EBADF; 2282 goto out; 2283 } 2284 } 2285 fdp = data; 2286 fdep = (struct filedescent **) 2287 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 2288 fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS, 2289 M_WAITOK); 2290 for (i = 0; i < oldfds; i++, fdev++, fdp++) { 2291 fde = &fdesc->fd_ofiles[*fdp]; 2292 fdep[i] = fdev; 2293 fdep[i]->fde_file = fde->fde_file; 2294 filecaps_copy(&fde->fde_caps, 2295 &fdep[i]->fde_caps, true); 2296 unp_internalize_fp(fdep[i]->fde_file); 2297 } 2298 FILEDESC_SUNLOCK(fdesc); 2299 break; 2300 2301 case SCM_TIMESTAMP: 2302 *controlp = sbcreatecontrol_how(NULL, sizeof(*tv), 2303 SCM_TIMESTAMP, SOL_SOCKET, M_WAITOK); 2304 tv = (struct timeval *) 2305 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 2306 microtime(tv); 2307 break; 2308 2309 case SCM_BINTIME: 2310 *controlp = sbcreatecontrol_how(NULL, sizeof(*bt), 2311 SCM_BINTIME, SOL_SOCKET, M_WAITOK); 2312 bt = (struct bintime *) 2313 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 2314 bintime(bt); 2315 break; 2316 2317 case SCM_REALTIME: 2318 *controlp = sbcreatecontrol_how(NULL, sizeof(*ts), 2319 SCM_REALTIME, SOL_SOCKET, M_WAITOK); 2320 ts = (struct timespec *) 2321 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 2322 nanotime(ts); 2323 break; 2324 2325 case SCM_MONOTONIC: 2326 *controlp = sbcreatecontrol_how(NULL, sizeof(*ts), 2327 SCM_MONOTONIC, SOL_SOCKET, M_WAITOK); 2328 ts = (struct timespec *) 2329 CMSG_DATA(mtod(*controlp, struct cmsghdr *)); 2330 nanouptime(ts); 2331 break; 2332 2333 default: 2334 error = EINVAL; 2335 goto out; 2336 } 2337 2338 if (*controlp != NULL) 2339 controlp = &(*controlp)->m_next; 2340 if (CMSG_SPACE(datalen) < clen) { 2341 clen -= CMSG_SPACE(datalen); 2342 cm = (struct cmsghdr *) 2343 ((caddr_t)cm + CMSG_SPACE(datalen)); 2344 } else { 2345 clen = 0; 2346 cm = NULL; 2347 } 2348 } 2349 2350 out: 2351 if (error != 0 && initial_controlp != NULL) 2352 
unp_internalize_cleanup_rights(*initial_controlp); 2353 m_freem(control); 2354 return (error); 2355 } 2356 2357 static struct mbuf * 2358 unp_addsockcred(struct thread *td, struct mbuf *control, int mode) 2359 { 2360 struct mbuf *m, *n, *n_prev; 2361 const struct cmsghdr *cm; 2362 int ngroups, i, cmsgtype; 2363 size_t ctrlsz; 2364 2365 ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); 2366 if (mode & UNP_WANTCRED_ALWAYS) { 2367 ctrlsz = SOCKCRED2SIZE(ngroups); 2368 cmsgtype = SCM_CREDS2; 2369 } else { 2370 ctrlsz = SOCKCREDSIZE(ngroups); 2371 cmsgtype = SCM_CREDS; 2372 } 2373 2374 m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET); 2375 if (m == NULL) 2376 return (control); 2377 2378 if (mode & UNP_WANTCRED_ALWAYS) { 2379 struct sockcred2 *sc; 2380 2381 sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *)); 2382 sc->sc_version = 0; 2383 sc->sc_pid = td->td_proc->p_pid; 2384 sc->sc_uid = td->td_ucred->cr_ruid; 2385 sc->sc_euid = td->td_ucred->cr_uid; 2386 sc->sc_gid = td->td_ucred->cr_rgid; 2387 sc->sc_egid = td->td_ucred->cr_gid; 2388 sc->sc_ngroups = ngroups; 2389 for (i = 0; i < sc->sc_ngroups; i++) 2390 sc->sc_groups[i] = td->td_ucred->cr_groups[i]; 2391 } else { 2392 struct sockcred *sc; 2393 2394 sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *)); 2395 sc->sc_uid = td->td_ucred->cr_ruid; 2396 sc->sc_euid = td->td_ucred->cr_uid; 2397 sc->sc_gid = td->td_ucred->cr_rgid; 2398 sc->sc_egid = td->td_ucred->cr_gid; 2399 sc->sc_ngroups = ngroups; 2400 for (i = 0; i < sc->sc_ngroups; i++) 2401 sc->sc_groups[i] = td->td_ucred->cr_groups[i]; 2402 } 2403 2404 /* 2405 * Unlink SCM_CREDS control messages (struct cmsgcred), since just 2406 * created SCM_CREDS control message (struct sockcred) has another 2407 * format. 
2408 */ 2409 if (control != NULL && cmsgtype == SCM_CREDS) 2410 for (n = control, n_prev = NULL; n != NULL;) { 2411 cm = mtod(n, struct cmsghdr *); 2412 if (cm->cmsg_level == SOL_SOCKET && 2413 cm->cmsg_type == SCM_CREDS) { 2414 if (n_prev == NULL) 2415 control = n->m_next; 2416 else 2417 n_prev->m_next = n->m_next; 2418 n = m_free(n); 2419 } else { 2420 n_prev = n; 2421 n = n->m_next; 2422 } 2423 } 2424 2425 /* Prepend it to the head. */ 2426 m->m_next = control; 2427 return (m); 2428 } 2429 2430 static struct unpcb * 2431 fptounp(struct file *fp) 2432 { 2433 struct socket *so; 2434 2435 if (fp->f_type != DTYPE_SOCKET) 2436 return (NULL); 2437 if ((so = fp->f_data) == NULL) 2438 return (NULL); 2439 if (so->so_proto->pr_domain != &localdomain) 2440 return (NULL); 2441 return sotounpcb(so); 2442 } 2443 2444 static void 2445 unp_discard(struct file *fp) 2446 { 2447 struct unp_defer *dr; 2448 2449 if (unp_externalize_fp(fp)) { 2450 dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK); 2451 dr->ud_fp = fp; 2452 UNP_DEFERRED_LOCK(); 2453 SLIST_INSERT_HEAD(&unp_defers, dr, ud_link); 2454 UNP_DEFERRED_UNLOCK(); 2455 atomic_add_int(&unp_defers_count, 1); 2456 taskqueue_enqueue(taskqueue_thread, &unp_defer_task); 2457 } else 2458 closef_nothread(fp); 2459 } 2460 2461 static void 2462 unp_process_defers(void *arg __unused, int pending) 2463 { 2464 struct unp_defer *dr; 2465 SLIST_HEAD(, unp_defer) drl; 2466 int count; 2467 2468 SLIST_INIT(&drl); 2469 for (;;) { 2470 UNP_DEFERRED_LOCK(); 2471 if (SLIST_FIRST(&unp_defers) == NULL) { 2472 UNP_DEFERRED_UNLOCK(); 2473 break; 2474 } 2475 SLIST_SWAP(&unp_defers, &drl, unp_defer); 2476 UNP_DEFERRED_UNLOCK(); 2477 count = 0; 2478 while ((dr = SLIST_FIRST(&drl)) != NULL) { 2479 SLIST_REMOVE_HEAD(&drl, ud_link); 2480 closef_nothread(dr->ud_fp); 2481 free(dr, M_TEMP); 2482 count++; 2483 } 2484 atomic_add_int(&unp_defers_count, -count); 2485 } 2486 } 2487 2488 static void 2489 unp_internalize_fp(struct file *fp) 2490 { 2491 struct unpcb *unp; 2492 
2493 UNP_LINK_WLOCK(); 2494 if ((unp = fptounp(fp)) != NULL) { 2495 unp->unp_file = fp; 2496 unp->unp_msgcount++; 2497 } 2498 unp_rights++; 2499 UNP_LINK_WUNLOCK(); 2500 } 2501 2502 static int 2503 unp_externalize_fp(struct file *fp) 2504 { 2505 struct unpcb *unp; 2506 int ret; 2507 2508 UNP_LINK_WLOCK(); 2509 if ((unp = fptounp(fp)) != NULL) { 2510 unp->unp_msgcount--; 2511 ret = 1; 2512 } else 2513 ret = 0; 2514 unp_rights--; 2515 UNP_LINK_WUNLOCK(); 2516 return (ret); 2517 } 2518 2519 /* 2520 * unp_defer indicates whether additional work has been defered for a future 2521 * pass through unp_gc(). It is thread local and does not require explicit 2522 * synchronization. 2523 */ 2524 static int unp_marked; 2525 2526 static void 2527 unp_remove_dead_ref(struct filedescent **fdep, int fdcount) 2528 { 2529 struct unpcb *unp; 2530 struct file *fp; 2531 int i; 2532 2533 /* 2534 * This function can only be called from the gc task. 2535 */ 2536 KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0, 2537 ("%s: not on gc callout", __func__)); 2538 UNP_LINK_LOCK_ASSERT(); 2539 2540 for (i = 0; i < fdcount; i++) { 2541 fp = fdep[i]->fde_file; 2542 if ((unp = fptounp(fp)) == NULL) 2543 continue; 2544 if ((unp->unp_gcflag & UNPGC_DEAD) == 0) 2545 continue; 2546 unp->unp_gcrefs--; 2547 } 2548 } 2549 2550 static void 2551 unp_restore_undead_ref(struct filedescent **fdep, int fdcount) 2552 { 2553 struct unpcb *unp; 2554 struct file *fp; 2555 int i; 2556 2557 /* 2558 * This function can only be called from the gc task. 
2559 */ 2560 KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0, 2561 ("%s: not on gc callout", __func__)); 2562 UNP_LINK_LOCK_ASSERT(); 2563 2564 for (i = 0; i < fdcount; i++) { 2565 fp = fdep[i]->fde_file; 2566 if ((unp = fptounp(fp)) == NULL) 2567 continue; 2568 if ((unp->unp_gcflag & UNPGC_DEAD) == 0) 2569 continue; 2570 unp->unp_gcrefs++; 2571 unp_marked++; 2572 } 2573 } 2574 2575 static void 2576 unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int)) 2577 { 2578 struct socket *so, *soa; 2579 2580 so = unp->unp_socket; 2581 SOCK_LOCK(so); 2582 if (SOLISTENING(so)) { 2583 /* 2584 * Mark all sockets in our accept queue. 2585 */ 2586 TAILQ_FOREACH(soa, &so->sol_comp, so_list) { 2587 if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) 2588 continue; 2589 SOCKBUF_LOCK(&soa->so_rcv); 2590 unp_scan(soa->so_rcv.sb_mb, op); 2591 SOCKBUF_UNLOCK(&soa->so_rcv); 2592 } 2593 } else { 2594 /* 2595 * Mark all sockets we reference with RIGHTS. 2596 */ 2597 if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { 2598 SOCKBUF_LOCK(&so->so_rcv); 2599 unp_scan(so->so_rcv.sb_mb, op); 2600 SOCKBUF_UNLOCK(&so->so_rcv); 2601 } 2602 } 2603 SOCK_UNLOCK(so); 2604 } 2605 2606 static int unp_recycled; 2607 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, 2608 "Number of unreachable sockets claimed by the garbage collector."); 2609 2610 static int unp_taskcount; 2611 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, 2612 "Number of times the garbage collector has run."); 2613 2614 SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0, 2615 "Number of active local sockets."); 2616 2617 static void 2618 unp_gc(__unused void *arg, int pending) 2619 { 2620 struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead, 2621 NULL }; 2622 struct unp_head **head; 2623 struct unp_head unp_deadhead; /* List of potentially-dead sockets. 
*/ 2624 struct file *f, **unref; 2625 struct unpcb *unp, *unptmp; 2626 int i, total, unp_unreachable; 2627 2628 LIST_INIT(&unp_deadhead); 2629 unp_taskcount++; 2630 UNP_LINK_RLOCK(); 2631 /* 2632 * First determine which sockets may be in cycles. 2633 */ 2634 unp_unreachable = 0; 2635 2636 for (head = heads; *head != NULL; head++) 2637 LIST_FOREACH(unp, *head, unp_link) { 2638 KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0, 2639 ("%s: unp %p has unexpected gc flags 0x%x", 2640 __func__, unp, (unsigned int)unp->unp_gcflag)); 2641 2642 f = unp->unp_file; 2643 2644 /* 2645 * Check for an unreachable socket potentially in a 2646 * cycle. It must be in a queue as indicated by 2647 * msgcount, and this must equal the file reference 2648 * count. Note that when msgcount is 0 the file is 2649 * NULL. 2650 */ 2651 if (f != NULL && unp->unp_msgcount != 0 && 2652 refcount_load(&f->f_count) == unp->unp_msgcount) { 2653 LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead); 2654 unp->unp_gcflag |= UNPGC_DEAD; 2655 unp->unp_gcrefs = unp->unp_msgcount; 2656 unp_unreachable++; 2657 } 2658 } 2659 2660 /* 2661 * Scan all sockets previously marked as potentially being in a cycle 2662 * and remove the references each socket holds on any UNPGC_DEAD 2663 * sockets in its queue. After this step, all remaining references on 2664 * sockets marked UNPGC_DEAD should not be part of any cycle. 2665 */ 2666 LIST_FOREACH(unp, &unp_deadhead, unp_dead) 2667 unp_gc_scan(unp, unp_remove_dead_ref); 2668 2669 /* 2670 * If a socket still has a non-negative refcount, it cannot be in a 2671 * cycle. In this case increment refcount of all children iteratively. 2672 * Stop the scan once we do a complete loop without discovering 2673 * a new reachable socket. 
2674 */ 2675 do { 2676 unp_marked = 0; 2677 LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp) 2678 if (unp->unp_gcrefs > 0) { 2679 unp->unp_gcflag &= ~UNPGC_DEAD; 2680 LIST_REMOVE(unp, unp_dead); 2681 KASSERT(unp_unreachable > 0, 2682 ("%s: unp_unreachable underflow.", 2683 __func__)); 2684 unp_unreachable--; 2685 unp_gc_scan(unp, unp_restore_undead_ref); 2686 } 2687 } while (unp_marked); 2688 2689 UNP_LINK_RUNLOCK(); 2690 2691 if (unp_unreachable == 0) 2692 return; 2693 2694 /* 2695 * Allocate space for a local array of dead unpcbs. 2696 * TODO: can this path be simplified by instead using the local 2697 * dead list at unp_deadhead, after taking out references 2698 * on the file object and/or unpcb and dropping the link lock? 2699 */ 2700 unref = malloc(unp_unreachable * sizeof(struct file *), 2701 M_TEMP, M_WAITOK); 2702 2703 /* 2704 * Iterate looking for sockets which have been specifically marked 2705 * as unreachable and store them locally. 2706 */ 2707 UNP_LINK_RLOCK(); 2708 total = 0; 2709 LIST_FOREACH(unp, &unp_deadhead, unp_dead) { 2710 KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0, 2711 ("%s: unp %p not marked UNPGC_DEAD", __func__, unp)); 2712 unp->unp_gcflag &= ~UNPGC_DEAD; 2713 f = unp->unp_file; 2714 if (unp->unp_msgcount == 0 || f == NULL || 2715 refcount_load(&f->f_count) != unp->unp_msgcount || 2716 !fhold(f)) 2717 continue; 2718 unref[total++] = f; 2719 KASSERT(total <= unp_unreachable, 2720 ("%s: incorrect unreachable count.", __func__)); 2721 } 2722 UNP_LINK_RUNLOCK(); 2723 2724 /* 2725 * Now flush all sockets, free'ing rights. This will free the 2726 * struct files associated with these sockets but leave each socket 2727 * with one remaining ref. 2728 */ 2729 for (i = 0; i < total; i++) { 2730 struct socket *so; 2731 2732 so = unref[i]->f_data; 2733 CURVNET_SET(so->so_vnet); 2734 sorflush(so); 2735 CURVNET_RESTORE(); 2736 } 2737 2738 /* 2739 * And finally release the sockets so they can be reclaimed. 
2740 */ 2741 for (i = 0; i < total; i++) 2742 fdrop(unref[i], NULL); 2743 unp_recycled += total; 2744 free(unref, M_TEMP); 2745 } 2746 2747 static void 2748 unp_dispose_mbuf(struct mbuf *m) 2749 { 2750 2751 if (m) 2752 unp_scan(m, unp_freerights); 2753 } 2754 2755 /* 2756 * Synchronize against unp_gc, which can trip over data as we are freeing it. 2757 */ 2758 static void 2759 unp_dispose(struct socket *so) 2760 { 2761 struct sockbuf *sb = &so->so_rcv; 2762 struct unpcb *unp; 2763 struct mbuf *m; 2764 2765 MPASS(!SOLISTENING(so)); 2766 2767 unp = sotounpcb(so); 2768 UNP_LINK_WLOCK(); 2769 unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; 2770 UNP_LINK_WUNLOCK(); 2771 2772 /* 2773 * Grab our special mbufs before calling sbrelease(). 2774 */ 2775 SOCK_RECVBUF_LOCK(so); 2776 m = sbcut_locked(sb, sb->sb_ccc); 2777 KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0, 2778 ("%s: ccc %u mb %p mbcnt %u", __func__, 2779 sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt)); 2780 sbrelease_locked(so, SO_RCV); 2781 SOCK_RECVBUF_UNLOCK(so); 2782 if (SOCK_IO_RECV_OWNED(so)) 2783 SOCK_IO_RECV_UNLOCK(so); 2784 2785 unp_dispose_mbuf(m); 2786 } 2787 2788 static void 2789 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int)) 2790 { 2791 struct mbuf *m; 2792 struct cmsghdr *cm; 2793 void *data; 2794 socklen_t clen, datalen; 2795 2796 while (m0 != NULL) { 2797 for (m = m0; m; m = m->m_next) { 2798 if (m->m_type != MT_CONTROL) 2799 continue; 2800 2801 cm = mtod(m, struct cmsghdr *); 2802 clen = m->m_len; 2803 2804 while (cm != NULL) { 2805 if (sizeof(*cm) > clen || cm->cmsg_len > clen) 2806 break; 2807 2808 data = CMSG_DATA(cm); 2809 datalen = (caddr_t)cm + cm->cmsg_len 2810 - (caddr_t)data; 2811 2812 if (cm->cmsg_level == SOL_SOCKET && 2813 cm->cmsg_type == SCM_RIGHTS) { 2814 (*op)(data, datalen / 2815 sizeof(struct filedescent *)); 2816 } 2817 2818 if (CMSG_SPACE(datalen) < clen) { 2819 clen -= CMSG_SPACE(datalen); 2820 cm = (struct cmsghdr *) 2821 ((caddr_t)cm + 
CMSG_SPACE(datalen)); 2822 } else { 2823 clen = 0; 2824 cm = NULL; 2825 } 2826 } 2827 } 2828 m0 = m0->m_nextpkt; 2829 } 2830 } 2831 2832 /* 2833 * A helper function called by VFS before socket-type vnode reclamation. 2834 * For an active vnode it clears unp_vnode pointer and decrements unp_vnode 2835 * use count. 2836 */ 2837 void 2838 vfs_unp_reclaim(struct vnode *vp) 2839 { 2840 struct unpcb *unp; 2841 int active; 2842 struct mtx *vplock; 2843 2844 ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim"); 2845 KASSERT(vp->v_type == VSOCK, 2846 ("vfs_unp_reclaim: vp->v_type != VSOCK")); 2847 2848 active = 0; 2849 vplock = mtx_pool_find(mtxpool_sleep, vp); 2850 mtx_lock(vplock); 2851 VOP_UNP_CONNECT(vp, &unp); 2852 if (unp == NULL) 2853 goto done; 2854 UNP_PCB_LOCK(unp); 2855 if (unp->unp_vnode == vp) { 2856 VOP_UNP_DETACH(vp); 2857 unp->unp_vnode = NULL; 2858 active = 1; 2859 } 2860 UNP_PCB_UNLOCK(unp); 2861 done: 2862 mtx_unlock(vplock); 2863 if (active) 2864 vunref(vp); 2865 } 2866 2867 #ifdef DDB 2868 static void 2869 db_print_indent(int indent) 2870 { 2871 int i; 2872 2873 for (i = 0; i < indent; i++) 2874 db_printf(" "); 2875 } 2876 2877 static void 2878 db_print_unpflags(int unp_flags) 2879 { 2880 int comma; 2881 2882 comma = 0; 2883 if (unp_flags & UNP_HAVEPC) { 2884 db_printf("%sUNP_HAVEPC", comma ? ", " : ""); 2885 comma = 1; 2886 } 2887 if (unp_flags & UNP_WANTCRED_ALWAYS) { 2888 db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : ""); 2889 comma = 1; 2890 } 2891 if (unp_flags & UNP_WANTCRED_ONESHOT) { 2892 db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : ""); 2893 comma = 1; 2894 } 2895 if (unp_flags & UNP_CONNWAIT) { 2896 db_printf("%sUNP_CONNWAIT", comma ? ", " : ""); 2897 comma = 1; 2898 } 2899 if (unp_flags & UNP_CONNECTING) { 2900 db_printf("%sUNP_CONNECTING", comma ? ", " : ""); 2901 comma = 1; 2902 } 2903 if (unp_flags & UNP_BINDING) { 2904 db_printf("%sUNP_BINDING", comma ? 
", " : ""); 2905 comma = 1; 2906 } 2907 } 2908 2909 static void 2910 db_print_xucred(int indent, struct xucred *xu) 2911 { 2912 int comma, i; 2913 2914 db_print_indent(indent); 2915 db_printf("cr_version: %u cr_uid: %u cr_pid: %d cr_ngroups: %d\n", 2916 xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups); 2917 db_print_indent(indent); 2918 db_printf("cr_groups: "); 2919 comma = 0; 2920 for (i = 0; i < xu->cr_ngroups; i++) { 2921 db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]); 2922 comma = 1; 2923 } 2924 db_printf("\n"); 2925 } 2926 2927 static void 2928 db_print_unprefs(int indent, struct unp_head *uh) 2929 { 2930 struct unpcb *unp; 2931 int counter; 2932 2933 counter = 0; 2934 LIST_FOREACH(unp, uh, unp_reflink) { 2935 if (counter % 4 == 0) 2936 db_print_indent(indent); 2937 db_printf("%p ", unp); 2938 if (counter % 4 == 3) 2939 db_printf("\n"); 2940 counter++; 2941 } 2942 if (counter != 0 && counter % 4 != 0) 2943 db_printf("\n"); 2944 } 2945 2946 DB_SHOW_COMMAND(unpcb, db_show_unpcb) 2947 { 2948 struct unpcb *unp; 2949 2950 if (!have_addr) { 2951 db_printf("usage: show unpcb <addr>\n"); 2952 return; 2953 } 2954 unp = (struct unpcb *)addr; 2955 2956 db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket, 2957 unp->unp_vnode); 2958 2959 db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino, 2960 unp->unp_conn); 2961 2962 db_printf("unp_refs:\n"); 2963 db_print_unprefs(2, &unp->unp_refs); 2964 2965 /* XXXRW: Would be nice to print the full address, if any. */ 2966 db_printf("unp_addr: %p\n", unp->unp_addr); 2967 2968 db_printf("unp_gencnt: %llu\n", 2969 (unsigned long long)unp->unp_gencnt); 2970 2971 db_printf("unp_flags: %x (", unp->unp_flags); 2972 db_print_unpflags(unp->unp_flags); 2973 db_printf(")\n"); 2974 2975 db_printf("unp_peercred:\n"); 2976 db_print_xucred(2, &unp->unp_peercred); 2977 2978 db_printf("unp_refcount: %u\n", unp->unp_refcount); 2979 } 2980 #endif 2981