xref: /freebsd/sys/kern/uipc_usrreq.c (revision f2530c80db7b29b95368fce956b3a778f096b368)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1989, 1991, 1993
5  *	The Regents of the University of California. All Rights Reserved.
6  * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
7  * Copyright (c) 2018 Matthew Macy
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
34  */
35 
36 /*
37  * UNIX Domain (Local) Sockets
38  *
39  * This is an implementation of UNIX (local) domain sockets.  Each socket has
40  * an associated struct unpcb (UNIX protocol control block).  Stream sockets
41  * may be connected to 0 or 1 other socket.  Datagram sockets may be
42  * connected to 0, 1, or many other sockets.  Sockets may be created and
43  * connected in pairs (socketpair(2)), or bound/connected to using the file
44  * system name space.  For most purposes, only the receive socket buffer is
45  * used, as sending on one socket delivers directly to the receive socket
46  * buffer of a second socket.
47  *
48  * The implementation is substantially complicated by the fact that
49  * "ancillary data", such as file descriptors or credentials, may be passed
50  * across UNIX domain sockets.  The potential for passing UNIX domain sockets
51  * over other UNIX domain sockets requires the implementation of a simple
52  * garbage collector to find and tear down cycles of disconnected sockets.
53  *
54  * TODO:
55  *	RDM
56  *	rethink name space problems
57  *	need a proper out-of-band
58  */
59 
60 #include <sys/cdefs.h>
61 __FBSDID("$FreeBSD$");
62 
63 #include "opt_ddb.h"
64 
65 #include <sys/param.h>
66 #include <sys/capsicum.h>
67 #include <sys/domain.h>
68 #include <sys/fcntl.h>
69 #include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
70 #include <sys/eventhandler.h>
71 #include <sys/file.h>
72 #include <sys/filedesc.h>
73 #include <sys/kernel.h>
74 #include <sys/lock.h>
75 #include <sys/mbuf.h>
76 #include <sys/mount.h>
77 #include <sys/mutex.h>
78 #include <sys/namei.h>
79 #include <sys/proc.h>
80 #include <sys/protosw.h>
81 #include <sys/queue.h>
82 #include <sys/resourcevar.h>
83 #include <sys/rwlock.h>
84 #include <sys/socket.h>
85 #include <sys/socketvar.h>
86 #include <sys/signalvar.h>
87 #include <sys/stat.h>
88 #include <sys/sx.h>
89 #include <sys/sysctl.h>
90 #include <sys/systm.h>
91 #include <sys/taskqueue.h>
92 #include <sys/un.h>
93 #include <sys/unpcb.h>
94 #include <sys/vnode.h>
95 
96 #include <net/vnet.h>
97 
98 #ifdef DDB
99 #include <ddb/ddb.h>
100 #endif
101 
102 #include <security/mac/mac_framework.h>
103 
104 #include <vm/uma.h>
105 
106 MALLOC_DECLARE(M_FILECAPS);
107 
108 /*
109  * Locking key:
110  * (l)	Locked using list lock
111  * (g)	Locked using linkage lock
112  */
113 
/*
 * File-scope state of the local domain.  unp_gencnt, unp_count and the
 * per-type lists are modified by uipc_attach() and uipc_detach() with the
 * linkage rwlock write-held.
 */
114 static uma_zone_t	unp_zone;	/* Zone unpcbs are allocated from. */
115 static unp_gen_t	unp_gencnt;	/* (l) Bumped by attach and detach. */
116 static u_int		unp_count;	/* (l) Count of local sockets. */
117 static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
118 static int		unp_rights;	/* (g) File descriptors in flight. */
119 static struct unp_head	unp_shead;	/* (l) List of stream sockets. */
120 static struct unp_head	unp_dhead;	/* (l) List of datagram sockets. */
121 static struct unp_head	unp_sphead;	/* (l) List of seqpacket sockets. */
122 
/*
 * One queued file pointer on the unp_defers list; unp_defers_count is
 * exported read-only as net.local.deferred.
 */
123 struct unp_defer {
124 	SLIST_ENTRY(unp_defer) ud_link;
125 	struct file *ud_fp;
126 };
127 static SLIST_HEAD(, unp_defer) unp_defers;
128 static int unp_defers_count;
129 
/*
 * Placeholder address copied out by uipc_accept() and uipc_peeraddr() when
 * there is no peer or the peer has no bound address.
 */
130 static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };
131 
132 /*
133  * Garbage collection of cyclic file descriptor/socket references occurs
134  * asynchronously in a taskqueue context in order to avoid recursion and
135  * reentrance in the UNIX domain socket, file descriptor, and socket layer
136  * code.  See unp_gc() for a full description.  uipc_detach() enqueues this
 * task on taskqueue_thread when descriptors are in flight at detach time.
137  */
138 static struct timeout_task unp_gc_task;
139 
140 /*
141  * The close of unix domain sockets attached as SCM_RIGHTS is
142  * postponed to the taskqueue, to avoid arbitrary recursion depth.
143  * The attached sockets might themselves have other sockets attached.
144  */
145 static struct task	unp_defer_task;
146 
147 /*
148  * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
149  * stream sockets, although the total for sender and receiver is actually
150  * only PIPSIZ.
151  *
152  * Datagram sockets really use the sendspace as the maximum datagram size,
153  * and don't really want to reserve the sendspace.  Their recvspace should be
154  * large enough for at least one max-size datagram plus address.
 *
 * These values are the defaults uipc_attach() passes to soreserve() when a
 * new socket has no buffer high-water mark yet; each one is also exposed as
 * a read-write sysctl under net.local.
155  */
/* PIPSIZ may be predefined by the build; 8192 is only the fallback. */
156 #ifndef PIPSIZ
157 #define	PIPSIZ	8192
158 #endif
159 static u_long	unpst_sendspace = PIPSIZ;
160 static u_long	unpst_recvspace = PIPSIZ;
161 static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
162 static u_long	unpdg_recvspace = 4*1024;
163 static u_long	unpsp_sendspace = PIPSIZ;	/* really max datagram size */
164 static u_long	unpsp_recvspace = PIPSIZ;
165 
/*
 * sysctl tree: net.local with one child node per socket type
 * (stream, dgram, seqpacket).
 */
166 static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
167 static SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0,
168     "SOCK_STREAM");
169 static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
170 static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW, 0,
171     "SOCK_SEQPACKET");
172 
/* Read-write knobs for the default buffer sizes declared above. */
173 SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
174 	   &unpst_sendspace, 0, "Default stream send space.");
175 SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
176 	   &unpst_recvspace, 0, "Default stream receive space.");
177 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
178 	   &unpdg_sendspace, 0, "Default datagram send space.");
179 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
180 	   &unpdg_recvspace, 0, "Default datagram receive space.");
181 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
182 	   &unpsp_sendspace, 0, "Default seqpacket send space.");
183 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
184 	   &unpsp_recvspace, 0, "Default seqpacket receive space.");
/* Read-only counters: net.local.inflight and net.local.deferred. */
185 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
186     "File descriptors in flight.");
187 SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
188     &unp_defers_count, 0,
189     "File descriptors deferred to taskqueue for close.");
190 
191 /*
192  * Locking and synchronization:
193  *
194  * Three types of locks exist in the local domain socket implementation: a
195  * global linkage rwlock, the mtxpool lock, and per-unpcb mutexes.
196  * The linkage lock protects the socket count, global generation number,
197  * and stream/datagram global lists.
198  *
199  * The mtxpool lock protects the vnode from being modified while referenced.
200  * Lock ordering requires that it be acquired before any unpcb locks.
201  *
202  * The unpcb lock (unp_mtx) protects all fields in the unpcb. Of particular
203  * note is that this includes the unp_conn field. So long as the unpcb lock
204  * is held, the reference to the unpcb pointed to by unp_conn is valid. If we
205  * require that the unpcb pointed to by unp_conn remain live in cases where
206  * we need to drop the unp_mtx (as when we need to acquire the lock for a
207  * second unpcb), the caller must first acquire an additional reference on the
208  * second unpcb and then revalidate any state (typically check that unp_conn
209  * is non-NULL) upon reacquiring the initial unpcb lock. The lock ordering
210  * between unpcbs is the conventional ascending address order. Two helper
211  * routines exist for this:
212  *
213  *   - unp_pcb_lock2(unp, unp2) - which just acquires the two locks in the
214  *     safe ordering.
215  *
216  *   - unp_pcb_owned_lock2(unp, unp2, freed) - the lock for unp is held
217  *     when called. If unp has to be unlocked to respect the lock order and
218  *     unp2 is freed in the meantime, freed is set to 1 and unp2 to NULL.
219  *
220  * The helper routines for references are:
221  *
222  *   - unp_pcb_hold(unp): Can be called any time we currently hold a valid
223  *     reference to unp.
224  *
225  *    - unp_pcb_rele(unp): The caller must hold the unp lock. If we are
226  *      releasing the last reference, detach must have been called, thus
227  *      unp->unp_socket must be NULL.
228  *
229  * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
230  * allocated in pru_attach() and freed in pru_detach().  The validity of that
231  * pointer is an invariant, so no lock is required to dereference the so_pcb
232  * pointer if a valid socket reference is held by the caller.  In practice,
233  * this is always true during operations performed on a socket.  Each unpcb
234  * has a back-pointer to its socket, unp_socket, which will be stable under
235  * the same circumstances.
236  *
237  * This pointer may only be safely dereferenced as long as a valid reference
238  * to the unpcb is held.  Typically, this reference will be from the socket,
239  * or from another unpcb when the referring unpcb's lock is held (in order
240  * that the reference not be invalidated during use).  For example, to follow
241  * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
242  * that detach is not run clearing unp_socket.
243  *
244  * Blocking with UNIX domain sockets is a tricky issue: unlike most network
245  * protocols, bind() is a non-atomic operation, and connect() requires
246  * potential sleeping in the protocol, due to potentially waiting on local or
247  * distributed file systems.  We try to separate "lookup" operations, which
248  * may sleep, and the IPC operations themselves, which typically can occur
249  * with relative atomicity as locks can be held over the entire operation.
250  *
251  * Another tricky issue is simultaneous multi-threaded or multi-process
252  * access to a single UNIX domain socket.  These are handled by the flags
253  * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
254  * binding, both of which involve dropping UNIX domain socket locks in order
255  * to perform namei() and other file system operations.
256  */
257 static struct rwlock	unp_link_rwlock;	/* Taken via UNP_LINK_*(). */
258 static struct mtx	unp_defers_lock;	/* UNP_DEFERRED_*() and UNP_REF_LIST_*(). */
259 
/*
 * Global linkage lock: a rwlock that uipc_attach()/uipc_detach() write-hold
 * while updating the socket lists, the socket count and the generation
 * number.
 */
#define	UNP_LINK_LOCK_INIT()		rw_init(&unp_link_rwlock,	\
					    "unp_link_rwlock")

#define	UNP_LINK_LOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
					    RA_LOCKED)
#define	UNP_LINK_UNLOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
					    RA_UNLOCKED)

#define	UNP_LINK_RLOCK()		rw_rlock(&unp_link_rwlock)
#define	UNP_LINK_RUNLOCK()		rw_runlock(&unp_link_rwlock)
#define	UNP_LINK_WLOCK()		rw_wlock(&unp_link_rwlock)
#define	UNP_LINK_WUNLOCK()		rw_wunlock(&unp_link_rwlock)
#define	UNP_LINK_WLOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
					    RA_WLOCKED)
#define	UNP_LINK_WOWNED()		rw_wowned(&unp_link_rwlock)

/* Mutex guarding the deferred-close bookkeeping. */
#define	UNP_DEFERRED_LOCK_INIT()	mtx_init(&unp_defers_lock, \
					    "unp_defer", NULL, MTX_DEF)
#define	UNP_DEFERRED_LOCK()		mtx_lock(&unp_defers_lock)
#define	UNP_DEFERRED_UNLOCK()		mtx_unlock(&unp_defers_lock)

/*
 * The per-pcb unp_refs lists share the deferred mutex.  These expand to a
 * single expression without a trailing semicolon, so they behave like
 * ordinary function calls in every statement context (including an
 * unbraced if/else).
 */
#define	UNP_REF_LIST_LOCK()		UNP_DEFERRED_LOCK()
#define	UNP_REF_LIST_UNLOCK()		UNP_DEFERRED_UNLOCK()

/*
 * Per-pcb mutex.  MTX_DUPOK is required because two pcb mutexes of the same
 * class are routinely held together (see unp_pcb_lock2()).
 */
#define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
					    "unp", "unp",	\
					    MTX_DUPOK|MTX_DEF)
#define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
#define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
#define	UNP_PCB_TRYLOCK(unp)		mtx_trylock(&(unp)->unp_mtx)
#define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
#define	UNP_PCB_OWNED(unp)		mtx_owned(&(unp)->unp_mtx)
#define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
#define	UNP_PCB_UNLOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)
294 
/* Forward declarations of file-local routines. */
295 static int	uipc_connect2(struct socket *, struct socket *);
296 static int	uipc_ctloutput(struct socket *, struct sockopt *);
297 static int	unp_connect(struct socket *, struct sockaddr *,
298 		    struct thread *);
299 static int	unp_connectat(int, struct socket *, struct sockaddr *,
300 		    struct thread *);
301 static int	unp_connect2(struct socket *so, struct socket *so2, int);
302 static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
303 static void	unp_dispose(struct socket *so);
304 static void	unp_dispose_mbuf(struct mbuf *);
305 static void	unp_shutdown(struct unpcb *);
306 static void	unp_drop(struct unpcb *);
307 static void	unp_gc(__unused void *, int);
308 static void	unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
309 static void	unp_discard(struct file *);
310 static void	unp_freerights(struct filedescent **, int);
311 static void	unp_init(void);
312 static int	unp_internalize(struct mbuf **, struct thread *);
313 static void	unp_internalize_fp(struct file *);
314 static int	unp_externalize(struct mbuf *, struct mbuf **, int);
315 static int	unp_externalize_fp(struct file *);
316 static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *);
317 static void	unp_process_defers(void * __unused, int);
318 
319 static void
320 unp_pcb_hold(struct unpcb *unp)
321 {
322 	MPASS(unp->unp_refcount);
323 	refcount_acquire(&unp->unp_refcount);
324 }
325 
326 static int
327 unp_pcb_rele(struct unpcb *unp)
328 {
329 	int freed;
330 
331 	UNP_PCB_LOCK_ASSERT(unp);
332 	MPASS(unp->unp_refcount);
333 	if ((freed = refcount_release(&unp->unp_refcount))) {
334 		/* we got here with having detached? */
335 		MPASS(unp->unp_socket == NULL);
336 		UNP_PCB_UNLOCK(unp);
337 		UNP_PCB_LOCK_DESTROY(unp);
338 		uma_zfree(unp_zone, unp);
339 	}
340 	return (freed);
341 }
342 
/*
 * Lock two distinct, currently unlocked pcbs.  The pcb at the lower address
 * is always locked first, which gives every caller the same order.
 */
static void
unp_pcb_lock2(struct unpcb *unp, struct unpcb *unp2)
{
	struct unpcb *first, *second;

	MPASS(unp != unp2);
	UNP_PCB_UNLOCK_ASSERT(unp);
	UNP_PCB_UNLOCK_ASSERT(unp2);
	if ((uintptr_t)unp < (uintptr_t)unp2) {
		first = unp;
		second = unp2;
	} else {
		first = unp2;
		second = unp;
	}
	UNP_PCB_LOCK(first);
	UNP_PCB_LOCK(second);
}
357 
358 static __noinline void
359 unp_pcb_owned_lock2_slowpath(struct unpcb *unp, struct unpcb **unp2p,
360     int *freed)
361 {
362 	struct unpcb *unp2;
363 
364 	unp2 = *unp2p;
365 	unp_pcb_hold(unp2);
366 	UNP_PCB_UNLOCK(unp);
367 	UNP_PCB_LOCK(unp2);
368 	UNP_PCB_LOCK(unp);
369 	*freed = unp_pcb_rele(unp2);
370 	if (*freed)
371 		*unp2p = NULL;
372 }
373 
/*
 * Lock unp2 while the lock on unp is already held.
 *
 * A trylock is attempted first.  If it fails and unp2 lies at a higher
 * address than unp, blocking on unp2 keeps the ascending-address order.
 * Otherwise the slow path drops and re-takes the unp lock; unp2 may be freed
 * during that window, in which case freed becomes non-zero and unp2 is set
 * to NULL.
 *
 * freed and unp2 must be lvalues, and all arguments are evaluated more than
 * once.  The "break" leaves the enclosing do/while (0).
 */
374 #define unp_pcb_owned_lock2(unp, unp2, freed) do {			\
375 	freed = 0;							\
376 	UNP_PCB_LOCK_ASSERT(unp);					\
377 	UNP_PCB_UNLOCK_ASSERT(unp2);					\
378 	MPASS((unp) != (unp2));						\
379 	if (__predict_true(UNP_PCB_TRYLOCK(unp2)))			\
380 		break;							\
381 	else if ((uintptr_t)(unp2) > (uintptr_t)(unp))			\
382 		UNP_PCB_LOCK(unp2);					\
383 	else								\
384 		unp_pcb_owned_lock2_slowpath((unp), &(unp2), &freed);	\
385 } while (0)
386 
387 /*
388  * Definitions of protocols supported in the LOCAL domain.
 *
 * One protosw entry exists per socket type.  All three share
 * uipc_ctloutput() and advertise PR_RIGHTS; each has its own pr_usrreqs
 * table.
389  */
390 static struct domain localdomain;
391 static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
392 static struct pr_usrreqs uipc_usrreqs_seqpacket;
393 static struct protosw localsw[] = {
394 {
395 	.pr_type =		SOCK_STREAM,
396 	.pr_domain =		&localdomain,
397 	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
398 	.pr_ctloutput =		&uipc_ctloutput,
399 	.pr_usrreqs =		&uipc_usrreqs_stream
400 },
401 {
402 	.pr_type =		SOCK_DGRAM,
403 	.pr_domain =		&localdomain,
404 	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
405 	.pr_ctloutput =		&uipc_ctloutput,
406 	.pr_usrreqs =		&uipc_usrreqs_dgram
407 },
408 {
409 	.pr_type =		SOCK_SEQPACKET,
410 	.pr_domain =		&localdomain,
411 
412 	/*
413 	 * XXXRW: For now, PR_ADDR because soreceive will bump into them
414 	 * due to our use of sbappendaddr.  A new sbappend variant is needed
415 	 * that supports both atomic record writes and control data.
416 	 */
417 	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD|
418 				    PR_RIGHTS,
419 	.pr_ctloutput =		&uipc_ctloutput,
420 	.pr_usrreqs =		&uipc_usrreqs_seqpacket,
421 },
422 };
423 
/*
 * Domain descriptor for AF_LOCAL; dom_protoswNPROTOSW points one element
 * past the end of localsw[].
 */
424 static struct domain localdomain = {
425 	.dom_family =		AF_LOCAL,
426 	.dom_name =		"local",
427 	.dom_init =		unp_init,
428 	.dom_externalize =	unp_externalize,
429 	.dom_dispose =		unp_dispose,
430 	.dom_protosw =		localsw,
431 	.dom_protoswNPROTOSW =	&localsw[nitems(localsw)]
432 };
433 DOMAIN_SET(local);
434 
435 static void
436 uipc_abort(struct socket *so)
437 {
438 	struct unpcb *unp, *unp2;
439 
440 	unp = sotounpcb(so);
441 	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
442 	UNP_PCB_UNLOCK_ASSERT(unp);
443 
444 	UNP_PCB_LOCK(unp);
445 	unp2 = unp->unp_conn;
446 	if (unp2 != NULL) {
447 		unp_pcb_hold(unp2);
448 		UNP_PCB_UNLOCK(unp);
449 		unp_drop(unp2);
450 	} else
451 		UNP_PCB_UNLOCK(unp);
452 }
453 
454 static int
455 uipc_accept(struct socket *so, struct sockaddr **nam)
456 {
457 	struct unpcb *unp, *unp2;
458 	const struct sockaddr *sa;
459 
460 	/*
461 	 * Pass back name of connected socket, if it was bound and we are
462 	 * still connected (our peer may have closed already!).
463 	 */
464 	unp = sotounpcb(so);
465 	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
466 
467 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
468 	UNP_LINK_RLOCK();
469 	unp2 = unp->unp_conn;
470 	if (unp2 != NULL && unp2->unp_addr != NULL) {
471 		UNP_PCB_LOCK(unp2);
472 		sa = (struct sockaddr *) unp2->unp_addr;
473 		bcopy(sa, *nam, sa->sa_len);
474 		UNP_PCB_UNLOCK(unp2);
475 	} else {
476 		sa = &sun_noname;
477 		bcopy(sa, *nam, sa->sa_len);
478 	}
479 	UNP_LINK_RUNLOCK();
480 	return (0);
481 }
482 
483 static int
484 uipc_attach(struct socket *so, int proto, struct thread *td)
485 {
486 	u_long sendspace, recvspace;
487 	struct unpcb *unp;
488 	int error;
489 	bool locked;
490 
491 	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
492 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
493 		switch (so->so_type) {
494 		case SOCK_STREAM:
495 			sendspace = unpst_sendspace;
496 			recvspace = unpst_recvspace;
497 			break;
498 
499 		case SOCK_DGRAM:
500 			sendspace = unpdg_sendspace;
501 			recvspace = unpdg_recvspace;
502 			break;
503 
504 		case SOCK_SEQPACKET:
505 			sendspace = unpsp_sendspace;
506 			recvspace = unpsp_recvspace;
507 			break;
508 
509 		default:
510 			panic("uipc_attach");
511 		}
512 		error = soreserve(so, sendspace, recvspace);
513 		if (error)
514 			return (error);
515 	}
516 	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
517 	if (unp == NULL)
518 		return (ENOBUFS);
519 	LIST_INIT(&unp->unp_refs);
520 	UNP_PCB_LOCK_INIT(unp);
521 	unp->unp_socket = so;
522 	so->so_pcb = unp;
523 	unp->unp_refcount = 1;
524 	if (so->so_listen != NULL)
525 		unp->unp_flags |= UNP_NASCENT;
526 
527 	if ((locked = UNP_LINK_WOWNED()) == false)
528 		UNP_LINK_WLOCK();
529 
530 	unp->unp_gencnt = ++unp_gencnt;
531 	unp->unp_ino = ++unp_ino;
532 	unp_count++;
533 	switch (so->so_type) {
534 	case SOCK_STREAM:
535 		LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
536 		break;
537 
538 	case SOCK_DGRAM:
539 		LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
540 		break;
541 
542 	case SOCK_SEQPACKET:
543 		LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
544 		break;
545 
546 	default:
547 		panic("uipc_attach");
548 	}
549 
550 	if (locked == false)
551 		UNP_LINK_WUNLOCK();
552 
553 	return (0);
554 }
555 
/*
 * pru_bindat: bind a local socket to a path name, resolved relative to the
 * directory descriptor fd, by creating a VSOCK vnode for it.
 *
 * Returns 0 on success.  Errors produced here: EAFNOSUPPORT for a non
 * AF_UNIX address, EINVAL for a bad address length or an already bound
 * socket, EALREADY when another bind is in progress, EADDRINUSE when the
 * name already exists; errors from namei(), the MAC check, VOP_CREATE() and
 * vn_start_write() are passed through.
 */
556 static int
557 uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
558 {
559 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
560 	struct vattr vattr;
561 	int error, namelen;
562 	struct nameidata nd;
563 	struct unpcb *unp;
564 	struct vnode *vp;
565 	struct mount *mp;
566 	cap_rights_t rights;
567 	char *buf;
568 
569 	if (nam->sa_family != AF_UNIX)
570 		return (EAFNOSUPPORT);
571 
572 	unp = sotounpcb(so);
573 	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
574 
	/* namelen is the number of sun_path bytes covered by sun_len. */
575 	if (soun->sun_len > sizeof(struct sockaddr_un))
576 		return (EINVAL);
577 	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
578 	if (namelen <= 0)
579 		return (EINVAL);
580 
581 	/*
582 	 * We don't allow simultaneous bind() calls on a single UNIX domain
583 	 * socket, so flag in-progress operations, and return an error if an
584 	 * operation is already in progress.
585 	 *
586 	 * Historically, we have not allowed a socket to be rebound, so this
587 	 * also returns an error.  Not allowing re-binding simplifies the
588 	 * implementation and avoids a great many possible failure modes.
589 	 */
590 	UNP_PCB_LOCK(unp);
591 	if (unp->unp_vnode != NULL) {
592 		UNP_PCB_UNLOCK(unp);
593 		return (EINVAL);
594 	}
595 	if (unp->unp_flags & UNP_BINDING) {
596 		UNP_PCB_UNLOCK(unp);
597 		return (EALREADY);
598 	}
599 	unp->unp_flags |= UNP_BINDING;
600 	UNP_PCB_UNLOCK(unp);
601 
	/* Work on a NUL-terminated private copy of the path. */
602 	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
603 	bcopy(soun->sun_path, buf, namelen);
604 	buf[namelen] = 0;
605 
606 restart:
607 	NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE,
608 	    UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_BINDAT), td);
609 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
610 	error = namei(&nd);
611 	if (error)
612 		goto error;
613 	vp = nd.ni_vp;
	/*
	 * Either the name already exists (vp != NULL) or write access to the
	 * file system cannot be started without blocking.  Release what the
	 * lookup returned, then fail with EADDRINUSE or wait for write
	 * access and redo the lookup from scratch.
	 */
614 	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
615 		NDFREE(&nd, NDF_ONLY_PNBUF);
616 		if (nd.ni_dvp == vp)
617 			vrele(nd.ni_dvp);
618 		else
619 			vput(nd.ni_dvp);
620 		if (vp != NULL) {
621 			vrele(vp);
622 			error = EADDRINUSE;
623 			goto error;
624 		}
625 		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
626 		if (error)
627 			goto error;
628 		goto restart;
629 	}
	/* Create the socket node; the mode honours the process umask. */
630 	VATTR_NULL(&vattr);
631 	vattr.va_type = VSOCK;
632 	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
633 #ifdef MAC
634 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
635 	    &vattr);
636 #endif
	/* error is still 0 from namei() when MAC is not compiled in. */
637 	if (error == 0)
638 		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
639 	NDFREE(&nd, NDF_ONLY_PNBUF);
640 	vput(nd.ni_dvp);
641 	if (error) {
642 		vn_finished_write(mp);
643 		goto error;
644 	}
645 	vp = nd.ni_vp;
646 	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
	/* Allocate the address copy before taking the pcb mutex. */
647 	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
648 
	/* Publish the binding and clear the in-progress flag together. */
649 	UNP_PCB_LOCK(unp);
650 	VOP_UNP_BIND(vp, unp);
651 	unp->unp_vnode = vp;
652 	unp->unp_addr = soun;
653 	unp->unp_flags &= ~UNP_BINDING;
654 	UNP_PCB_UNLOCK(unp);
655 	VOP_UNLOCK(vp);
656 	vn_finished_write(mp);
657 	free(buf, M_TEMP);
658 	return (0);
659 
	/* Common failure exit: allow a later bind attempt, free the path. */
660 error:
661 	UNP_PCB_LOCK(unp);
662 	unp->unp_flags &= ~UNP_BINDING;
663 	UNP_PCB_UNLOCK(unp);
664 	free(buf, M_TEMP);
665 	return (error);
666 }
667 
668 static int
669 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
670 {
671 
672 	return (uipc_bindat(AT_FDCWD, so, nam, td));
673 }
674 
675 static int
676 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
677 {
678 	int error;
679 
680 	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
681 	error = unp_connect(so, nam, td);
682 	return (error);
683 }
684 
685 static int
686 uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
687     struct thread *td)
688 {
689 	int error;
690 
691 	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
692 	error = unp_connectat(fd, so, nam, td);
693 	return (error);
694 }
695 
/*
 * pru_close: drop the binding to the file system vnode, if any, and
 * disconnect from the peer, if any.  so_pcb is not touched here; the pcb
 * stays attached to the socket.
 */
696 static void
697 uipc_close(struct socket *so)
698 {
699 	struct unpcb *unp, *unp2;
700 	struct vnode *vp = NULL;
701 	struct mtx *vplock;
	/* Written by unp_pcb_owned_lock2() below but never examined. */
702 	int freed;
703 	unp = sotounpcb(so);
704 	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
705 
	/*
	 * The vnode pool mutex has to be taken before the pcb mutex, so
	 * unp_vnode is sampled unlocked here and re-checked once the pcb is
	 * locked.
	 */
706 	vplock = NULL;
707 	if ((vp = unp->unp_vnode) != NULL) {
708 		vplock = mtx_pool_find(mtxpool_sleep, vp);
709 		mtx_lock(vplock);
710 	}
711 	UNP_PCB_LOCK(unp);
	/* The binding vanished while we waited: nothing left to detach. */
712 	if (vp && unp->unp_vnode == NULL) {
713 		mtx_unlock(vplock);
714 		vp = NULL;
715 	}
716 	if (vp != NULL) {
717 		VOP_UNP_DETACH(vp);
718 		unp->unp_vnode = NULL;
719 	}
720 	unp2 = unp->unp_conn;
	/*
	 * Extra references are held on both pcbs across unp_disconnect().
	 * unp_pcb_rele() returns non-zero only when it freed the pcb, and it
	 * has already released the mutex in that case, hence the
	 * "== 0 then unlock" pattern below.
	 */
721 	unp_pcb_hold(unp);
722 	if (__predict_false(unp == unp2)) {
		/* Connected to itself: only one mutex is involved. */
723 		unp_disconnect(unp, unp2);
724 	} else if (unp2 != NULL) {
725 		unp_pcb_hold(unp2);
726 		unp_pcb_owned_lock2(unp, unp2, freed);
727 		unp_disconnect(unp, unp2);
728 		if (unp_pcb_rele(unp2) == 0)
729 			UNP_PCB_UNLOCK(unp2);
730 	}
731 	if (unp_pcb_rele(unp) == 0)
732 		UNP_PCB_UNLOCK(unp);
	/* Release the pool mutex, then the vnode reference of the binding. */
733 	if (vp) {
734 		mtx_unlock(vplock);
735 		vrele(vp);
736 	}
737 }
738 
739 static int
740 uipc_connect2(struct socket *so1, struct socket *so2)
741 {
742 	struct unpcb *unp, *unp2;
743 	int error;
744 
745 	unp = so1->so_pcb;
746 	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
747 	unp2 = so2->so_pcb;
748 	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
749 	if (unp != unp2)
750 		unp_pcb_lock2(unp, unp2);
751 	else
752 		UNP_PCB_LOCK(unp);
753 	error = unp_connect2(so1, so2, PRU_CONNECT2);
754 	if (unp != unp2)
755 		UNP_PCB_UNLOCK(unp2);
756 	UNP_PCB_UNLOCK(unp);
757 	return (error);
758 }
759 
760 static void
761 uipc_detach(struct socket *so)
762 {
763 	struct unpcb *unp, *unp2;
764 	struct mtx *vplock;
765 	struct sockaddr_un *saved_unp_addr;
766 	struct vnode *vp;
767 	int freeunp, local_unp_rights;
768 
769 	unp = sotounpcb(so);
770 	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
771 
772 	vp = NULL;
773 	vplock = NULL;
774 	local_unp_rights = 0;
775 
776 	UNP_LINK_WLOCK();
777 	LIST_REMOVE(unp, unp_link);
778 	if (unp->unp_gcflag & UNPGC_DEAD)
779 		LIST_REMOVE(unp, unp_dead);
780 	unp->unp_gencnt = ++unp_gencnt;
781 	--unp_count;
782 	UNP_LINK_WUNLOCK();
783 
784 	UNP_PCB_UNLOCK_ASSERT(unp);
785  restart:
786 	if ((vp = unp->unp_vnode) != NULL) {
787 		vplock = mtx_pool_find(mtxpool_sleep, vp);
788 		mtx_lock(vplock);
789 	}
790 	UNP_PCB_LOCK(unp);
791 	if (unp->unp_vnode != vp &&
792 		unp->unp_vnode != NULL) {
793 		if (vplock)
794 			mtx_unlock(vplock);
795 		UNP_PCB_UNLOCK(unp);
796 		goto restart;
797 	}
798 	if ((unp->unp_flags & UNP_NASCENT) != 0) {
799 		goto teardown;
800 	}
801 	if ((vp = unp->unp_vnode) != NULL) {
802 		VOP_UNP_DETACH(vp);
803 		unp->unp_vnode = NULL;
804 	}
805 	if (__predict_false(unp == unp->unp_conn)) {
806 		unp_disconnect(unp, unp);
807 		unp2 = NULL;
808 		goto connect_self;
809 	}
810 	if ((unp2 = unp->unp_conn) != NULL) {
811 		unp_pcb_owned_lock2(unp, unp2, freeunp);
812 		if (freeunp)
813 			unp2 = NULL;
814 	}
815 	unp_pcb_hold(unp);
816 	if (unp2 != NULL) {
817 		unp_pcb_hold(unp2);
818 		unp_disconnect(unp, unp2);
819 		if (unp_pcb_rele(unp2) == 0)
820 			UNP_PCB_UNLOCK(unp2);
821 	}
822  connect_self:
823 	UNP_PCB_UNLOCK(unp);
824 	UNP_REF_LIST_LOCK();
825 	while (!LIST_EMPTY(&unp->unp_refs)) {
826 		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
827 
828 		unp_pcb_hold(ref);
829 		UNP_REF_LIST_UNLOCK();
830 
831 		MPASS(ref != unp);
832 		UNP_PCB_UNLOCK_ASSERT(ref);
833 		unp_drop(ref);
834 		UNP_REF_LIST_LOCK();
835 	}
836 
837 	UNP_REF_LIST_UNLOCK();
838 	UNP_PCB_LOCK(unp);
839 	freeunp = unp_pcb_rele(unp);
840 	MPASS(freeunp == 0);
841 	local_unp_rights = unp_rights;
842 teardown:
843 	unp->unp_socket->so_pcb = NULL;
844 	saved_unp_addr = unp->unp_addr;
845 	unp->unp_addr = NULL;
846 	unp->unp_socket = NULL;
847 	freeunp = unp_pcb_rele(unp);
848 	if (saved_unp_addr != NULL)
849 		free(saved_unp_addr, M_SONAME);
850 	if (!freeunp)
851 		UNP_PCB_UNLOCK(unp);
852 	if (vp) {
853 		mtx_unlock(vplock);
854 		vrele(vp);
855 	}
856 	if (local_unp_rights)
857 		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
858 }
859 
860 static int
861 uipc_disconnect(struct socket *so)
862 {
863 	struct unpcb *unp, *unp2;
864 	int freed;
865 
866 	unp = sotounpcb(so);
867 	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
868 
869 	UNP_PCB_LOCK(unp);
870 	if ((unp2 = unp->unp_conn) == NULL) {
871 		UNP_PCB_UNLOCK(unp);
872 		return (0);
873 	}
874 	if (__predict_true(unp != unp2)) {
875 		unp_pcb_owned_lock2(unp, unp2, freed);
876 		if (__predict_false(freed)) {
877 			UNP_PCB_UNLOCK(unp);
878 			return (0);
879 		}
880 		unp_pcb_hold(unp2);
881 	}
882 	unp_pcb_hold(unp);
883 	unp_disconnect(unp, unp2);
884 	if (unp_pcb_rele(unp) == 0)
885 		UNP_PCB_UNLOCK(unp);
886 	if ((unp != unp2) && unp_pcb_rele(unp2) == 0)
887 		UNP_PCB_UNLOCK(unp2);
888 	return (0);
889 }
890 
891 static int
892 uipc_listen(struct socket *so, int backlog, struct thread *td)
893 {
894 	struct unpcb *unp;
895 	int error;
896 
897 	if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET)
898 		return (EOPNOTSUPP);
899 
900 	unp = sotounpcb(so);
901 	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
902 
903 	UNP_PCB_LOCK(unp);
904 	if (unp->unp_vnode == NULL) {
905 		/* Already connected or not bound to an address. */
906 		error = unp->unp_conn != NULL ? EINVAL : EDESTADDRREQ;
907 		UNP_PCB_UNLOCK(unp);
908 		return (error);
909 	}
910 
911 	SOCK_LOCK(so);
912 	error = solisten_proto_check(so);
913 	if (error == 0) {
914 		cru2xt(td, &unp->unp_peercred);
915 		solisten_proto(so, backlog);
916 	}
917 	SOCK_UNLOCK(so);
918 	UNP_PCB_UNLOCK(unp);
919 	return (error);
920 }
921 
922 static int
923 uipc_peeraddr(struct socket *so, struct sockaddr **nam)
924 {
925 	struct unpcb *unp, *unp2;
926 	const struct sockaddr *sa;
927 
928 	unp = sotounpcb(so);
929 	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
930 
931 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
932 	UNP_LINK_RLOCK();
933 	/*
934 	 * XXX: It seems that this test always fails even when connection is
935 	 * established.  So, this else clause is added as workaround to
936 	 * return PF_LOCAL sockaddr.
937 	 */
938 	unp2 = unp->unp_conn;
939 	if (unp2 != NULL) {
940 		UNP_PCB_LOCK(unp2);
941 		if (unp2->unp_addr != NULL)
942 			sa = (struct sockaddr *) unp2->unp_addr;
943 		else
944 			sa = &sun_noname;
945 		bcopy(sa, *nam, sa->sa_len);
946 		UNP_PCB_UNLOCK(unp2);
947 	} else {
948 		sa = &sun_noname;
949 		bcopy(sa, *nam, sa->sa_len);
950 	}
951 	UNP_LINK_RUNLOCK();
952 	return (0);
953 }
954 
955 static int
956 uipc_rcvd(struct socket *so, int flags)
957 {
958 	struct unpcb *unp, *unp2;
959 	struct socket *so2;
960 	u_int mbcnt, sbcc;
961 
962 	unp = sotounpcb(so);
963 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
964 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
965 	    ("%s: socktype %d", __func__, so->so_type));
966 
967 	/*
968 	 * Adjust backpressure on sender and wakeup any waiting to write.
969 	 *
970 	 * The unp lock is acquired to maintain the validity of the unp_conn
971 	 * pointer; no lock on unp2 is required as unp2->unp_socket will be
972 	 * static as long as we don't permit unp2 to disconnect from unp,
973 	 * which is prevented by the lock on unp.  We cache values from
974 	 * so_rcv to avoid holding the so_rcv lock over the entire
975 	 * transaction on the remote so_snd.
976 	 */
977 	SOCKBUF_LOCK(&so->so_rcv);
978 	mbcnt = so->so_rcv.sb_mbcnt;
979 	sbcc = sbavail(&so->so_rcv);
980 	SOCKBUF_UNLOCK(&so->so_rcv);
981 	/*
982 	 * There is a benign race condition at this point.  If we're planning to
983 	 * clear SB_STOP, but uipc_send is called on the connected socket at
984 	 * this instant, it might add data to the sockbuf and set SB_STOP.  Then
985 	 * we would erroneously clear SB_STOP below, even though the sockbuf is
986 	 * full.  The race is benign because the only ill effect is to allow the
987 	 * sockbuf to exceed its size limit, and the size limits are not
988 	 * strictly guaranteed anyway.
989 	 */
990 	UNP_PCB_LOCK(unp);
991 	unp2 = unp->unp_conn;
992 	if (unp2 == NULL) {
993 		UNP_PCB_UNLOCK(unp);
994 		return (0);
995 	}
996 	so2 = unp2->unp_socket;
997 	SOCKBUF_LOCK(&so2->so_snd);
998 	if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
999 		so2->so_snd.sb_flags &= ~SB_STOP;
1000 	sowwakeup_locked(so2);
1001 	UNP_PCB_UNLOCK(unp);
1002 	return (0);
1003 }
1004 
1005 static int
1006 connect_internal(struct socket *so, struct sockaddr *nam, struct thread *td)
1007 {
1008 	int error;
1009 	struct unpcb *unp;
1010 
1011 	unp = so->so_pcb;
1012 	if (unp->unp_conn != NULL)
1013 		return (EISCONN);
1014 	error = unp_connect(so, nam, td);
1015 	if (error)
1016 		return (error);
1017 	UNP_PCB_LOCK(unp);
1018 	if (unp->unp_conn == NULL) {
1019 		UNP_PCB_UNLOCK(unp);
1020 		if (error == 0)
1021 			error = ENOTCONN;
1022 	}
1023 	return (error);
1024 }
1025 
1026 static int
1027 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
1028     struct mbuf *control, struct thread *td)
1029 {
1030 	struct unpcb *unp, *unp2;
1031 	struct socket *so2;
1032 	u_int mbcnt, sbcc;
1033 	int freed, error;
1034 
1035 	unp = sotounpcb(so);
1036 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
1037 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM ||
1038 	    so->so_type == SOCK_SEQPACKET,
1039 	    ("%s: socktype %d", __func__, so->so_type));
1040 
1041 	freed = error = 0;
1042 	if (flags & PRUS_OOB) {
1043 		error = EOPNOTSUPP;
1044 		goto release;
1045 	}
1046 	if (control != NULL && (error = unp_internalize(&control, td)))
1047 		goto release;
1048 
1049 	unp2 = NULL;
1050 	switch (so->so_type) {
1051 	case SOCK_DGRAM:
1052 	{
1053 		const struct sockaddr *from;
1054 
1055 		if (nam != NULL) {
1056 			/*
1057 			 * We return with UNP_PCB_LOCK_HELD so we know that
1058 			 * the reference is live if the pointer is valid.
1059 			 */
1060 			if ((error = connect_internal(so, nam, td)))
1061 				break;
1062 			MPASS(unp->unp_conn != NULL);
1063 			unp2 = unp->unp_conn;
1064 		} else  {
1065 			UNP_PCB_LOCK(unp);
1066 
1067 			/*
1068 			 * Because connect() and send() are non-atomic in a sendto()
1069 			 * with a target address, it's possible that the socket will
1070 			 * have disconnected before the send() can run.  In that case
1071 			 * return the slightly counter-intuitive but otherwise
1072 			 * correct error that the socket is not connected.
1073 			 */
1074 			if ((unp2 = unp->unp_conn)  == NULL) {
1075 				UNP_PCB_UNLOCK(unp);
1076 				error = ENOTCONN;
1077 				break;
1078 			}
1079 		}
1080 		if (__predict_false(unp == unp2)) {
1081 			if (unp->unp_socket == NULL) {
1082 				error = ENOTCONN;
1083 				break;
1084 			}
1085 			goto connect_self;
1086 		}
1087 		unp_pcb_owned_lock2(unp, unp2, freed);
1088 		if (__predict_false(freed)) {
1089 			UNP_PCB_UNLOCK(unp);
1090 			error = ENOTCONN;
1091 			break;
1092 		}
1093 		/*
1094 		 * The socket referencing unp2 may have been closed
1095 		 * or unp may have been disconnected if the unp lock
1096 		 * was dropped to acquire unp2.
1097 		 */
1098 		if (__predict_false(unp->unp_conn == NULL) ||
1099 			unp2->unp_socket == NULL) {
1100 			UNP_PCB_UNLOCK(unp);
1101 			if (unp_pcb_rele(unp2) == 0)
1102 				UNP_PCB_UNLOCK(unp2);
1103 			error = ENOTCONN;
1104 			break;
1105 		}
1106 	connect_self:
1107 		if (unp2->unp_flags & UNP_WANTCRED)
1108 			control = unp_addsockcred(td, control);
1109 		if (unp->unp_addr != NULL)
1110 			from = (struct sockaddr *)unp->unp_addr;
1111 		else
1112 			from = &sun_noname;
1113 		so2 = unp2->unp_socket;
1114 		SOCKBUF_LOCK(&so2->so_rcv);
1115 		if (sbappendaddr_locked(&so2->so_rcv, from, m,
1116 		    control)) {
1117 			sorwakeup_locked(so2);
1118 			m = NULL;
1119 			control = NULL;
1120 		} else {
1121 			SOCKBUF_UNLOCK(&so2->so_rcv);
1122 			error = ENOBUFS;
1123 		}
1124 		if (nam != NULL)
1125 			unp_disconnect(unp, unp2);
1126 		if (__predict_true(unp != unp2))
1127 			UNP_PCB_UNLOCK(unp2);
1128 		UNP_PCB_UNLOCK(unp);
1129 		break;
1130 	}
1131 
1132 	case SOCK_SEQPACKET:
1133 	case SOCK_STREAM:
1134 		if ((so->so_state & SS_ISCONNECTED) == 0) {
1135 			if (nam != NULL) {
1136 				if ((error = connect_internal(so, nam, td)))
1137 					break;
1138 			} else  {
1139 				error = ENOTCONN;
1140 				break;
1141 			}
1142 		} else if ((unp2 = unp->unp_conn) == NULL) {
1143 			error = ENOTCONN;
1144 			break;
1145 		} else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1146 			error = EPIPE;
1147 			break;
1148 		} else {
1149 			UNP_PCB_LOCK(unp);
1150 			if ((unp2 = unp->unp_conn) == NULL) {
1151 				UNP_PCB_UNLOCK(unp);
1152 				error = ENOTCONN;
1153 				break;
1154 			}
1155 		}
1156 		unp_pcb_owned_lock2(unp, unp2, freed);
1157 		UNP_PCB_UNLOCK(unp);
1158 		if (__predict_false(freed)) {
1159 			error = ENOTCONN;
1160 			break;
1161 		}
1162 		if ((so2 = unp2->unp_socket) == NULL) {
1163 			UNP_PCB_UNLOCK(unp2);
1164 			error = ENOTCONN;
1165 			break;
1166 		}
1167 		SOCKBUF_LOCK(&so2->so_rcv);
1168 		if (unp2->unp_flags & UNP_WANTCRED) {
1169 			/*
1170 			 * Credentials are passed only once on SOCK_STREAM
1171 			 * and SOCK_SEQPACKET.
1172 			 */
1173 			unp2->unp_flags &= ~UNP_WANTCRED;
1174 			control = unp_addsockcred(td, control);
1175 		}
1176 
1177 		/*
1178 		 * Send to paired receive port and wake up readers.  Don't
1179 		 * check for space available in the receive buffer if we're
1180 		 * attaching ancillary data; Unix domain sockets only check
1181 		 * for space in the sending sockbuf, and that check is
1182 		 * performed one level up the stack.  At that level we cannot
1183 		 * precisely account for the amount of buffer space used
1184 		 * (e.g., because control messages are not yet internalized).
1185 		 */
1186 		switch (so->so_type) {
1187 		case SOCK_STREAM:
1188 			if (control != NULL) {
1189 				sbappendcontrol_locked(&so2->so_rcv, m,
1190 				    control);
1191 				control = NULL;
1192 			} else
1193 				sbappend_locked(&so2->so_rcv, m, flags);
1194 			break;
1195 
1196 		case SOCK_SEQPACKET: {
1197 			const struct sockaddr *from;
1198 
1199 			from = &sun_noname;
1200 			if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
1201 			    from, m, control))
1202 				control = NULL;
1203 			break;
1204 			}
1205 		}
1206 
1207 		mbcnt = so2->so_rcv.sb_mbcnt;
1208 		sbcc = sbavail(&so2->so_rcv);
1209 		if (sbcc)
1210 			sorwakeup_locked(so2);
1211 		else
1212 			SOCKBUF_UNLOCK(&so2->so_rcv);
1213 
1214 		/*
1215 		 * The PCB lock on unp2 protects the SB_STOP flag.  Without it,
1216 		 * it would be possible for uipc_rcvd to be called at this
1217 		 * point, drain the receiving sockbuf, clear SB_STOP, and then
1218 		 * we would set SB_STOP below.  That could lead to an empty
1219 		 * sockbuf having SB_STOP set
1220 		 */
1221 		SOCKBUF_LOCK(&so->so_snd);
1222 		if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
1223 			so->so_snd.sb_flags |= SB_STOP;
1224 		SOCKBUF_UNLOCK(&so->so_snd);
1225 		UNP_PCB_UNLOCK(unp2);
1226 		m = NULL;
1227 		break;
1228 	}
1229 
1230 	/*
1231 	 * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
1232 	 */
1233 	if (flags & PRUS_EOF) {
1234 		UNP_PCB_LOCK(unp);
1235 		socantsendmore(so);
1236 		unp_shutdown(unp);
1237 		UNP_PCB_UNLOCK(unp);
1238 	}
1239 	if (control != NULL && error != 0)
1240 		unp_dispose_mbuf(control);
1241 
1242 release:
1243 	if (control != NULL)
1244 		m_freem(control);
1245 	/*
1246 	 * In case of PRUS_NOTREADY, uipc_ready() is responsible
1247 	 * for freeing memory.
1248 	 */
1249 	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
1250 		m_freem(m);
1251 	return (error);
1252 }
1253 
1254 static int
1255 uipc_ready(struct socket *so, struct mbuf *m, int count)
1256 {
1257 	struct unpcb *unp, *unp2;
1258 	struct socket *so2;
1259 	int error;
1260 
1261 	unp = sotounpcb(so);
1262 
1263 	UNP_PCB_LOCK(unp);
1264 	if ((unp2 = unp->unp_conn) == NULL) {
1265 		UNP_PCB_UNLOCK(unp);
1266 		goto error;
1267 	}
1268 	if (unp != unp2) {
1269 		if (UNP_PCB_TRYLOCK(unp2) == 0) {
1270 			unp_pcb_hold(unp2);
1271 			UNP_PCB_UNLOCK(unp);
1272 			UNP_PCB_LOCK(unp2);
1273 			if (unp_pcb_rele(unp2))
1274 				goto error;
1275 		} else
1276 			UNP_PCB_UNLOCK(unp);
1277 	}
1278 	so2 = unp2->unp_socket;
1279 
1280 	SOCKBUF_LOCK(&so2->so_rcv);
1281 	if ((error = sbready(&so2->so_rcv, m, count)) == 0)
1282 		sorwakeup_locked(so2);
1283 	else
1284 		SOCKBUF_UNLOCK(&so2->so_rcv);
1285 
1286 	UNP_PCB_UNLOCK(unp2);
1287 
1288 	return (error);
1289  error:
1290 	for (int i = 0; i < count; i++)
1291 		m = m_free(m);
1292 	return (ECONNRESET);
1293 }
1294 
1295 static int
1296 uipc_sense(struct socket *so, struct stat *sb)
1297 {
1298 	struct unpcb *unp;
1299 
1300 	unp = sotounpcb(so);
1301 	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
1302 
1303 	sb->st_blksize = so->so_snd.sb_hiwat;
1304 	sb->st_dev = NODEV;
1305 	sb->st_ino = unp->unp_ino;
1306 	return (0);
1307 }
1308 
1309 static int
1310 uipc_shutdown(struct socket *so)
1311 {
1312 	struct unpcb *unp;
1313 
1314 	unp = sotounpcb(so);
1315 	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
1316 
1317 	UNP_PCB_LOCK(unp);
1318 	socantsendmore(so);
1319 	unp_shutdown(unp);
1320 	UNP_PCB_UNLOCK(unp);
1321 	return (0);
1322 }
1323 
1324 static int
1325 uipc_sockaddr(struct socket *so, struct sockaddr **nam)
1326 {
1327 	struct unpcb *unp;
1328 	const struct sockaddr *sa;
1329 
1330 	unp = sotounpcb(so);
1331 	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
1332 
1333 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
1334 	UNP_PCB_LOCK(unp);
1335 	if (unp->unp_addr != NULL)
1336 		sa = (struct sockaddr *) unp->unp_addr;
1337 	else
1338 		sa = &sun_noname;
1339 	bcopy(sa, *nam, sa->sa_len);
1340 	UNP_PCB_UNLOCK(unp);
1341 	return (0);
1342 }
1343 
1344 static struct pr_usrreqs uipc_usrreqs_dgram = {
1345 	.pru_abort = 		uipc_abort,
1346 	.pru_accept =		uipc_accept,
1347 	.pru_attach =		uipc_attach,
1348 	.pru_bind =		uipc_bind,
1349 	.pru_bindat =		uipc_bindat,
1350 	.pru_connect =		uipc_connect,
1351 	.pru_connectat =	uipc_connectat,
1352 	.pru_connect2 =		uipc_connect2,
1353 	.pru_detach =		uipc_detach,
1354 	.pru_disconnect =	uipc_disconnect,
1355 	.pru_listen =		uipc_listen,
1356 	.pru_peeraddr =		uipc_peeraddr,
1357 	.pru_rcvd =		uipc_rcvd,
1358 	.pru_send =		uipc_send,
1359 	.pru_sense =		uipc_sense,
1360 	.pru_shutdown =		uipc_shutdown,
1361 	.pru_sockaddr =		uipc_sockaddr,
1362 	.pru_soreceive =	soreceive_dgram,
1363 	.pru_close =		uipc_close,
1364 };
1365 
1366 static struct pr_usrreqs uipc_usrreqs_seqpacket = {
1367 	.pru_abort =		uipc_abort,
1368 	.pru_accept =		uipc_accept,
1369 	.pru_attach =		uipc_attach,
1370 	.pru_bind =		uipc_bind,
1371 	.pru_bindat =		uipc_bindat,
1372 	.pru_connect =		uipc_connect,
1373 	.pru_connectat =	uipc_connectat,
1374 	.pru_connect2 =		uipc_connect2,
1375 	.pru_detach =		uipc_detach,
1376 	.pru_disconnect =	uipc_disconnect,
1377 	.pru_listen =		uipc_listen,
1378 	.pru_peeraddr =		uipc_peeraddr,
1379 	.pru_rcvd =		uipc_rcvd,
1380 	.pru_send =		uipc_send,
1381 	.pru_sense =		uipc_sense,
1382 	.pru_shutdown =		uipc_shutdown,
1383 	.pru_sockaddr =		uipc_sockaddr,
1384 	.pru_soreceive =	soreceive_generic,	/* XXX: or...? */
1385 	.pru_close =		uipc_close,
1386 };
1387 
1388 static struct pr_usrreqs uipc_usrreqs_stream = {
1389 	.pru_abort = 		uipc_abort,
1390 	.pru_accept =		uipc_accept,
1391 	.pru_attach =		uipc_attach,
1392 	.pru_bind =		uipc_bind,
1393 	.pru_bindat =		uipc_bindat,
1394 	.pru_connect =		uipc_connect,
1395 	.pru_connectat =	uipc_connectat,
1396 	.pru_connect2 =		uipc_connect2,
1397 	.pru_detach =		uipc_detach,
1398 	.pru_disconnect =	uipc_disconnect,
1399 	.pru_listen =		uipc_listen,
1400 	.pru_peeraddr =		uipc_peeraddr,
1401 	.pru_rcvd =		uipc_rcvd,
1402 	.pru_send =		uipc_send,
1403 	.pru_ready =		uipc_ready,
1404 	.pru_sense =		uipc_sense,
1405 	.pru_shutdown =		uipc_shutdown,
1406 	.pru_sockaddr =		uipc_sockaddr,
1407 	.pru_soreceive =	soreceive_generic,
1408 	.pru_close =		uipc_close,
1409 };
1410 
1411 static int
1412 uipc_ctloutput(struct socket *so, struct sockopt *sopt)
1413 {
1414 	struct unpcb *unp;
1415 	struct xucred xu;
1416 	int error, optval;
1417 
1418 	if (sopt->sopt_level != 0)
1419 		return (EINVAL);
1420 
1421 	unp = sotounpcb(so);
1422 	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
1423 	error = 0;
1424 	switch (sopt->sopt_dir) {
1425 	case SOPT_GET:
1426 		switch (sopt->sopt_name) {
1427 		case LOCAL_PEERCRED:
1428 			UNP_PCB_LOCK(unp);
1429 			if (unp->unp_flags & UNP_HAVEPC)
1430 				xu = unp->unp_peercred;
1431 			else {
1432 				if (so->so_type == SOCK_STREAM)
1433 					error = ENOTCONN;
1434 				else
1435 					error = EINVAL;
1436 			}
1437 			UNP_PCB_UNLOCK(unp);
1438 			if (error == 0)
1439 				error = sooptcopyout(sopt, &xu, sizeof(xu));
1440 			break;
1441 
1442 		case LOCAL_CREDS:
1443 			/* Unlocked read. */
1444 			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
1445 			error = sooptcopyout(sopt, &optval, sizeof(optval));
1446 			break;
1447 
1448 		case LOCAL_CONNWAIT:
1449 			/* Unlocked read. */
1450 			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
1451 			error = sooptcopyout(sopt, &optval, sizeof(optval));
1452 			break;
1453 
1454 		default:
1455 			error = EOPNOTSUPP;
1456 			break;
1457 		}
1458 		break;
1459 
1460 	case SOPT_SET:
1461 		switch (sopt->sopt_name) {
1462 		case LOCAL_CREDS:
1463 		case LOCAL_CONNWAIT:
1464 			error = sooptcopyin(sopt, &optval, sizeof(optval),
1465 					    sizeof(optval));
1466 			if (error)
1467 				break;
1468 
1469 #define	OPTSET(bit) do {						\
1470 	UNP_PCB_LOCK(unp);						\
1471 	if (optval)							\
1472 		unp->unp_flags |= bit;					\
1473 	else								\
1474 		unp->unp_flags &= ~bit;					\
1475 	UNP_PCB_UNLOCK(unp);						\
1476 } while (0)
1477 
1478 			switch (sopt->sopt_name) {
1479 			case LOCAL_CREDS:
1480 				OPTSET(UNP_WANTCRED);
1481 				break;
1482 
1483 			case LOCAL_CONNWAIT:
1484 				OPTSET(UNP_CONNWAIT);
1485 				break;
1486 
1487 			default:
1488 				break;
1489 			}
1490 			break;
1491 #undef	OPTSET
1492 		default:
1493 			error = ENOPROTOOPT;
1494 			break;
1495 		}
1496 		break;
1497 
1498 	default:
1499 		error = EOPNOTSUPP;
1500 		break;
1501 	}
1502 	return (error);
1503 }
1504 
1505 static int
1506 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1507 {
1508 
1509 	return (unp_connectat(AT_FDCWD, so, nam, td));
1510 }
1511 
1512 static int
1513 unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
1514     struct thread *td)
1515 {
1516 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
1517 	struct vnode *vp;
1518 	struct socket *so2;
1519 	struct unpcb *unp, *unp2, *unp3;
1520 	struct nameidata nd;
1521 	char buf[SOCK_MAXADDRLEN];
1522 	struct sockaddr *sa;
1523 	cap_rights_t rights;
1524 	int error, len, freed;
1525 	struct mtx *vplock;
1526 
1527 	if (nam->sa_family != AF_UNIX)
1528 		return (EAFNOSUPPORT);
1529 	if (nam->sa_len > sizeof(struct sockaddr_un))
1530 		return (EINVAL);
1531 	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
1532 	if (len <= 0)
1533 		return (EINVAL);
1534 	bcopy(soun->sun_path, buf, len);
1535 	buf[len] = 0;
1536 
1537 	unp = sotounpcb(so);
1538 	UNP_PCB_LOCK(unp);
1539 	if (unp->unp_flags & UNP_CONNECTING) {
1540 		UNP_PCB_UNLOCK(unp);
1541 		return (EALREADY);
1542 	}
1543 	unp->unp_flags |= UNP_CONNECTING;
1544 	UNP_PCB_UNLOCK(unp);
1545 
1546 	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
1547 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
1548 	    UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_CONNECTAT), td);
1549 	error = namei(&nd);
1550 	if (error)
1551 		vp = NULL;
1552 	else
1553 		vp = nd.ni_vp;
1554 	ASSERT_VOP_LOCKED(vp, "unp_connect");
1555 	NDFREE(&nd, NDF_ONLY_PNBUF);
1556 	if (error)
1557 		goto bad;
1558 
1559 	if (vp->v_type != VSOCK) {
1560 		error = ENOTSOCK;
1561 		goto bad;
1562 	}
1563 #ifdef MAC
1564 	error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
1565 	if (error)
1566 		goto bad;
1567 #endif
1568 	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
1569 	if (error)
1570 		goto bad;
1571 
1572 	unp = sotounpcb(so);
1573 	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
1574 
1575 	vplock = mtx_pool_find(mtxpool_sleep, vp);
1576 	mtx_lock(vplock);
1577 	VOP_UNP_CONNECT(vp, &unp2);
1578 	if (unp2 == NULL) {
1579 		error = ECONNREFUSED;
1580 		goto bad2;
1581 	}
1582 	so2 = unp2->unp_socket;
1583 	if (so->so_type != so2->so_type) {
1584 		error = EPROTOTYPE;
1585 		goto bad2;
1586 	}
1587 	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
1588 		if (so2->so_options & SO_ACCEPTCONN) {
1589 			CURVNET_SET(so2->so_vnet);
1590 			so2 = sonewconn(so2, 0);
1591 			CURVNET_RESTORE();
1592 		} else
1593 			so2 = NULL;
1594 		if (so2 == NULL) {
1595 			error = ECONNREFUSED;
1596 			goto bad2;
1597 		}
1598 		unp3 = sotounpcb(so2);
1599 		unp_pcb_lock2(unp2, unp3);
1600 		if (unp2->unp_addr != NULL) {
1601 			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
1602 			unp3->unp_addr = (struct sockaddr_un *) sa;
1603 			sa = NULL;
1604 		}
1605 
1606 		unp_copy_peercred(td, unp3, unp, unp2);
1607 
1608 		UNP_PCB_UNLOCK(unp2);
1609 		unp2 = unp3;
1610 		unp_pcb_owned_lock2(unp2, unp, freed);
1611 		if (__predict_false(freed)) {
1612 			UNP_PCB_UNLOCK(unp2);
1613 			error = ECONNREFUSED;
1614 			goto bad2;
1615 		}
1616 #ifdef MAC
1617 		mac_socketpeer_set_from_socket(so, so2);
1618 		mac_socketpeer_set_from_socket(so2, so);
1619 #endif
1620 	} else {
1621 		if (unp == unp2)
1622 			UNP_PCB_LOCK(unp);
1623 		else
1624 			unp_pcb_lock2(unp, unp2);
1625 	}
1626 	KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
1627 	    sotounpcb(so2) == unp2,
1628 	    ("%s: unp2 %p so2 %p", __func__, unp2, so2));
1629 	error = unp_connect2(so, so2, PRU_CONNECT);
1630 	if (unp != unp2)
1631 		UNP_PCB_UNLOCK(unp2);
1632 	UNP_PCB_UNLOCK(unp);
1633 bad2:
1634 	mtx_unlock(vplock);
1635 bad:
1636 	if (vp != NULL) {
1637 		vput(vp);
1638 	}
1639 	free(sa, M_SONAME);
1640 	UNP_PCB_LOCK(unp);
1641 	unp->unp_flags &= ~UNP_CONNECTING;
1642 	UNP_PCB_UNLOCK(unp);
1643 	return (error);
1644 }
1645 
1646 /*
1647  * Set socket peer credentials at connection time.
1648  *
1649  * The client's PCB credentials are copied from its process structure.  The
1650  * server's PCB credentials are copied from the socket on which it called
1651  * listen(2).  uipc_listen cached that process's credentials at the time.
1652  */
1653 void
1654 unp_copy_peercred(struct thread *td, struct unpcb *client_unp,
1655     struct unpcb *server_unp, struct unpcb *listen_unp)
1656 {
1657 	cru2xt(td, &client_unp->unp_peercred);
1658 	client_unp->unp_flags |= UNP_HAVEPC;
1659 
1660 	memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred,
1661 	    sizeof(server_unp->unp_peercred));
1662 	server_unp->unp_flags |= UNP_HAVEPC;
1663 	if (listen_unp->unp_flags & UNP_WANTCRED)
1664 		client_unp->unp_flags |= UNP_WANTCRED;
1665 }
1666 
1667 static int
1668 unp_connect2(struct socket *so, struct socket *so2, int req)
1669 {
1670 	struct unpcb *unp;
1671 	struct unpcb *unp2;
1672 
1673 	unp = sotounpcb(so);
1674 	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
1675 	unp2 = sotounpcb(so2);
1676 	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
1677 
1678 	UNP_PCB_LOCK_ASSERT(unp);
1679 	UNP_PCB_LOCK_ASSERT(unp2);
1680 
1681 	if (so2->so_type != so->so_type)
1682 		return (EPROTOTYPE);
1683 	unp2->unp_flags &= ~UNP_NASCENT;
1684 	unp->unp_conn = unp2;
1685 	unp_pcb_hold(unp2);
1686 	unp_pcb_hold(unp);
1687 	switch (so->so_type) {
1688 	case SOCK_DGRAM:
1689 		UNP_REF_LIST_LOCK();
1690 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
1691 		UNP_REF_LIST_UNLOCK();
1692 		soisconnected(so);
1693 		break;
1694 
1695 	case SOCK_STREAM:
1696 	case SOCK_SEQPACKET:
1697 		unp2->unp_conn = unp;
1698 		if (req == PRU_CONNECT &&
1699 		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
1700 			soisconnecting(so);
1701 		else
1702 			soisconnected(so);
1703 		soisconnected(so2);
1704 		break;
1705 
1706 	default:
1707 		panic("unp_connect2");
1708 	}
1709 	return (0);
1710 }
1711 
1712 static void
1713 unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
1714 {
1715 	struct socket *so, *so2;
1716 	int freed __unused;
1717 
1718 	KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
1719 
1720 	UNP_PCB_LOCK_ASSERT(unp);
1721 	UNP_PCB_LOCK_ASSERT(unp2);
1722 
1723 	if (unp->unp_conn == NULL && unp2->unp_conn == NULL)
1724 		return;
1725 
1726 	MPASS(unp->unp_conn == unp2);
1727 	unp->unp_conn = NULL;
1728 	so = unp->unp_socket;
1729 	so2 = unp2->unp_socket;
1730 	switch (unp->unp_socket->so_type) {
1731 	case SOCK_DGRAM:
1732 		UNP_REF_LIST_LOCK();
1733 		LIST_REMOVE(unp, unp_reflink);
1734 		UNP_REF_LIST_UNLOCK();
1735 		if (so) {
1736 			SOCK_LOCK(so);
1737 			so->so_state &= ~SS_ISCONNECTED;
1738 			SOCK_UNLOCK(so);
1739 		}
1740 		break;
1741 
1742 	case SOCK_STREAM:
1743 	case SOCK_SEQPACKET:
1744 		if (so)
1745 			soisdisconnected(so);
1746 		MPASS(unp2->unp_conn == unp);
1747 		unp2->unp_conn = NULL;
1748 		if (so2)
1749 			soisdisconnected(so2);
1750 		break;
1751 	}
1752 	freed = unp_pcb_rele(unp);
1753 	MPASS(freed == 0);
1754 	freed = unp_pcb_rele(unp2);
1755 	MPASS(freed == 0);
1756 }
1757 
1758 /*
1759  * unp_pcblist() walks the global list of struct unpcb's to generate a
1760  * pointer list, bumping the refcount on each unpcb.  It then copies them out
1761  * sequentially, validating the generation number on each to see if it has
1762  * been detached.  All of this is necessary because copyout() may sleep on
1763  * disk I/O.
1764  */
1765 static int
1766 unp_pcblist(SYSCTL_HANDLER_ARGS)
1767 {
1768 	struct unpcb *unp, **unp_list;
1769 	unp_gen_t gencnt;
1770 	struct xunpgen *xug;
1771 	struct unp_head *head;
1772 	struct xunpcb *xu;
1773 	u_int i;
1774 	int error, freeunp, n;
1775 
1776 	switch ((intptr_t)arg1) {
1777 	case SOCK_STREAM:
1778 		head = &unp_shead;
1779 		break;
1780 
1781 	case SOCK_DGRAM:
1782 		head = &unp_dhead;
1783 		break;
1784 
1785 	case SOCK_SEQPACKET:
1786 		head = &unp_sphead;
1787 		break;
1788 
1789 	default:
1790 		panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
1791 	}
1792 
1793 	/*
1794 	 * The process of preparing the PCB list is too time-consuming and
1795 	 * resource-intensive to repeat twice on every request.
1796 	 */
1797 	if (req->oldptr == NULL) {
1798 		n = unp_count;
1799 		req->oldidx = 2 * (sizeof *xug)
1800 			+ (n + n/8) * sizeof(struct xunpcb);
1801 		return (0);
1802 	}
1803 
1804 	if (req->newptr != NULL)
1805 		return (EPERM);
1806 
1807 	/*
1808 	 * OK, now we're committed to doing something.
1809 	 */
1810 	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO);
1811 	UNP_LINK_RLOCK();
1812 	gencnt = unp_gencnt;
1813 	n = unp_count;
1814 	UNP_LINK_RUNLOCK();
1815 
1816 	xug->xug_len = sizeof *xug;
1817 	xug->xug_count = n;
1818 	xug->xug_gen = gencnt;
1819 	xug->xug_sogen = so_gencnt;
1820 	error = SYSCTL_OUT(req, xug, sizeof *xug);
1821 	if (error) {
1822 		free(xug, M_TEMP);
1823 		return (error);
1824 	}
1825 
1826 	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
1827 
1828 	UNP_LINK_RLOCK();
1829 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
1830 	     unp = LIST_NEXT(unp, unp_link)) {
1831 		UNP_PCB_LOCK(unp);
1832 		if (unp->unp_gencnt <= gencnt) {
1833 			if (cr_cansee(req->td->td_ucred,
1834 			    unp->unp_socket->so_cred)) {
1835 				UNP_PCB_UNLOCK(unp);
1836 				continue;
1837 			}
1838 			unp_list[i++] = unp;
1839 			unp_pcb_hold(unp);
1840 		}
1841 		UNP_PCB_UNLOCK(unp);
1842 	}
1843 	UNP_LINK_RUNLOCK();
1844 	n = i;			/* In case we lost some during malloc. */
1845 
1846 	error = 0;
1847 	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
1848 	for (i = 0; i < n; i++) {
1849 		unp = unp_list[i];
1850 		UNP_PCB_LOCK(unp);
1851 		freeunp = unp_pcb_rele(unp);
1852 
1853 		if (freeunp == 0 && unp->unp_gencnt <= gencnt) {
1854 			xu->xu_len = sizeof *xu;
1855 			xu->xu_unpp = (uintptr_t)unp;
1856 			/*
1857 			 * XXX - need more locking here to protect against
1858 			 * connect/disconnect races for SMP.
1859 			 */
1860 			if (unp->unp_addr != NULL)
1861 				bcopy(unp->unp_addr, &xu->xu_addr,
1862 				      unp->unp_addr->sun_len);
1863 			else
1864 				bzero(&xu->xu_addr, sizeof(xu->xu_addr));
1865 			if (unp->unp_conn != NULL &&
1866 			    unp->unp_conn->unp_addr != NULL)
1867 				bcopy(unp->unp_conn->unp_addr,
1868 				      &xu->xu_caddr,
1869 				      unp->unp_conn->unp_addr->sun_len);
1870 			else
1871 				bzero(&xu->xu_caddr, sizeof(xu->xu_caddr));
1872 			xu->unp_vnode = (uintptr_t)unp->unp_vnode;
1873 			xu->unp_conn = (uintptr_t)unp->unp_conn;
1874 			xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs);
1875 			xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink);
1876 			xu->unp_gencnt = unp->unp_gencnt;
1877 			sotoxsocket(unp->unp_socket, &xu->xu_socket);
1878 			UNP_PCB_UNLOCK(unp);
1879 			error = SYSCTL_OUT(req, xu, sizeof *xu);
1880 		} else  if (freeunp == 0)
1881 			UNP_PCB_UNLOCK(unp);
1882 	}
1883 	free(xu, M_TEMP);
1884 	if (!error) {
1885 		/*
1886 		 * Give the user an updated idea of our state.  If the
1887 		 * generation differs from what we told her before, she knows
1888 		 * that something happened while we were processing this
1889 		 * request, and it might be necessary to retry.
1890 		 */
1891 		xug->xug_gen = unp_gencnt;
1892 		xug->xug_sogen = so_gencnt;
1893 		xug->xug_count = unp_count;
1894 		error = SYSCTL_OUT(req, xug, sizeof *xug);
1895 	}
1896 	free(unp_list, M_TEMP);
1897 	free(xug, M_TEMP);
1898 	return (error);
1899 }
1900 
1901 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
1902     (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
1903     "List of active local datagram sockets");
1904 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
1905     (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
1906     "List of active local stream sockets");
1907 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
1908     CTLTYPE_OPAQUE | CTLFLAG_RD,
1909     (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
1910     "List of active local seqpacket sockets");
1911 
1912 static void
1913 unp_shutdown(struct unpcb *unp)
1914 {
1915 	struct unpcb *unp2;
1916 	struct socket *so;
1917 
1918 	UNP_PCB_LOCK_ASSERT(unp);
1919 
1920 	unp2 = unp->unp_conn;
1921 	if ((unp->unp_socket->so_type == SOCK_STREAM ||
1922 	    (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) {
1923 		so = unp2->unp_socket;
1924 		if (so != NULL)
1925 			socantrcvmore(so);
1926 	}
1927 }
1928 
1929 static void
1930 unp_drop(struct unpcb *unp)
1931 {
1932 	struct socket *so = unp->unp_socket;
1933 	struct unpcb *unp2;
1934 	int freed;
1935 
1936 	/*
1937 	 * Regardless of whether the socket's peer dropped the connection
1938 	 * with this socket by aborting or disconnecting, POSIX requires
1939 	 * that ECONNRESET is returned.
1940 	 */
1941 	/* acquire a reference so that unp isn't freed from underneath us */
1942 
1943 	UNP_PCB_LOCK(unp);
1944 	if (so)
1945 		so->so_error = ECONNRESET;
1946 	unp2 = unp->unp_conn;
1947 	if (unp2 == unp) {
1948 		unp_disconnect(unp, unp2);
1949 	} else if (unp2 != NULL) {
1950 		unp_pcb_hold(unp2);
1951 		unp_pcb_owned_lock2(unp, unp2, freed);
1952 		unp_disconnect(unp, unp2);
1953 		if (unp_pcb_rele(unp2) == 0)
1954 			UNP_PCB_UNLOCK(unp2);
1955 	}
1956 	if (unp_pcb_rele(unp) == 0)
1957 		UNP_PCB_UNLOCK(unp);
1958 }
1959 
1960 static void
1961 unp_freerights(struct filedescent **fdep, int fdcount)
1962 {
1963 	struct file *fp;
1964 	int i;
1965 
1966 	KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
1967 
1968 	for (i = 0; i < fdcount; i++) {
1969 		fp = fdep[i]->fde_file;
1970 		filecaps_free(&fdep[i]->fde_caps);
1971 		unp_discard(fp);
1972 	}
1973 	free(fdep[0], M_FILECAPS);
1974 }
1975 
1976 static int
1977 unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
1978 {
1979 	struct thread *td = curthread;		/* XXX */
1980 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
1981 	int i;
1982 	int *fdp;
1983 	struct filedesc *fdesc = td->td_proc->p_fd;
1984 	struct filedescent **fdep;
1985 	void *data;
1986 	socklen_t clen = control->m_len, datalen;
1987 	int error, newfds;
1988 	u_int newlen;
1989 
1990 	UNP_LINK_UNLOCK_ASSERT();
1991 
1992 	error = 0;
1993 	if (controlp != NULL) /* controlp == NULL => free control messages */
1994 		*controlp = NULL;
1995 	while (cm != NULL) {
1996 		if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
1997 			error = EINVAL;
1998 			break;
1999 		}
2000 		data = CMSG_DATA(cm);
2001 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
2002 		if (cm->cmsg_level == SOL_SOCKET
2003 		    && cm->cmsg_type == SCM_RIGHTS) {
2004 			newfds = datalen / sizeof(*fdep);
2005 			if (newfds == 0)
2006 				goto next;
2007 			fdep = data;
2008 
2009 			/* If we're not outputting the descriptors free them. */
2010 			if (error || controlp == NULL) {
2011 				unp_freerights(fdep, newfds);
2012 				goto next;
2013 			}
2014 			FILEDESC_XLOCK(fdesc);
2015 
2016 			/*
2017 			 * Now change each pointer to an fd in the global
2018 			 * table to an integer that is the index to the local
2019 			 * fd table entry that we set up to point to the
2020 			 * global one we are transferring.
2021 			 */
2022 			newlen = newfds * sizeof(int);
2023 			*controlp = sbcreatecontrol(NULL, newlen,
2024 			    SCM_RIGHTS, SOL_SOCKET);
2025 			if (*controlp == NULL) {
2026 				FILEDESC_XUNLOCK(fdesc);
2027 				error = E2BIG;
2028 				unp_freerights(fdep, newfds);
2029 				goto next;
2030 			}
2031 
2032 			fdp = (int *)
2033 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2034 			if (fdallocn(td, 0, fdp, newfds) != 0) {
2035 				FILEDESC_XUNLOCK(fdesc);
2036 				error = EMSGSIZE;
2037 				unp_freerights(fdep, newfds);
2038 				m_freem(*controlp);
2039 				*controlp = NULL;
2040 				goto next;
2041 			}
2042 			for (i = 0; i < newfds; i++, fdp++) {
2043 				_finstall(fdesc, fdep[i]->fde_file, *fdp,
2044 				    (flags & MSG_CMSG_CLOEXEC) != 0 ? UF_EXCLOSE : 0,
2045 				    &fdep[i]->fde_caps);
2046 				unp_externalize_fp(fdep[i]->fde_file);
2047 			}
2048 
2049 			/*
2050 			 * The new type indicates that the mbuf data refers to
2051 			 * kernel resources that may need to be released before
2052 			 * the mbuf is freed.
2053 			 */
2054 			m_chtype(*controlp, MT_EXTCONTROL);
2055 			FILEDESC_XUNLOCK(fdesc);
2056 			free(fdep[0], M_FILECAPS);
2057 		} else {
2058 			/* We can just copy anything else across. */
2059 			if (error || controlp == NULL)
2060 				goto next;
2061 			*controlp = sbcreatecontrol(NULL, datalen,
2062 			    cm->cmsg_type, cm->cmsg_level);
2063 			if (*controlp == NULL) {
2064 				error = ENOBUFS;
2065 				goto next;
2066 			}
2067 			bcopy(data,
2068 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
2069 			    datalen);
2070 		}
2071 		controlp = &(*controlp)->m_next;
2072 
2073 next:
2074 		if (CMSG_SPACE(datalen) < clen) {
2075 			clen -= CMSG_SPACE(datalen);
2076 			cm = (struct cmsghdr *)
2077 			    ((caddr_t)cm + CMSG_SPACE(datalen));
2078 		} else {
2079 			clen = 0;
2080 			cm = NULL;
2081 		}
2082 	}
2083 
2084 	m_freem(control);
2085 	return (error);
2086 }
2087 
2088 static void
2089 unp_zone_change(void *tag)
2090 {
2091 
2092 	uma_zone_set_max(unp_zone, maxsockets);
2093 }
2094 
2095 static void
2096 unp_init(void)
2097 {
2098 
2099 #ifdef VIMAGE
2100 	if (!IS_DEFAULT_VNET(curvnet))
2101 		return;
2102 #endif
2103 	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
2104 	    NULL, NULL, UMA_ALIGN_CACHE, 0);
2105 	if (unp_zone == NULL)
2106 		panic("unp_init");
2107 	uma_zone_set_max(unp_zone, maxsockets);
2108 	uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
2109 	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
2110 	    NULL, EVENTHANDLER_PRI_ANY);
2111 	LIST_INIT(&unp_dhead);
2112 	LIST_INIT(&unp_shead);
2113 	LIST_INIT(&unp_sphead);
2114 	SLIST_INIT(&unp_defers);
2115 	TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
2116 	TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
2117 	UNP_LINK_LOCK_INIT();
2118 	UNP_DEFERRED_LOCK_INIT();
2119 }
2120 
2121 static void
2122 unp_internalize_cleanup_rights(struct mbuf *control)
2123 {
2124 	struct cmsghdr *cp;
2125 	struct mbuf *m;
2126 	void *data;
2127 	socklen_t datalen;
2128 
2129 	for (m = control; m != NULL; m = m->m_next) {
2130 		cp = mtod(m, struct cmsghdr *);
2131 		if (cp->cmsg_level != SOL_SOCKET ||
2132 		    cp->cmsg_type != SCM_RIGHTS)
2133 			continue;
2134 		data = CMSG_DATA(cp);
2135 		datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data;
2136 		unp_freerights(data, datalen / sizeof(struct filedesc *));
2137 	}
2138 }
2139 
2140 static int
2141 unp_internalize(struct mbuf **controlp, struct thread *td)
2142 {
2143 	struct mbuf *control, **initial_controlp;
2144 	struct proc *p;
2145 	struct filedesc *fdesc;
2146 	struct bintime *bt;
2147 	struct cmsghdr *cm;
2148 	struct cmsgcred *cmcred;
2149 	struct filedescent *fde, **fdep, *fdev;
2150 	struct file *fp;
2151 	struct timeval *tv;
2152 	struct timespec *ts;
2153 	void *data;
2154 	socklen_t clen, datalen;
2155 	int i, j, error, *fdp, oldfds;
2156 	u_int newlen;
2157 
2158 	UNP_LINK_UNLOCK_ASSERT();
2159 
2160 	p = td->td_proc;
2161 	fdesc = p->p_fd;
2162 	error = 0;
2163 	control = *controlp;
2164 	clen = control->m_len;
2165 	*controlp = NULL;
2166 	initial_controlp = controlp;
2167 	for (cm = mtod(control, struct cmsghdr *); cm != NULL;) {
2168 		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
2169 		    || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) {
2170 			error = EINVAL;
2171 			goto out;
2172 		}
2173 		data = CMSG_DATA(cm);
2174 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
2175 
2176 		switch (cm->cmsg_type) {
2177 		/*
2178 		 * Fill in credential information.
2179 		 */
2180 		case SCM_CREDS:
2181 			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
2182 			    SCM_CREDS, SOL_SOCKET);
2183 			if (*controlp == NULL) {
2184 				error = ENOBUFS;
2185 				goto out;
2186 			}
2187 			cmcred = (struct cmsgcred *)
2188 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2189 			cmcred->cmcred_pid = p->p_pid;
2190 			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
2191 			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
2192 			cmcred->cmcred_euid = td->td_ucred->cr_uid;
2193 			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
2194 			    CMGROUP_MAX);
2195 			for (i = 0; i < cmcred->cmcred_ngroups; i++)
2196 				cmcred->cmcred_groups[i] =
2197 				    td->td_ucred->cr_groups[i];
2198 			break;
2199 
2200 		case SCM_RIGHTS:
2201 			oldfds = datalen / sizeof (int);
2202 			if (oldfds == 0)
2203 				break;
2204 			/*
2205 			 * Check that all the FDs passed in refer to legal
2206 			 * files.  If not, reject the entire operation.
2207 			 */
2208 			fdp = data;
2209 			FILEDESC_SLOCK(fdesc);
2210 			for (i = 0; i < oldfds; i++, fdp++) {
2211 				fp = fget_locked(fdesc, *fdp);
2212 				if (fp == NULL) {
2213 					FILEDESC_SUNLOCK(fdesc);
2214 					error = EBADF;
2215 					goto out;
2216 				}
2217 				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
2218 					FILEDESC_SUNLOCK(fdesc);
2219 					error = EOPNOTSUPP;
2220 					goto out;
2221 				}
2222 
2223 			}
2224 
2225 			/*
2226 			 * Now replace the integer FDs with pointers to the
2227 			 * file structure and capability rights.
2228 			 */
2229 			newlen = oldfds * sizeof(fdep[0]);
2230 			*controlp = sbcreatecontrol(NULL, newlen,
2231 			    SCM_RIGHTS, SOL_SOCKET);
2232 			if (*controlp == NULL) {
2233 				FILEDESC_SUNLOCK(fdesc);
2234 				error = E2BIG;
2235 				goto out;
2236 			}
2237 			fdp = data;
2238 			for (i = 0; i < oldfds; i++, fdp++) {
2239 				if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) {
2240 					fdp = data;
2241 					for (j = 0; j < i; j++, fdp++) {
2242 						fdrop(fdesc->fd_ofiles[*fdp].
2243 						    fde_file, td);
2244 					}
2245 					FILEDESC_SUNLOCK(fdesc);
2246 					error = EBADF;
2247 					goto out;
2248 				}
2249 			}
2250 			fdp = data;
2251 			fdep = (struct filedescent **)
2252 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2253 			fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
2254 			    M_WAITOK);
2255 			for (i = 0; i < oldfds; i++, fdev++, fdp++) {
2256 				fde = &fdesc->fd_ofiles[*fdp];
2257 				fdep[i] = fdev;
2258 				fdep[i]->fde_file = fde->fde_file;
2259 				filecaps_copy(&fde->fde_caps,
2260 				    &fdep[i]->fde_caps, true);
2261 				unp_internalize_fp(fdep[i]->fde_file);
2262 			}
2263 			FILEDESC_SUNLOCK(fdesc);
2264 			break;
2265 
2266 		case SCM_TIMESTAMP:
2267 			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
2268 			    SCM_TIMESTAMP, SOL_SOCKET);
2269 			if (*controlp == NULL) {
2270 				error = ENOBUFS;
2271 				goto out;
2272 			}
2273 			tv = (struct timeval *)
2274 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2275 			microtime(tv);
2276 			break;
2277 
2278 		case SCM_BINTIME:
2279 			*controlp = sbcreatecontrol(NULL, sizeof(*bt),
2280 			    SCM_BINTIME, SOL_SOCKET);
2281 			if (*controlp == NULL) {
2282 				error = ENOBUFS;
2283 				goto out;
2284 			}
2285 			bt = (struct bintime *)
2286 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2287 			bintime(bt);
2288 			break;
2289 
2290 		case SCM_REALTIME:
2291 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
2292 			    SCM_REALTIME, SOL_SOCKET);
2293 			if (*controlp == NULL) {
2294 				error = ENOBUFS;
2295 				goto out;
2296 			}
2297 			ts = (struct timespec *)
2298 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2299 			nanotime(ts);
2300 			break;
2301 
2302 		case SCM_MONOTONIC:
2303 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
2304 			    SCM_MONOTONIC, SOL_SOCKET);
2305 			if (*controlp == NULL) {
2306 				error = ENOBUFS;
2307 				goto out;
2308 			}
2309 			ts = (struct timespec *)
2310 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2311 			nanouptime(ts);
2312 			break;
2313 
2314 		default:
2315 			error = EINVAL;
2316 			goto out;
2317 		}
2318 
2319 		if (*controlp != NULL)
2320 			controlp = &(*controlp)->m_next;
2321 		if (CMSG_SPACE(datalen) < clen) {
2322 			clen -= CMSG_SPACE(datalen);
2323 			cm = (struct cmsghdr *)
2324 			    ((caddr_t)cm + CMSG_SPACE(datalen));
2325 		} else {
2326 			clen = 0;
2327 			cm = NULL;
2328 		}
2329 	}
2330 
2331 out:
2332 	if (error != 0 && initial_controlp != NULL)
2333 		unp_internalize_cleanup_rights(*initial_controlp);
2334 	m_freem(control);
2335 	return (error);
2336 }
2337 
2338 static struct mbuf *
2339 unp_addsockcred(struct thread *td, struct mbuf *control)
2340 {
2341 	struct mbuf *m, *n, *n_prev;
2342 	struct sockcred *sc;
2343 	const struct cmsghdr *cm;
2344 	int ngroups;
2345 	int i;
2346 
2347 	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
2348 	m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
2349 	if (m == NULL)
2350 		return (control);
2351 
2352 	sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
2353 	sc->sc_uid = td->td_ucred->cr_ruid;
2354 	sc->sc_euid = td->td_ucred->cr_uid;
2355 	sc->sc_gid = td->td_ucred->cr_rgid;
2356 	sc->sc_egid = td->td_ucred->cr_gid;
2357 	sc->sc_ngroups = ngroups;
2358 	for (i = 0; i < sc->sc_ngroups; i++)
2359 		sc->sc_groups[i] = td->td_ucred->cr_groups[i];
2360 
2361 	/*
2362 	 * Unlink SCM_CREDS control messages (struct cmsgcred), since just
2363 	 * created SCM_CREDS control message (struct sockcred) has another
2364 	 * format.
2365 	 */
2366 	if (control != NULL)
2367 		for (n = control, n_prev = NULL; n != NULL;) {
2368 			cm = mtod(n, struct cmsghdr *);
2369     			if (cm->cmsg_level == SOL_SOCKET &&
2370 			    cm->cmsg_type == SCM_CREDS) {
2371     				if (n_prev == NULL)
2372 					control = n->m_next;
2373 				else
2374 					n_prev->m_next = n->m_next;
2375 				n = m_free(n);
2376 			} else {
2377 				n_prev = n;
2378 				n = n->m_next;
2379 			}
2380 		}
2381 
2382 	/* Prepend it to the head. */
2383 	m->m_next = control;
2384 	return (m);
2385 }
2386 
2387 static struct unpcb *
2388 fptounp(struct file *fp)
2389 {
2390 	struct socket *so;
2391 
2392 	if (fp->f_type != DTYPE_SOCKET)
2393 		return (NULL);
2394 	if ((so = fp->f_data) == NULL)
2395 		return (NULL);
2396 	if (so->so_proto->pr_domain != &localdomain)
2397 		return (NULL);
2398 	return sotounpcb(so);
2399 }
2400 
2401 static void
2402 unp_discard(struct file *fp)
2403 {
2404 	struct unp_defer *dr;
2405 
2406 	if (unp_externalize_fp(fp)) {
2407 		dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK);
2408 		dr->ud_fp = fp;
2409 		UNP_DEFERRED_LOCK();
2410 		SLIST_INSERT_HEAD(&unp_defers, dr, ud_link);
2411 		UNP_DEFERRED_UNLOCK();
2412 		atomic_add_int(&unp_defers_count, 1);
2413 		taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
2414 	} else
2415 		(void) closef(fp, (struct thread *)NULL);
2416 }
2417 
2418 static void
2419 unp_process_defers(void *arg __unused, int pending)
2420 {
2421 	struct unp_defer *dr;
2422 	SLIST_HEAD(, unp_defer) drl;
2423 	int count;
2424 
2425 	SLIST_INIT(&drl);
2426 	for (;;) {
2427 		UNP_DEFERRED_LOCK();
2428 		if (SLIST_FIRST(&unp_defers) == NULL) {
2429 			UNP_DEFERRED_UNLOCK();
2430 			break;
2431 		}
2432 		SLIST_SWAP(&unp_defers, &drl, unp_defer);
2433 		UNP_DEFERRED_UNLOCK();
2434 		count = 0;
2435 		while ((dr = SLIST_FIRST(&drl)) != NULL) {
2436 			SLIST_REMOVE_HEAD(&drl, ud_link);
2437 			closef(dr->ud_fp, NULL);
2438 			free(dr, M_TEMP);
2439 			count++;
2440 		}
2441 		atomic_add_int(&unp_defers_count, -count);
2442 	}
2443 }
2444 
2445 static void
2446 unp_internalize_fp(struct file *fp)
2447 {
2448 	struct unpcb *unp;
2449 
2450 	UNP_LINK_WLOCK();
2451 	if ((unp = fptounp(fp)) != NULL) {
2452 		unp->unp_file = fp;
2453 		unp->unp_msgcount++;
2454 	}
2455 	unp_rights++;
2456 	UNP_LINK_WUNLOCK();
2457 }
2458 
2459 static int
2460 unp_externalize_fp(struct file *fp)
2461 {
2462 	struct unpcb *unp;
2463 	int ret;
2464 
2465 	UNP_LINK_WLOCK();
2466 	if ((unp = fptounp(fp)) != NULL) {
2467 		unp->unp_msgcount--;
2468 		ret = 1;
2469 	} else
2470 		ret = 0;
2471 	unp_rights--;
2472 	UNP_LINK_WUNLOCK();
2473 	return (ret);
2474 }
2475 
/*
 * unp_marked indicates whether additional work has been deferred for a
 * further pass of the marking loop in unp_gc().  It is only modified from
 * the gc task and does not require explicit synchronization.
 */
2481 static int	unp_marked;
2482 
2483 static void
2484 unp_remove_dead_ref(struct filedescent **fdep, int fdcount)
2485 {
2486 	struct unpcb *unp;
2487 	struct file *fp;
2488 	int i;
2489 
2490 	/*
2491 	 * This function can only be called from the gc task.
2492 	 */
2493 	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
2494 	    ("%s: not on gc callout", __func__));
2495 	UNP_LINK_LOCK_ASSERT();
2496 
2497 	for (i = 0; i < fdcount; i++) {
2498 		fp = fdep[i]->fde_file;
2499 		if ((unp = fptounp(fp)) == NULL)
2500 			continue;
2501 		if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
2502 			continue;
2503 		unp->unp_gcrefs--;
2504 	}
2505 }
2506 
2507 static void
2508 unp_restore_undead_ref(struct filedescent **fdep, int fdcount)
2509 {
2510 	struct unpcb *unp;
2511 	struct file *fp;
2512 	int i;
2513 
2514 	/*
2515 	 * This function can only be called from the gc task.
2516 	 */
2517 	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
2518 	    ("%s: not on gc callout", __func__));
2519 	UNP_LINK_LOCK_ASSERT();
2520 
2521 	for (i = 0; i < fdcount; i++) {
2522 		fp = fdep[i]->fde_file;
2523 		if ((unp = fptounp(fp)) == NULL)
2524 			continue;
2525 		if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
2526 			continue;
2527 		unp->unp_gcrefs++;
2528 		unp_marked++;
2529 	}
2530 }
2531 
2532 static void
2533 unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int))
2534 {
2535 	struct socket *so, *soa;
2536 
2537 	so = unp->unp_socket;
2538 	SOCK_LOCK(so);
2539 	if (SOLISTENING(so)) {
2540 		/*
2541 		 * Mark all sockets in our accept queue.
2542 		 */
2543 		TAILQ_FOREACH(soa, &so->sol_comp, so_list) {
2544 			if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
2545 				continue;
2546 			SOCKBUF_LOCK(&soa->so_rcv);
2547 			unp_scan(soa->so_rcv.sb_mb, op);
2548 			SOCKBUF_UNLOCK(&soa->so_rcv);
2549 		}
2550 	} else {
2551 		/*
2552 		 * Mark all sockets we reference with RIGHTS.
2553 		 */
2554 		if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
2555 			SOCKBUF_LOCK(&so->so_rcv);
2556 			unp_scan(so->so_rcv.sb_mb, op);
2557 			SOCKBUF_UNLOCK(&so->so_rcv);
2558 		}
2559 	}
2560 	SOCK_UNLOCK(so);
2561 }
2562 
/*
 * Read-only net.local statistics.
 */

/* Increased by unp_gc() by the number of files it released in a run. */
2563 static int unp_recycled;
2564 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
2565     "Number of unreachable sockets claimed by the garbage collector.");
2566 
/* Incremented once at the start of every unp_gc() run. */
2567 static int unp_taskcount;
2568 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
2569     "Number of times the garbage collector has run.");
2570 
/* unp_count is maintained outside this part of the file. */
2571 SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0,
2572     "Number of active local sockets.");
2573 
2574 static void
2575 unp_gc(__unused void *arg, int pending)
2576 {
2577 	struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
2578 				    NULL };
2579 	struct unp_head **head;
2580 	struct unp_head unp_deadhead;	/* List of potentially-dead sockets. */
2581 	struct file *f, **unref;
2582 	struct unpcb *unp, *unptmp;
2583 	int i, total, unp_unreachable;
2584 
2585 	LIST_INIT(&unp_deadhead);
2586 	unp_taskcount++;
2587 	UNP_LINK_RLOCK();
2588 	/*
2589 	 * First determine which sockets may be in cycles.
2590 	 */
2591 	unp_unreachable = 0;
2592 
2593 	for (head = heads; *head != NULL; head++)
2594 		LIST_FOREACH(unp, *head, unp_link) {
2595 
2596 			KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0,
2597 			    ("%s: unp %p has unexpected gc flags 0x%x",
2598 			    __func__, unp, (unsigned int)unp->unp_gcflag));
2599 
2600 			f = unp->unp_file;
2601 
2602 			/*
2603 			 * Check for an unreachable socket potentially in a
2604 			 * cycle.  It must be in a queue as indicated by
2605 			 * msgcount, and this must equal the file reference
2606 			 * count.  Note that when msgcount is 0 the file is
2607 			 * NULL.
2608 			 */
2609 			if (f != NULL && unp->unp_msgcount != 0 &&
2610 			    f->f_count == unp->unp_msgcount) {
2611 				LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead);
2612 				unp->unp_gcflag |= UNPGC_DEAD;
2613 				unp->unp_gcrefs = unp->unp_msgcount;
2614 				unp_unreachable++;
2615 			}
2616 		}
2617 
2618 	/*
2619 	 * Scan all sockets previously marked as potentially being in a cycle
2620 	 * and remove the references each socket holds on any UNPGC_DEAD
2621 	 * sockets in its queue.  After this step, all remaining references on
2622 	 * sockets marked UNPGC_DEAD should not be part of any cycle.
2623 	 */
2624 	LIST_FOREACH(unp, &unp_deadhead, unp_dead)
2625 		unp_gc_scan(unp, unp_remove_dead_ref);
2626 
2627 	/*
2628 	 * If a socket still has a non-negative refcount, it cannot be in a
2629 	 * cycle.  In this case increment refcount of all children iteratively.
2630 	 * Stop the scan once we do a complete loop without discovering
2631 	 * a new reachable socket.
2632 	 */
2633 	do {
2634 		unp_marked = 0;
2635 		LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp)
2636 			if (unp->unp_gcrefs > 0) {
2637 				unp->unp_gcflag &= ~UNPGC_DEAD;
2638 				LIST_REMOVE(unp, unp_dead);
2639 				KASSERT(unp_unreachable > 0,
2640 				    ("%s: unp_unreachable underflow.",
2641 				    __func__));
2642 				unp_unreachable--;
2643 				unp_gc_scan(unp, unp_restore_undead_ref);
2644 			}
2645 	} while (unp_marked);
2646 
2647 	UNP_LINK_RUNLOCK();
2648 
2649 	if (unp_unreachable == 0)
2650 		return;
2651 
2652 	/*
2653 	 * Allocate space for a local array of dead unpcbs.
2654 	 * TODO: can this path be simplified by instead using the local
2655 	 * dead list at unp_deadhead, after taking out references
2656 	 * on the file object and/or unpcb and dropping the link lock?
2657 	 */
2658 	unref = malloc(unp_unreachable * sizeof(struct file *),
2659 	    M_TEMP, M_WAITOK);
2660 
2661 	/*
2662 	 * Iterate looking for sockets which have been specifically marked
2663 	 * as unreachable and store them locally.
2664 	 */
2665 	UNP_LINK_RLOCK();
2666 	total = 0;
2667 	LIST_FOREACH(unp, &unp_deadhead, unp_dead) {
2668 		KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0,
2669 		    ("%s: unp %p not marked UNPGC_DEAD", __func__, unp));
2670 		unp->unp_gcflag &= ~UNPGC_DEAD;
2671 		f = unp->unp_file;
2672 		if (unp->unp_msgcount == 0 || f == NULL ||
2673 		    f->f_count != unp->unp_msgcount ||
2674 		    !fhold(f))
2675 			continue;
2676 		unref[total++] = f;
2677 		KASSERT(total <= unp_unreachable,
2678 		    ("%s: incorrect unreachable count.", __func__));
2679 	}
2680 	UNP_LINK_RUNLOCK();
2681 
2682 	/*
2683 	 * Now flush all sockets, free'ing rights.  This will free the
2684 	 * struct files associated with these sockets but leave each socket
2685 	 * with one remaining ref.
2686 	 */
2687 	for (i = 0; i < total; i++) {
2688 		struct socket *so;
2689 
2690 		so = unref[i]->f_data;
2691 		CURVNET_SET(so->so_vnet);
2692 		sorflush(so);
2693 		CURVNET_RESTORE();
2694 	}
2695 
2696 	/*
2697 	 * And finally release the sockets so they can be reclaimed.
2698 	 */
2699 	for (i = 0; i < total; i++)
2700 		fdrop(unref[i], NULL);
2701 	unp_recycled += total;
2702 	free(unref, M_TEMP);
2703 }
2704 
2705 static void
2706 unp_dispose_mbuf(struct mbuf *m)
2707 {
2708 
2709 	if (m)
2710 		unp_scan(m, unp_freerights);
2711 }
2712 
2713 /*
2714  * Synchronize against unp_gc, which can trip over data as we are freeing it.
2715  */
2716 static void
2717 unp_dispose(struct socket *so)
2718 {
2719 	struct unpcb *unp;
2720 
2721 	unp = sotounpcb(so);
2722 	UNP_LINK_WLOCK();
2723 	unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
2724 	UNP_LINK_WUNLOCK();
2725 	if (!SOLISTENING(so))
2726 		unp_dispose_mbuf(so->so_rcv.sb_mb);
2727 }
2728 
2729 static void
2730 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
2731 {
2732 	struct mbuf *m;
2733 	struct cmsghdr *cm;
2734 	void *data;
2735 	socklen_t clen, datalen;
2736 
2737 	while (m0 != NULL) {
2738 		for (m = m0; m; m = m->m_next) {
2739 			if (m->m_type != MT_CONTROL)
2740 				continue;
2741 
2742 			cm = mtod(m, struct cmsghdr *);
2743 			clen = m->m_len;
2744 
2745 			while (cm != NULL) {
2746 				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
2747 					break;
2748 
2749 				data = CMSG_DATA(cm);
2750 				datalen = (caddr_t)cm + cm->cmsg_len
2751 				    - (caddr_t)data;
2752 
2753 				if (cm->cmsg_level == SOL_SOCKET &&
2754 				    cm->cmsg_type == SCM_RIGHTS) {
2755 					(*op)(data, datalen /
2756 					    sizeof(struct filedescent *));
2757 				}
2758 
2759 				if (CMSG_SPACE(datalen) < clen) {
2760 					clen -= CMSG_SPACE(datalen);
2761 					cm = (struct cmsghdr *)
2762 					    ((caddr_t)cm + CMSG_SPACE(datalen));
2763 				} else {
2764 					clen = 0;
2765 					cm = NULL;
2766 				}
2767 			}
2768 		}
2769 		m0 = m0->m_nextpkt;
2770 	}
2771 }
2772 
2773 /*
2774  * A helper function called by VFS before socket-type vnode reclamation.
2775  * For an active vnode it clears unp_vnode pointer and decrements unp_vnode
2776  * use count.
2777  */
2778 void
2779 vfs_unp_reclaim(struct vnode *vp)
2780 {
2781 	struct unpcb *unp;
2782 	int active;
2783 	struct mtx *vplock;
2784 
2785 	ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
2786 	KASSERT(vp->v_type == VSOCK,
2787 	    ("vfs_unp_reclaim: vp->v_type != VSOCK"));
2788 
2789 	active = 0;
2790 	vplock = mtx_pool_find(mtxpool_sleep, vp);
2791 	mtx_lock(vplock);
2792 	VOP_UNP_CONNECT(vp, &unp);
2793 	if (unp == NULL)
2794 		goto done;
2795 	UNP_PCB_LOCK(unp);
2796 	if (unp->unp_vnode == vp) {
2797 		VOP_UNP_DETACH(vp);
2798 		unp->unp_vnode = NULL;
2799 		active = 1;
2800 	}
2801 	UNP_PCB_UNLOCK(unp);
2802  done:
2803 	mtx_unlock(vplock);
2804 	if (active)
2805 		vunref(vp);
2806 }
2807 
2808 #ifdef DDB
/*
 * Print 'indent' space characters to the debugger console.
 */
static void
db_print_indent(int indent)
{
	int remaining;

	for (remaining = indent; remaining > 0; remaining--)
		db_printf(" ");
}
2817 
2818 static void
2819 db_print_unpflags(int unp_flags)
2820 {
2821 	int comma;
2822 
2823 	comma = 0;
2824 	if (unp_flags & UNP_HAVEPC) {
2825 		db_printf("%sUNP_HAVEPC", comma ? ", " : "");
2826 		comma = 1;
2827 	}
2828 	if (unp_flags & UNP_WANTCRED) {
2829 		db_printf("%sUNP_WANTCRED", comma ? ", " : "");
2830 		comma = 1;
2831 	}
2832 	if (unp_flags & UNP_CONNWAIT) {
2833 		db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
2834 		comma = 1;
2835 	}
2836 	if (unp_flags & UNP_CONNECTING) {
2837 		db_printf("%sUNP_CONNECTING", comma ? ", " : "");
2838 		comma = 1;
2839 	}
2840 	if (unp_flags & UNP_BINDING) {
2841 		db_printf("%sUNP_BINDING", comma ? ", " : "");
2842 		comma = 1;
2843 	}
2844 }
2845 
2846 static void
2847 db_print_xucred(int indent, struct xucred *xu)
2848 {
2849 	int comma, i;
2850 
2851 	db_print_indent(indent);
2852 	db_printf("cr_version: %u   cr_uid: %u   cr_pid: %d   cr_ngroups: %d\n",
2853 	    xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups);
2854 	db_print_indent(indent);
2855 	db_printf("cr_groups: ");
2856 	comma = 0;
2857 	for (i = 0; i < xu->cr_ngroups; i++) {
2858 		db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
2859 		comma = 1;
2860 	}
2861 	db_printf("\n");
2862 }
2863 
2864 static void
2865 db_print_unprefs(int indent, struct unp_head *uh)
2866 {
2867 	struct unpcb *unp;
2868 	int counter;
2869 
2870 	counter = 0;
2871 	LIST_FOREACH(unp, uh, unp_reflink) {
2872 		if (counter % 4 == 0)
2873 			db_print_indent(indent);
2874 		db_printf("%p  ", unp);
2875 		if (counter % 4 == 3)
2876 			db_printf("\n");
2877 		counter++;
2878 	}
2879 	if (counter != 0 && counter % 4 != 0)
2880 		db_printf("\n");
2881 }
2882 
2883 DB_SHOW_COMMAND(unpcb, db_show_unpcb)
2884 {
2885 	struct unpcb *unp;
2886 
2887         if (!have_addr) {
2888                 db_printf("usage: show unpcb <addr>\n");
2889                 return;
2890         }
2891         unp = (struct unpcb *)addr;
2892 
2893 	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
2894 	    unp->unp_vnode);
2895 
2896 	db_printf("unp_ino: %ju   unp_conn: %p\n", (uintmax_t)unp->unp_ino,
2897 	    unp->unp_conn);
2898 
2899 	db_printf("unp_refs:\n");
2900 	db_print_unprefs(2, &unp->unp_refs);
2901 
2902 	/* XXXRW: Would be nice to print the full address, if any. */
2903 	db_printf("unp_addr: %p\n", unp->unp_addr);
2904 
2905 	db_printf("unp_gencnt: %llu\n",
2906 	    (unsigned long long)unp->unp_gencnt);
2907 
2908 	db_printf("unp_flags: %x (", unp->unp_flags);
2909 	db_print_unpflags(unp->unp_flags);
2910 	db_printf(")\n");
2911 
2912 	db_printf("unp_peercred:\n");
2913 	db_print_xucred(2, &unp->unp_peercred);
2914 
2915 	db_printf("unp_refcount: %u\n", unp->unp_refcount);
2916 }
2917 #endif
2918