xref: /freebsd/sys/kern/uipc_usrreq.c (revision afdb42987ca82869eeaecf6dc25c2b6fb7b8370e)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1989, 1991, 1993
5  *	The Regents of the University of California. All Rights Reserved.
6  * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
7  * Copyright (c) 2018 Matthew Macy
8  * Copyright (c) 2022 Gleb Smirnoff <glebius@FreeBSD.org>
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
35  */
36 
37 /*
38  * UNIX Domain (Local) Sockets
39  *
40  * This is an implementation of UNIX (local) domain sockets.  Each socket has
41  * an associated struct unpcb (UNIX protocol control block).  Stream sockets
42  * may be connected to 0 or 1 other socket.  Datagram sockets may be
43  * connected to 0, 1, or many other sockets.  Sockets may be created and
44  * connected in pairs (socketpair(2)), or bound/connected to using the file
45  * system name space.  For most purposes, only the receive socket buffer is
46  * used, as sending on one socket delivers directly to the receive socket
47  * buffer of a second socket.
48  *
49  * The implementation is substantially complicated by the fact that
50  * "ancillary data", such as file descriptors or credentials, may be passed
51  * across UNIX domain sockets.  The potential for passing UNIX domain sockets
52  * over other UNIX domain sockets requires the implementation of a simple
53  * garbage collector to find and tear down cycles of disconnected sockets.
54  *
55  * TODO:
56  *	RDM
57  *	rethink name space problems
58  *	need a proper out-of-band
59  */
60 
61 #include <sys/cdefs.h>
62 __FBSDID("$FreeBSD$");
63 
64 #include "opt_ddb.h"
65 
66 #include <sys/param.h>
67 #include <sys/capsicum.h>
68 #include <sys/domain.h>
69 #include <sys/eventhandler.h>
70 #include <sys/fcntl.h>
71 #include <sys/file.h>
72 #include <sys/filedesc.h>
73 #include <sys/kernel.h>
74 #include <sys/lock.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/mount.h>
78 #include <sys/mutex.h>
79 #include <sys/namei.h>
80 #include <sys/proc.h>
81 #include <sys/protosw.h>
82 #include <sys/queue.h>
83 #include <sys/resourcevar.h>
84 #include <sys/rwlock.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/signalvar.h>
88 #include <sys/stat.h>
89 #include <sys/sx.h>
90 #include <sys/sysctl.h>
91 #include <sys/systm.h>
92 #include <sys/taskqueue.h>
93 #include <sys/un.h>
94 #include <sys/unpcb.h>
95 #include <sys/vnode.h>
96 
97 #include <net/vnet.h>
98 
99 #ifdef DDB
100 #include <ddb/ddb.h>
101 #endif
102 
103 #include <security/mac/mac_framework.h>
104 
105 #include <vm/uma.h>
106 
/* Bring the M_FILECAPS malloc type into scope for this file. */
MALLOC_DECLARE(M_FILECAPS);

static struct domain localdomain;

/*
 * Global state of the local domain.  The "(l)" tag presumably marks fields
 * covered by the global linkage lock (unp_link_rwlock); what "(g)" stands
 * for is not spelled out in this part of the file -- TODO confirm.
 */
static uma_zone_t	unp_zone;	/* Zone that unpcbs are allocated from. */
static unp_gen_t	unp_gencnt;	/* (l) Bumped on attach and detach. */
static u_int		unp_count;	/* (l) Count of local sockets. */
static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
static int		unp_rights;	/* (g) File descriptors in flight. */
static struct unp_head	unp_shead;	/* (l) List of stream sockets. */
static struct unp_head	unp_dhead;	/* (l) List of datagram sockets. */
static struct unp_head	unp_sphead;	/* (l) List of seqpacket sockets. */

/*
 * One entry on the list of files whose close has been deferred; see the
 * comment on unp_defer_task.
 */
struct unp_defer {
	SLIST_ENTRY(unp_defer) ud_link;
	struct file *ud_fp;
};
static SLIST_HEAD(, unp_defer) unp_defers;
/* Exported read-only as the net.local.deferred sysctl. */
static int unp_defers_count;

/* Nameless address handed out when no bound address is available. */
static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };
128 
/*
 * Garbage collection of cyclic file descriptor/socket references occurs
 * asynchronously in a taskqueue context in order to avoid recursion and
 * reentrance in the UNIX domain socket, file descriptor, and socket layer
 * code.  See unp_gc() for a full description.
 */
static struct timeout_task unp_gc_task;

/*
 * The close of UNIX domain sockets attached as SCM_RIGHTS is
 * postponed to the taskqueue, to avoid arbitrary recursion depth.
 * The attached sockets might themselves have other sockets attached.
 */
static struct task	unp_defer_task;
143 
/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
 * stream sockets, although the total for sender and receiver is actually
 * only PIPSIZ.
 *
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should be
 * large enough for at least one max-size datagram plus address.
 *
 * Each of the variables below is exported read-write under net.local.*.
 */
#ifndef PIPSIZ
#define	PIPSIZ	8192
#endif
static u_long	unpst_sendspace = PIPSIZ;
static u_long	unpst_recvspace = PIPSIZ;
static u_long	unpdg_maxdgram = 2*1024;
static u_long	unpdg_recvspace = 16*1024;	/* support 8KB syslog msgs */
static u_long	unpsp_sendspace = PIPSIZ;	/* really max datagram size */
static u_long	unpsp_recvspace = PIPSIZ;
162 
/*
 * sysctl tree for the local domain: a net.local node with one child node per
 * socket type (stream, dgram, seqpacket).
 */
static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Local domain");
static SYSCTL_NODE(_net_local, SOCK_STREAM, stream,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_STREAM");
static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_DGRAM");
static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "SOCK_SEQPACKET");

/* Writable buffer-size knobs, one pair per socket type. */
SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
	   &unpst_sendspace, 0, "Default stream send space.");
SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpst_recvspace, 0, "Default stream receive space.");
SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
	   &unpdg_maxdgram, 0, "Maximum datagram size.");
SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpdg_recvspace, 0, "Default datagram receive space.");
/*
 * NOTE(review): the OID is named "maxseqpacket" while its description reads
 * "Default seqpacket send space."; both refer to unpsp_sendspace.
 */
SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
	   &unpsp_sendspace, 0, "Default seqpacket send space.");
SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
	   &unpsp_recvspace, 0, "Default seqpacket receive space.");
/* Read-only counters. */
SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
    "File descriptors in flight.");
SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
    &unp_defers_count, 0,
    "File descriptors deferred to taskqueue for close.");
192 
193 /*
194  * Locking and synchronization:
195  *
196  * Several types of locks exist in the local domain socket implementation:
197  * - a global linkage lock
198  * - a global connection list lock
199  * - the mtxpool lock
200  * - per-unpcb mutexes
201  *
202  * The linkage lock protects the global socket lists, the generation number
203  * counter and garbage collector state.
204  *
205  * The connection list lock protects the list of referring sockets in a datagram
206  * socket PCB.  This lock is also overloaded to protect a global list of
207  * sockets whose buffers contain socket references in the form of SCM_RIGHTS
208  * messages.  To avoid recursion, such references are released by a dedicated
209  * thread.
210  *
211  * The mtxpool lock protects the vnode from being modified while referenced.
212  * Lock ordering rules require that it be acquired before any PCB locks.
213  *
214  * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the
215  * unpcb.  This includes the unp_conn field, which either links two connected
216  * PCBs together (for connected socket types) or points at the destination
217  * socket (for connectionless socket types).  The operations of creating or
218  * destroying a connection therefore involve locking multiple PCBs.  To avoid
219  * lock order reversals, in some cases this involves dropping a PCB lock and
220  * using a reference counter to maintain liveness.
221  *
222  * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
223  * allocated in pr_attach() and freed in pr_detach().  The validity of that
224  * pointer is an invariant, so no lock is required to dereference the so_pcb
225  * pointer if a valid socket reference is held by the caller.  In practice,
226  * this is always true during operations performed on a socket.  Each unpcb
227  * has a back-pointer to its socket, unp_socket, which will be stable under
228  * the same circumstances.
229  *
230  * This pointer may only be safely dereferenced as long as a valid reference
231  * to the unpcb is held.  Typically, this reference will be from the socket,
232  * or from another unpcb when the referring unpcb's lock is held (in order
233  * that the reference not be invalidated during use).  For example, to follow
234  * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
235  * that detach is not run clearing unp_socket.
236  *
237  * Blocking with UNIX domain sockets is a tricky issue: unlike most network
238  * protocols, bind() is a non-atomic operation, and connect() requires
239  * potential sleeping in the protocol, due to potentially waiting on local or
240  * distributed file systems.  We try to separate "lookup" operations, which
241  * may sleep, and the IPC operations themselves, which typically can occur
242  * with relative atomicity as locks can be held over the entire operation.
243  *
244  * Another tricky issue is simultaneous multi-threaded or multi-process
245  * access to a single UNIX domain socket.  These are handled by the flags
246  * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
247  * binding, both of which involve dropping UNIX domain socket locks in order
248  * to perform namei() and other file system operations.
249  */
/* The global linkage lock and the lock for the deferred-close list. */
static struct rwlock	unp_link_rwlock;
static struct mtx	unp_defers_lock;

/* Wrappers around the global linkage rwlock. */
#define	UNP_LINK_LOCK_INIT()		rw_init(&unp_link_rwlock,	\
					    "unp_link_rwlock")

#define	UNP_LINK_LOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
					    RA_LOCKED)
#define	UNP_LINK_UNLOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
					    RA_UNLOCKED)

#define	UNP_LINK_RLOCK()		rw_rlock(&unp_link_rwlock)
#define	UNP_LINK_RUNLOCK()		rw_runlock(&unp_link_rwlock)
#define	UNP_LINK_WLOCK()		rw_wlock(&unp_link_rwlock)
#define	UNP_LINK_WUNLOCK()		rw_wunlock(&unp_link_rwlock)
#define	UNP_LINK_WLOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
					    RA_WLOCKED)
/* Non-zero when the calling thread holds the link lock for writing. */
#define	UNP_LINK_WOWNED()		rw_wowned(&unp_link_rwlock)

/* Wrappers around the mutex for the deferred-close list. */
#define	UNP_DEFERRED_LOCK_INIT()	mtx_init(&unp_defers_lock, \
					    "unp_defer", NULL, MTX_DEF)
#define	UNP_DEFERRED_LOCK()		mtx_lock(&unp_defers_lock)
#define	UNP_DEFERRED_UNLOCK()		mtx_unlock(&unp_defers_lock)
273 
/*
 * The per-PCB list of referring sockets (unp_refs) is protected by the same
 * mutex as the deferred-close list, so these simply alias the deferred lock.
 *
 * The expansions deliberately carry no trailing semicolon: the caller
 * supplies it, which keeps "if (c) UNP_REF_LIST_LOCK(); else ..." well
 * formed instead of expanding to a stray empty statement before the else.
 */
#define UNP_REF_LIST_LOCK()		UNP_DEFERRED_LOCK()
#define UNP_REF_LIST_UNLOCK()		UNP_DEFERRED_UNLOCK()
276 
/*
 * Wrappers around the per-PCB mutex (unp_mtx).  The mutex is initialised with
 * MTX_DUPOK, and code in this file holds two PCB mutexes at once (see
 * unp_pcb_lock_pair()).
 */
#define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
					    "unp", "unp",	\
					    MTX_DUPOK|MTX_DEF)
#define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
#define	UNP_PCB_LOCKPTR(unp)		(&(unp)->unp_mtx)
#define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
#define	UNP_PCB_TRYLOCK(unp)		mtx_trylock(&(unp)->unp_mtx)
#define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
#define	UNP_PCB_OWNED(unp)		mtx_owned(&(unp)->unp_mtx)
/* Assertions on whether the calling thread owns the PCB mutex. */
#define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
#define	UNP_PCB_UNLOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)
288 
/*
 * Forward declarations for functions that are referenced before their
 * definitions appear in this file.
 */
static int	uipc_connect2(struct socket *, struct socket *);
static int	uipc_ctloutput(struct socket *, struct sockopt *);
static int	unp_connect(struct socket *, struct sockaddr *,
		    struct thread *);
static int	unp_connectat(int, struct socket *, struct sockaddr *,
		    struct thread *, bool);
/* Tells unp_connect2() which request it is being called for. */
typedef enum { PRU_CONNECT, PRU_CONNECT2 } conn2_how;
static void	unp_connect2(struct socket *so, struct socket *so2, conn2_how);
static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
static void	unp_dispose(struct socket *so);
static void	unp_shutdown(struct unpcb *);
static void	unp_drop(struct unpcb *);
static void	unp_gc(__unused void *, int);
static void	unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
static void	unp_discard(struct file *);
static void	unp_freerights(struct filedescent **, int);
static int	unp_internalize(struct mbuf **, struct thread *,
		    struct mbuf **, u_int *, u_int *);
static void	unp_internalize_fp(struct file *);
static int	unp_externalize(struct mbuf *, struct mbuf **, int);
static int	unp_externalize_fp(struct file *);
static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *,
		    int, struct mbuf **, u_int *, u_int *);
static void	unp_process_defers(void * __unused, int);
313 
314 static void
315 unp_pcb_hold(struct unpcb *unp)
316 {
317 	u_int old __unused;
318 
319 	old = refcount_acquire(&unp->unp_refcount);
320 	KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp));
321 }
322 
323 static __result_use_check bool
324 unp_pcb_rele(struct unpcb *unp)
325 {
326 	bool ret;
327 
328 	UNP_PCB_LOCK_ASSERT(unp);
329 
330 	if ((ret = refcount_release(&unp->unp_refcount))) {
331 		UNP_PCB_UNLOCK(unp);
332 		UNP_PCB_LOCK_DESTROY(unp);
333 		uma_zfree(unp_zone, unp);
334 	}
335 	return (ret);
336 }
337 
338 static void
339 unp_pcb_rele_notlast(struct unpcb *unp)
340 {
341 	bool ret __unused;
342 
343 	ret = refcount_release(&unp->unp_refcount);
344 	KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp));
345 }
346 
/*
 * Lock two PCBs, neither of which may be held by the caller.  Distinct PCBs
 * are always locked lowest address first; if both arguments name the same
 * PCB it is locked exactly once.
 */
static void
unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2)
{
	struct unpcb *first, *second;

	UNP_PCB_UNLOCK_ASSERT(unp);
	UNP_PCB_UNLOCK_ASSERT(unp2);

	if (unp == unp2) {
		UNP_PCB_LOCK(unp);
		return;
	}

	if ((uintptr_t)unp2 > (uintptr_t)unp) {
		first = unp;
		second = unp2;
	} else {
		first = unp2;
		second = unp;
	}
	UNP_PCB_LOCK(first);
	UNP_PCB_LOCK(second);
}
363 
/*
 * Undo unp_pcb_lock_pair(): unlock both PCBs, or just the one when both
 * arguments refer to the same PCB.
 */
static void
unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2)
{

	if (unp == unp2) {
		UNP_PCB_UNLOCK(unp);
		return;
	}
	UNP_PCB_UNLOCK(unp);
	UNP_PCB_UNLOCK(unp2);
}
371 
/*
 * Try to lock the connected peer of an already locked socket.  In some cases
 * this requires that we unlock the current socket.  The pairbusy counter is
 * used to block concurrent connection attempts while the lock is dropped.  The
 * caller must be careful to revalidate PCB state.
 *
 * Returns the peer PCB, locked, or NULL when there is no peer (either
 * unp_conn was NULL on entry, or the connection went away while unp's lock
 * was dropped).  unp is locked on return in every case.  For a socket
 * connected to itself the return value is unp, which is then locked only
 * once.
 */
static struct unpcb *
unp_pcb_lock_peer(struct unpcb *unp)
{
	struct unpcb *unp2;

	UNP_PCB_LOCK_ASSERT(unp);
	unp2 = unp->unp_conn;
	if (unp2 == NULL)
		return (NULL);
	/* Self-connected: the lock the caller holds already covers the peer. */
	if (__predict_false(unp == unp2))
		return (unp);

	UNP_PCB_UNLOCK_ASSERT(unp2);

	/* Fast path: take the peer's lock without blocking. */
	if (__predict_true(UNP_PCB_TRYLOCK(unp2)))
		return (unp2);
	/*
	 * The peer sorts after unp by address, which is the same order that
	 * unp_pcb_lock_pair() uses, so block on it while keeping unp locked.
	 */
	if ((uintptr_t)unp2 > (uintptr_t)unp) {
		UNP_PCB_LOCK(unp2);
		return (unp2);
	}
	/*
	 * Wrong order: drop unp's lock and take both again, peer first.  The
	 * reference on unp2 is taken before unp is unlocked and released
	 * again below, after both locks have been reacquired.
	 */
	unp->unp_pairbusy++;
	unp_pcb_hold(unp2);
	UNP_PCB_UNLOCK(unp);

	UNP_PCB_LOCK(unp2);
	UNP_PCB_LOCK(unp);
	KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL,
	    ("%s: socket %p was reconnected", __func__, unp));
	/* Last pairbusy holder: clear UNP_WAITING and wake sleepers on unp. */
	if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) {
		unp->unp_flags &= ~UNP_WAITING;
		wakeup(unp);
	}
	if (unp_pcb_rele(unp2)) {
		/* unp2 is unlocked. */
		return (NULL);
	}
	/* Disconnected while unp was unlocked: report "no peer". */
	if (unp->unp_conn == NULL) {
		UNP_PCB_UNLOCK(unp2);
		return (NULL);
	}
	return (unp2);
}
420 
421 static void
422 uipc_abort(struct socket *so)
423 {
424 	struct unpcb *unp, *unp2;
425 
426 	unp = sotounpcb(so);
427 	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
428 	UNP_PCB_UNLOCK_ASSERT(unp);
429 
430 	UNP_PCB_LOCK(unp);
431 	unp2 = unp->unp_conn;
432 	if (unp2 != NULL) {
433 		unp_pcb_hold(unp2);
434 		UNP_PCB_UNLOCK(unp);
435 		unp_drop(unp2);
436 	} else
437 		UNP_PCB_UNLOCK(unp);
438 }
439 
440 static int
441 uipc_accept(struct socket *so, struct sockaddr **nam)
442 {
443 	struct unpcb *unp, *unp2;
444 	const struct sockaddr *sa;
445 
446 	/*
447 	 * Pass back name of connected socket, if it was bound and we are
448 	 * still connected (our peer may have closed already!).
449 	 */
450 	unp = sotounpcb(so);
451 	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
452 
453 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
454 	UNP_PCB_LOCK(unp);
455 	unp2 = unp_pcb_lock_peer(unp);
456 	if (unp2 != NULL && unp2->unp_addr != NULL)
457 		sa = (struct sockaddr *)unp2->unp_addr;
458 	else
459 		sa = &sun_noname;
460 	bcopy(sa, *nam, sa->sa_len);
461 	if (unp2 != NULL)
462 		unp_pcb_unlock_pair(unp, unp2);
463 	else
464 		UNP_PCB_UNLOCK(unp);
465 	return (0);
466 }
467 
468 static int
469 uipc_attach(struct socket *so, int proto, struct thread *td)
470 {
471 	u_long sendspace, recvspace;
472 	struct unpcb *unp;
473 	int error;
474 	bool locked;
475 
476 	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
477 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
478 		switch (so->so_type) {
479 		case SOCK_STREAM:
480 			sendspace = unpst_sendspace;
481 			recvspace = unpst_recvspace;
482 			break;
483 
484 		case SOCK_DGRAM:
485 			STAILQ_INIT(&so->so_rcv.uxdg_mb);
486 			STAILQ_INIT(&so->so_snd.uxdg_mb);
487 			TAILQ_INIT(&so->so_rcv.uxdg_conns);
488 			/*
489 			 * Since send buffer is either bypassed or is a part
490 			 * of one-to-many receive buffer, we assign both space
491 			 * limits to unpdg_recvspace.
492 			 */
493 			sendspace = recvspace = unpdg_recvspace;
494 			break;
495 
496 		case SOCK_SEQPACKET:
497 			sendspace = unpsp_sendspace;
498 			recvspace = unpsp_recvspace;
499 			break;
500 
501 		default:
502 			panic("uipc_attach");
503 		}
504 		error = soreserve(so, sendspace, recvspace);
505 		if (error)
506 			return (error);
507 	}
508 	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
509 	if (unp == NULL)
510 		return (ENOBUFS);
511 	LIST_INIT(&unp->unp_refs);
512 	UNP_PCB_LOCK_INIT(unp);
513 	unp->unp_socket = so;
514 	so->so_pcb = unp;
515 	refcount_init(&unp->unp_refcount, 1);
516 
517 	if ((locked = UNP_LINK_WOWNED()) == false)
518 		UNP_LINK_WLOCK();
519 
520 	unp->unp_gencnt = ++unp_gencnt;
521 	unp->unp_ino = ++unp_ino;
522 	unp_count++;
523 	switch (so->so_type) {
524 	case SOCK_STREAM:
525 		LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
526 		break;
527 
528 	case SOCK_DGRAM:
529 		LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
530 		break;
531 
532 	case SOCK_SEQPACKET:
533 		LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
534 		break;
535 
536 	default:
537 		panic("uipc_attach");
538 	}
539 
540 	if (locked == false)
541 		UNP_LINK_WUNLOCK();
542 
543 	return (0);
544 }
545 
/*
 * Bind a socket to a path in the file system, looked up relative to fd.
 *
 * On success a VSOCK vnode has been created at the path, and the PCB records
 * both the vnode and a private copy of the address.  UNP_BINDING is set for
 * the duration of the operation and cleared again on every exit path taken
 * after it was set.
 *
 * Returns 0, or:
 *   EAFNOSUPPORT  nam is not an AF_UNIX address
 *   EINVAL        sun_len is out of range, the path is empty, or the socket
 *                 is already bound
 *   EALREADY      another bind on this socket is in progress
 *   EADDRINUSE    the path already exists
 *   otherwise     whatever namei(), vn_start_write(), the MAC check or
 *                 VOP_CREATE() returned
 */
static int
uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	struct unpcb *unp;
	struct vnode *vp;
	struct mount *mp;
	cap_rights_t rights;
	char *buf;

	if (nam->sa_family != AF_UNIX)
		return (EAFNOSUPPORT);

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));

	/* The path length is whatever sun_len covers beyond the header. */
	if (soun->sun_len > sizeof(struct sockaddr_un))
		return (EINVAL);
	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0)
		return (EINVAL);

	/*
	 * We don't allow simultaneous bind() calls on a single UNIX domain
	 * socket, so flag in-progress operations, and return an error if an
	 * operation is already in progress.
	 *
	 * Historically, we have not allowed a socket to be rebound, so this
	 * also returns an error.  Not allowing re-binding simplifies the
	 * implementation and avoids a great many possible failure modes.
	 */
	UNP_PCB_LOCK(unp);
	if (unp->unp_vnode != NULL) {
		UNP_PCB_UNLOCK(unp);
		return (EINVAL);
	}
	if (unp->unp_flags & UNP_BINDING) {
		UNP_PCB_UNLOCK(unp);
		return (EALREADY);
	}
	unp->unp_flags |= UNP_BINDING;
	UNP_PCB_UNLOCK(unp);

	/* NUL-terminated private copy of the path for the lookup. */
	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
	bcopy(soun->sun_path, buf, namelen);
	buf[namelen] = 0;

restart:
	NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | NOCACHE,
	    UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT));
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error)
		goto error;
	vp = nd.ni_vp;
	/*
	 * Either the name already exists, or write access to the file system
	 * could not be started without waiting.  Release what the lookup
	 * returned; in the second case wait for write access and retry.
	 */
	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE_PNBUF(&nd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp != NULL) {
			vrele(vp);
			error = EADDRINUSE;
			goto error;
		}
		error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH);
		if (error)
			goto error;
		goto restart;
	}
	/* Create the socket node, with permissions filtered by the umask. */
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_pd->pd_cmask);
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
#endif
	/* Without MAC, error is still 0 here from the successful namei(). */
	if (error == 0)
		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	NDFREE_PNBUF(&nd);
	if (error) {
		VOP_VPUT_PAIR(nd.ni_dvp, NULL, true);
		vn_finished_write(mp);
		if (error == ERELOOKUP)
			goto restart;
		goto error;
	}
	vp = nd.ni_vp;
	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
	/* Copy the address before taking the PCB lock; M_WAITOK may sleep. */
	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);

	/* Publish the binding in both directions under the PCB lock. */
	UNP_PCB_LOCK(unp);
	VOP_UNP_BIND(vp, unp);
	unp->unp_vnode = vp;
	unp->unp_addr = soun;
	unp->unp_flags &= ~UNP_BINDING;
	UNP_PCB_UNLOCK(unp);
	/* Extra vnode reference, taken right after unp_vnode was set. */
	vref(vp);
	VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
	vn_finished_write(mp);
	free(buf, M_TEMP);
	return (0);

error:
	/* Common failure exit: allow a later bind attempt and free the path. */
	UNP_PCB_LOCK(unp);
	unp->unp_flags &= ~UNP_BINDING;
	UNP_PCB_UNLOCK(unp);
	free(buf, M_TEMP);
	return (error);
}
660 
661 static int
662 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
663 {
664 
665 	return (uipc_bindat(AT_FDCWD, so, nam, td));
666 }
667 
668 static int
669 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
670 {
671 	int error;
672 
673 	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
674 	error = unp_connect(so, nam, td);
675 	return (error);
676 }
677 
678 static int
679 uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
680     struct thread *td)
681 {
682 	int error;
683 
684 	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
685 	error = unp_connectat(fd, so, nam, td, false);
686 	return (error);
687 }
688 
/*
 * Close handling for a local socket: detach the PCB from the vnode it is
 * bound to (if any) and disconnect it from its peer (if any).
 *
 * The vnode's pool mutex has to be taken before the PCB lock, so unp_vnode
 * is first read without the PCB lock and then rechecked once it is held.
 */
static void
uipc_close(struct socket *so)
{
	struct unpcb *unp, *unp2;
	struct vnode *vp = NULL;
	struct mtx *vplock;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));

	vplock = NULL;
	if ((vp = unp->unp_vnode) != NULL) {
		vplock = mtx_pool_find(mtxpool_sleep, vp);
		mtx_lock(vplock);
	}
	UNP_PCB_LOCK(unp);
	/* The binding went away before we got the PCB lock: nothing to undo. */
	if (vp && unp->unp_vnode == NULL) {
		mtx_unlock(vplock);
		vp = NULL;
	}
	if (vp != NULL) {
		VOP_UNP_DETACH(vp);
		unp->unp_vnode = NULL;
	}
	/*
	 * NOTE(review): no unlock follows unp_disconnect() here, so it
	 * presumably releases both PCB locks -- confirm against its definition.
	 */
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
		unp_disconnect(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);
	/* Release the vnode only after all PCB locks have been dropped. */
	if (vp) {
		mtx_unlock(vplock);
		vrele(vp);
	}
}
722 
723 static int
724 uipc_connect2(struct socket *so1, struct socket *so2)
725 {
726 	struct unpcb *unp, *unp2;
727 
728 	if (so1->so_type != so2->so_type)
729 		return (EPROTOTYPE);
730 
731 	unp = so1->so_pcb;
732 	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
733 	unp2 = so2->so_pcb;
734 	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
735 	unp_pcb_lock_pair(unp, unp2);
736 	unp_connect2(so1, so2, PRU_CONNECT2);
737 	unp_pcb_unlock_pair(unp, unp2);
738 
739 	return (0);
740 }
741 
/*
 * Detach the PCB from a socket that is going away.
 *
 * In order: unlink the PCB from the global lists, undo any vnode binding,
 * disconnect from the peer, drop every PCB on the unp_refs list, sever the
 * socket <-> PCB pointers, free the bound address and release the socket's
 * reference on the PCB.  If descriptors were in flight, a garbage collection
 * pass is scheduled.
 */
static void
uipc_detach(struct socket *so)
{
	struct unpcb *unp, *unp2;
	struct mtx *vplock;
	struct vnode *vp;
	int local_unp_rights;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));

	vp = NULL;
	vplock = NULL;

	/*
	 * Remove the PCB from its per-type list (and from the dead list if it
	 * is flagged UNPGC_DEAD), then update generation and socket count.
	 */
	UNP_LINK_WLOCK();
	LIST_REMOVE(unp, unp_link);
	if (unp->unp_gcflag & UNPGC_DEAD)
		LIST_REMOVE(unp, unp_dead);
	unp->unp_gencnt = ++unp_gencnt;
	--unp_count;
	UNP_LINK_WUNLOCK();

	UNP_PCB_UNLOCK_ASSERT(unp);
	/*
	 * The vnode's pool mutex must be taken before the PCB lock, so read
	 * unp_vnode unlocked, lock, and start over if it changed to a
	 * different vnode in the meantime.
	 */
 restart:
	if ((vp = unp->unp_vnode) != NULL) {
		vplock = mtx_pool_find(mtxpool_sleep, vp);
		mtx_lock(vplock);
	}
	UNP_PCB_LOCK(unp);
	if (unp->unp_vnode != vp && unp->unp_vnode != NULL) {
		if (vplock)
			mtx_unlock(vplock);
		UNP_PCB_UNLOCK(unp);
		goto restart;
	}
	if ((vp = unp->unp_vnode) != NULL) {
		VOP_UNP_DETACH(vp);
		unp->unp_vnode = NULL;
	}
	/*
	 * NOTE(review): no unlock follows unp_disconnect() here, so it
	 * presumably releases both PCB locks -- confirm against its definition.
	 */
	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
		unp_disconnect(unp, unp2);
	else
		UNP_PCB_UNLOCK(unp);

	/*
	 * Drop every PCB that still refers to this one.  A reference is taken
	 * on each entry before the list lock is released.
	 * NOTE(review): the loop only terminates if unp_drop() removes ref
	 * from unp_refs -- confirm against its definition.
	 */
	UNP_REF_LIST_LOCK();
	while (!LIST_EMPTY(&unp->unp_refs)) {
		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);

		unp_pcb_hold(ref);
		UNP_REF_LIST_UNLOCK();

		MPASS(ref != unp);
		UNP_PCB_UNLOCK_ASSERT(ref);
		unp_drop(ref);
		UNP_REF_LIST_LOCK();
	}
	UNP_REF_LIST_UNLOCK();

	UNP_PCB_LOCK(unp);
	/* Sample the in-flight count now; it decides about GC further down. */
	local_unp_rights = unp_rights;
	unp->unp_socket->so_pcb = NULL;
	unp->unp_socket = NULL;
	free(unp->unp_addr, M_SONAME);
	unp->unp_addr = NULL;
	/* unp_pcb_rele() unlocks (and frees) the PCB if this was the last ref. */
	if (!unp_pcb_rele(unp))
		UNP_PCB_UNLOCK(unp);
	if (vp) {
		mtx_unlock(vplock);
		vrele(vp);
	}
	if (local_unp_rights)
		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);

	switch (so->so_type) {
	case SOCK_DGRAM:
		/*
		 * Everything should have been unlinked/freed by unp_dispose()
		 * and/or unp_disconnect().
		 */
		MPASS(so->so_rcv.uxdg_peeked == NULL);
		MPASS(STAILQ_EMPTY(&so->so_rcv.uxdg_mb));
		MPASS(TAILQ_EMPTY(&so->so_rcv.uxdg_conns));
		MPASS(STAILQ_EMPTY(&so->so_snd.uxdg_mb));
	}
}
827 
828 static int
829 uipc_disconnect(struct socket *so)
830 {
831 	struct unpcb *unp, *unp2;
832 
833 	unp = sotounpcb(so);
834 	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
835 
836 	UNP_PCB_LOCK(unp);
837 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
838 		unp_disconnect(unp, unp2);
839 	else
840 		UNP_PCB_UNLOCK(unp);
841 	return (0);
842 }
843 
844 static int
845 uipc_listen(struct socket *so, int backlog, struct thread *td)
846 {
847 	struct unpcb *unp;
848 	int error;
849 
850 	MPASS(so->so_type != SOCK_DGRAM);
851 
852 	/*
853 	 * Synchronize with concurrent connection attempts.
854 	 */
855 	error = 0;
856 	unp = sotounpcb(so);
857 	UNP_PCB_LOCK(unp);
858 	if (unp->unp_conn != NULL || (unp->unp_flags & UNP_CONNECTING) != 0)
859 		error = EINVAL;
860 	else if (unp->unp_vnode == NULL)
861 		error = EDESTADDRREQ;
862 	if (error != 0) {
863 		UNP_PCB_UNLOCK(unp);
864 		return (error);
865 	}
866 
867 	SOCK_LOCK(so);
868 	error = solisten_proto_check(so);
869 	if (error == 0) {
870 		cru2xt(td, &unp->unp_peercred);
871 		solisten_proto(so, backlog);
872 	}
873 	SOCK_UNLOCK(so);
874 	UNP_PCB_UNLOCK(unp);
875 	return (error);
876 }
877 
878 static int
879 uipc_peeraddr(struct socket *so, struct sockaddr **nam)
880 {
881 	struct unpcb *unp, *unp2;
882 	const struct sockaddr *sa;
883 
884 	unp = sotounpcb(so);
885 	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
886 
887 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
888 
889 	UNP_PCB_LOCK(unp);
890 	unp2 = unp_pcb_lock_peer(unp);
891 	if (unp2 != NULL) {
892 		if (unp2->unp_addr != NULL)
893 			sa = (struct sockaddr *) unp2->unp_addr;
894 		else
895 			sa = &sun_noname;
896 		bcopy(sa, *nam, sa->sa_len);
897 		UNP_PCB_UNLOCK(unp2);
898 	} else {
899 		sa = &sun_noname;
900 		bcopy(sa, *nam, sa->sa_len);
901 	}
902 	UNP_PCB_UNLOCK(unp);
903 	return (0);
904 }
905 
906 static int
907 uipc_rcvd(struct socket *so, int flags)
908 {
909 	struct unpcb *unp, *unp2;
910 	struct socket *so2;
911 	u_int mbcnt, sbcc;
912 
913 	unp = sotounpcb(so);
914 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
915 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
916 	    ("%s: socktype %d", __func__, so->so_type));
917 
918 	/*
919 	 * Adjust backpressure on sender and wakeup any waiting to write.
920 	 *
921 	 * The unp lock is acquired to maintain the validity of the unp_conn
922 	 * pointer; no lock on unp2 is required as unp2->unp_socket will be
923 	 * static as long as we don't permit unp2 to disconnect from unp,
924 	 * which is prevented by the lock on unp.  We cache values from
925 	 * so_rcv to avoid holding the so_rcv lock over the entire
926 	 * transaction on the remote so_snd.
927 	 */
928 	SOCKBUF_LOCK(&so->so_rcv);
929 	mbcnt = so->so_rcv.sb_mbcnt;
930 	sbcc = sbavail(&so->so_rcv);
931 	SOCKBUF_UNLOCK(&so->so_rcv);
932 	/*
933 	 * There is a benign race condition at this point.  If we're planning to
934 	 * clear SB_STOP, but uipc_send is called on the connected socket at
935 	 * this instant, it might add data to the sockbuf and set SB_STOP.  Then
936 	 * we would erroneously clear SB_STOP below, even though the sockbuf is
937 	 * full.  The race is benign because the only ill effect is to allow the
938 	 * sockbuf to exceed its size limit, and the size limits are not
939 	 * strictly guaranteed anyway.
940 	 */
941 	UNP_PCB_LOCK(unp);
942 	unp2 = unp->unp_conn;
943 	if (unp2 == NULL) {
944 		UNP_PCB_UNLOCK(unp);
945 		return (0);
946 	}
947 	so2 = unp2->unp_socket;
948 	SOCKBUF_LOCK(&so2->so_snd);
949 	if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
950 		so2->so_snd.sb_flags &= ~SB_STOP;
951 	sowwakeup_locked(so2);
952 	UNP_PCB_UNLOCK(unp);
953 	return (0);
954 }
955 
/*
 * Send method for PF_UNIX SOCK_STREAM and SOCK_SEQPACKET sockets.
 *
 * The data chain 'm' and the optional 'control' chain are appended straight
 * to the receive buffer of the connected peer socket.  After the append,
 * SB_STOP is raised on the sender's so_snd if the peer's receive buffer has
 * reached the sender's sb_hiwat or sb_mbmax.
 *
 * If the socket is not connected and 'nam' is supplied, a connection is
 * attempted first with unp_connect().
 *
 * Returns 0 on success, or EOPNOTSUPP (PRUS_OOB requested), ENOTCONN,
 * EPIPE (SBS_CANTSENDMORE set on the send buffer), or the error returned
 * by unp_internalize() or unp_connect().
 *
 * Whatever 'control' still points at on exit is freed here.  'm' is freed
 * here as well when it was not handed to the peer's buffer, unless
 * PRUS_NOTREADY is set.
 */
static int
uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
    struct mbuf *control, struct thread *td)
{
	struct unpcb *unp, *unp2;
	struct socket *so2;
	u_int mbcnt, sbcc;
	int error;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
	    ("%s: socktype %d", __func__, so->so_type));

	error = 0;
	if (flags & PRUS_OOB) {
		error = EOPNOTSUPP;
		goto release;
	}
	/*
	 * A failed internalization jumps to "release", bypassing the
	 * unp_scan() of the control chain done under "out".
	 */
	if (control != NULL &&
	    (error = unp_internalize(&control, td, NULL, NULL, NULL)))
		goto release;

	unp2 = NULL;
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if (nam != NULL) {
			if ((error = unp_connect(so, nam, td)) != 0)
				goto out;
		} else {
			error = ENOTCONN;
			goto out;
		}
	}

	UNP_PCB_LOCK(unp);
	if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) {
		UNP_PCB_UNLOCK(unp);
		error = ENOTCONN;
		goto out;
	} else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		unp_pcb_unlock_pair(unp, unp2);
		error = EPIPE;
		goto out;
	}
	/* From here on only the peer's PCB lock is held. */
	UNP_PCB_UNLOCK(unp);
	if ((so2 = unp2->unp_socket) == NULL) {
		UNP_PCB_UNLOCK(unp2);
		error = ENOTCONN;
		goto out;
	}
	SOCKBUF_LOCK(&so2->so_rcv);
	if (unp2->unp_flags & UNP_WANTCRED_MASK) {
		/*
		 * Credentials are passed only once on SOCK_STREAM and
		 * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
		 * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
		 */
		control = unp_addsockcred(td, control, unp2->unp_flags, NULL,
		    NULL, NULL);
		unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
	}

	/*
	 * Send to paired receive port and wake up readers.  Don't
	 * check for space available in the receive buffer if we're
	 * attaching ancillary data; Unix domain sockets only check
	 * for space in the sending sockbuf, and that check is
	 * performed one level up the stack.  At that level we cannot
	 * precisely account for the amount of buffer space used
	 * (e.g., because control messages are not yet internalized).
	 */
	switch (so->so_type) {
	case SOCK_STREAM:
		if (control != NULL) {
			sbappendcontrol_locked(&so2->so_rcv, m,
			    control, flags);
			control = NULL;
		} else
			sbappend_locked(&so2->so_rcv, m, flags);
		break;

	case SOCK_SEQPACKET:
		/*
		 * The control chain is treated as consumed only when the
		 * append reports success.
		 */
		if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
		    &sun_noname, m, control))
			control = NULL;
		break;
	}

	/*
	 * Snapshot the peer's receive buffer usage while its lock is still
	 * held; the values are compared against our send limits below.
	 */
	mbcnt = so2->so_rcv.sb_mbcnt;
	sbcc = sbavail(&so2->so_rcv);
	if (sbcc)
		sorwakeup_locked(so2);
	else
		SOCKBUF_UNLOCK(&so2->so_rcv);

	/*
	 * The PCB lock on unp2 protects the SB_STOP flag.  Without it,
	 * it would be possible for uipc_rcvd to be called at this
	 * point, drain the receiving sockbuf, clear SB_STOP, and then
	 * we would set SB_STOP below.  That could lead to an empty
	 * sockbuf having SB_STOP set.
	 */
	SOCKBUF_LOCK(&so->so_snd);
	if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
		so->so_snd.sb_flags |= SB_STOP;
	SOCKBUF_UNLOCK(&so->so_snd);
	UNP_PCB_UNLOCK(unp2);
	/* Forget the data chain so it is not freed under "release". */
	m = NULL;
out:
	/*
	 * PRUS_EOF is equivalent to pr_send followed by pr_shutdown.
	 */
	if (flags & PRUS_EOF) {
		UNP_PCB_LOCK(unp);
		socantsendmore(so);
		unp_shutdown(unp);
		UNP_PCB_UNLOCK(unp);
	}
	/*
	 * On failure, run unp_freerights over a control chain we still own
	 * before the chain itself is freed below.
	 */
	if (control != NULL && error != 0)
		unp_scan(control, unp_freerights);

release:
	if (control != NULL)
		m_freem(control);
	/*
	 * In case of PRUS_NOTREADY, uipc_ready() is responsible
	 * for freeing memory.
	 */
	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
		m_freem(m);
	return (error);
}
1088 
1089 /* PF_UNIX/SOCK_DGRAM version of sbspace() */
1090 static inline bool
1091 uipc_dgram_sbspace(struct sockbuf *sb, u_int cc, u_int mbcnt)
1092 {
1093 	u_int bleft, mleft;
1094 
1095 	/*
1096 	 * Negative space may happen if send(2) is followed by
1097 	 * setsockopt(SO_SNDBUF/SO_RCVBUF) that shrinks maximum.
1098 	 */
1099 	if (__predict_false(sb->sb_hiwat < sb->uxdg_cc ||
1100 	    sb->sb_mbmax < sb->uxdg_mbcnt))
1101 		return (false);
1102 
1103 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE))
1104 		return (false);
1105 
1106 	bleft = sb->sb_hiwat - sb->uxdg_cc;
1107 	mleft = sb->sb_mbmax - sb->uxdg_mbcnt;
1108 
1109 	return (bleft >= cc && mleft >= mbcnt);
1110 }
1111 
/*
 * PF_UNIX/SOCK_DGRAM send
 *
 * Allocate a record consisting of 3 mbufs in the sequence of
 * from -> control -> data and append it to the socket buffer.
 *
 * The first mbuf carries sender's name and is a pkthdr that stores
 * overall length of datagram, its memory consumption and control length.
 *
 * Exactly one of 'uio' and 'm' must be supplied.  With 'uio' the data is
 * copied into a fresh mbuf chain; with 'm' the caller's chain is used and
 * no control data 'c' is accepted.  'addr', when not NULL, names the
 * destination for this one datagram.
 *
 * Returns 0 on success, or EOPNOTSUPP (MSG_OOB), EMSGSIZE (datagram larger
 * than unpdg_maxdgram), EFAULT, ENOBUFS, EPIPE, EDESTADDRREQ, ENOTCONN, a
 * pending so_error, or an error from unp_internalize(),
 * SOCK_IO_SEND_LOCK() or unp_connectat().  On every path the mbufs that
 * were not queued are freed before returning.
 */
#define	ctllen	PH_loc.thirtytwo[1]
_Static_assert(offsetof(struct pkthdr, memlen) + sizeof(u_int) <=
    offsetof(struct pkthdr, ctllen), "unix/dgram can not store ctllen");
static int
uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *m, struct mbuf *c, int flags, struct thread *td)
{
	struct unpcb *unp, *unp2;
	const struct sockaddr *from;
	struct socket *so2;
	struct sockbuf *sb;
	struct mbuf *f, *clast;
	u_int cc, ctl, mbcnt;
	u_int dcc __diagused, dctl __diagused, dmbcnt __diagused;
	int error;

	MPASS((uio != NULL && m == NULL) || (m != NULL && uio == NULL));

	error = 0;
	f = NULL;
	ctl = 0;

	if (__predict_false(flags & MSG_OOB)) {
		error = EOPNOTSUPP;
		goto out;
	}
	if (m == NULL) {
		if (__predict_false(uio->uio_resid > unpdg_maxdgram)) {
			error = EMSGSIZE;
			goto out;
		}
		m = m_uiotombuf(uio, M_WAITOK, 0, max_hdr, M_PKTHDR);
		if (__predict_false(m == NULL)) {
			error = EFAULT;
			goto out;
		}
		f = m_gethdr(M_WAITOK, MT_SONAME);
		cc = m->m_pkthdr.len;
		/* One MSIZE for the "from" mbuf plus the data chain. */
		mbcnt = MSIZE + m->m_pkthdr.memlen;
		if (c != NULL &&
		    (error = unp_internalize(&c, td, &clast, &ctl, &mbcnt)))
			goto out;
	} else {
		/* pr_sosend() with mbuf usually is a kernel thread. */

		M_ASSERTPKTHDR(m);
		if (__predict_false(c != NULL))
			panic("%s: control from a kernel thread", __func__);

		if (__predict_false(m->m_pkthdr.len > unpdg_maxdgram)) {
			error = EMSGSIZE;
			goto out;
		}
		if ((f = m_gethdr(M_NOWAIT, MT_SONAME)) == NULL) {
			error = ENOBUFS;
			goto out;
		}
		/* Condition the foreign mbuf to our standards. */
		m_clrprotoflags(m);
		m_tag_delete_chain(m, NULL);
		m->m_pkthdr.rcvif = NULL;
		m->m_pkthdr.flowid = 0;
		m->m_pkthdr.csum_flags = 0;
		m->m_pkthdr.fibnum = 0;
		m->m_pkthdr.rsstype = 0;

		cc = m->m_pkthdr.len;
		/*
		 * Compute memory consumption by hand: MSIZE for the "from"
		 * mbuf, then MSIZE plus any external storage per data mbuf.
		 */
		mbcnt = MSIZE;
		for (struct mbuf *mb = m; mb != NULL; mb = mb->m_next) {
			mbcnt += MSIZE;
			if (mb->m_flags & M_EXT)
				mbcnt += mb->m_ext.ext_size;
		}
	}

	unp = sotounpcb(so);
	MPASS(unp);

	/*
	 * XXXGL: would be cool to fully remove so_snd out of the equation
	 * and avoid this lock, which is not only extraneous, but also being
	 * released, thus still leaving possibility for a race.  We can easily
	 * handle SBS_CANTSENDMORE/SS_ISCONNECTED complement in unpcb, but it
	 * is more difficult to invent something to handle so_error.
	 */
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out2;
	SOCK_SENDBUF_LOCK(so);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCK_SENDBUF_UNLOCK(so);
		error = EPIPE;
		goto out3;
	}
	/* A pending socket error is reported once and then cleared. */
	if (so->so_error != 0) {
		error = so->so_error;
		so->so_error = 0;
		SOCK_SENDBUF_UNLOCK(so);
		goto out3;
	}
	if (((so->so_state & SS_ISCONNECTED) == 0) && addr == NULL) {
		SOCK_SENDBUF_UNLOCK(so);
		error = EDESTADDRREQ;
		goto out3;
	}
	SOCK_SENDBUF_UNLOCK(so);

	if (addr != NULL) {
		/*
		 * return_locked == true: on success both PCBs are expected
		 * to be locked, which the assertions below check.
		 */
		if ((error = unp_connectat(AT_FDCWD, so, addr, td, true)))
			goto out3;
		UNP_PCB_LOCK_ASSERT(unp);
		unp2 = unp->unp_conn;
		UNP_PCB_LOCK_ASSERT(unp2);
	} else {
		UNP_PCB_LOCK(unp);
		unp2 = unp_pcb_lock_peer(unp);
		if (unp2 == NULL) {
			UNP_PCB_UNLOCK(unp);
			error = ENOTCONN;
			goto out3;
		}
	}

	if (unp2->unp_flags & UNP_WANTCRED_MASK)
		c = unp_addsockcred(td, c, unp2->unp_flags, &clast, &ctl,
		    &mbcnt);
	/* Fill the "from" mbuf with our bound address, if we have one. */
	if (unp->unp_addr != NULL)
		from = (struct sockaddr *)unp->unp_addr;
	else
		from = &sun_noname;
	f->m_len = from->sa_len;
	MPASS(from->sa_len <= MLEN);
	bcopy(from, mtod(f, void *), from->sa_len);
	/* The sender's name is accounted as control length. */
	ctl += f->m_len;

	/*
	 * Concatenate mbufs: from -> control -> data.
	 * Save overall cc and mbcnt in "from" mbuf.
	 */
	if (c != NULL) {
#ifdef INVARIANTS
		struct mbuf *mc;

		for (mc = c; mc->m_next != NULL; mc = mc->m_next);
		MPASS(mc == clast);
#endif
		f->m_next = c;
		clast->m_next = m;
		c = NULL;
	} else
		f->m_next = m;
	/* The whole record is now reachable through 'f' only. */
	m = NULL;
#ifdef INVARIANTS
	/* Recount the assembled record and cross-check the running totals. */
	dcc = dctl = dmbcnt = 0;
	for (struct mbuf *mb = f; mb != NULL; mb = mb->m_next) {
		if (mb->m_type == MT_DATA)
			dcc += mb->m_len;
		else
			dctl += mb->m_len;
		dmbcnt += MSIZE;
		if (mb->m_flags & M_EXT)
			dmbcnt += mb->m_ext.ext_size;
	}
	MPASS(dcc == cc);
	MPASS(dctl == ctl);
	MPASS(dmbcnt == mbcnt);
#endif
	f->m_pkthdr.len = cc + ctl;
	f->m_pkthdr.memlen = mbcnt;
	f->m_pkthdr.ctllen = ctl;

	/*
	 * Destination socket buffer selection.
	 *
	 * Unconnected sends, when !(so->so_state & SS_ISCONNECTED) and the
	 * destination address is supplied, create a temporary connection for
	 * the run time of the function (see call to unp_connectat() above and
	 * to unp_disconnect() below).  We distinguish them by condition of
	 * (addr != NULL).  We intentionally avoid adding 'bool connected' for
	 * that condition, since, again, through the run time of this code we
	 * are always connected.  For such "unconnected" sends, the destination
	 * buffer would be the receive buffer of destination socket so2.
	 *
	 * For connected sends, data lands on the send buffer of the sender's
	 * socket "so".  Then, if we just added the very first datagram
	 * on this send buffer, we need to add the send buffer on to the
	 * receiving socket's buffer list.  We put ourselves on top of the
	 * list.  Such logic gives infrequent senders priority over frequent
	 * senders.
	 *
	 * Note on byte count management. As long as event methods kevent(2),
	 * select(2) are not protocol specific (yet), we need to maintain
	 * meaningful values on the receive buffer.  So, the receive buffer
	 * would accumulate counters from all connected buffers potentially
	 * having sb_ccc > sb_hiwat or sb_mbcnt > sb_mbmax.
	 */
	so2 = unp2->unp_socket;
	sb = (addr == NULL) ? &so->so_snd : &so2->so_rcv;
	SOCK_RECVBUF_LOCK(so2);
	if (uipc_dgram_sbspace(sb, cc + ctl, mbcnt)) {
		if (addr == NULL && STAILQ_EMPTY(&sb->uxdg_mb))
			TAILQ_INSERT_HEAD(&so2->so_rcv.uxdg_conns, &so->so_snd,
			    uxdg_clist);
		STAILQ_INSERT_TAIL(&sb->uxdg_mb, f, m_stailqpkt);
		sb->uxdg_cc += cc + ctl;
		sb->uxdg_ctl += ctl;
		sb->uxdg_mbcnt += mbcnt;
		so2->so_rcv.sb_acc += cc + ctl;
		so2->so_rcv.sb_ccc += cc + ctl;
		so2->so_rcv.sb_ctl += ctl;
		so2->so_rcv.sb_mbcnt += mbcnt;
		sorwakeup_locked(so2);
		/* The record is queued; do not free it under "out". */
		f = NULL;
	} else {
		soroverflow_locked(so2);
		error = ENOBUFS;
		/*
		 * The record stays with us and is freed under "out"; run
		 * unp_freerights over its control part first.
		 */
		if (f->m_next->m_type == MT_CONTROL)
			unp_scan(f->m_next, unp_freerights);
	}

	/* Tear down the temporary connection of an addressed send. */
	if (addr != NULL)
		unp_disconnect(unp, unp2);
	else
		unp_pcb_unlock_pair(unp, unp2);

	td->td_ru.ru_msgsnd++;

out3:
	SOCK_IO_SEND_UNLOCK(so);
out2:
	/*
	 * A control chain still held in 'c' gets unp_freerights applied
	 * before it is freed below.
	 */
	if (c)
		unp_scan(c, unp_freerights);
out:
	if (f)
		m_freem(f);
	if (c)
		m_freem(c);
	if (m)
		m_freem(m);

	return (error);
}
1363 
1364 /*
1365  * PF_UNIX/SOCK_DGRAM receive with MSG_PEEK.
1366  * The mbuf has already been unlinked from the uxdg_mb of socket buffer
1367  * and needs to be linked onto uxdg_peeked of receive socket buffer.
1368  */
1369 static int
1370 uipc_peek_dgram(struct socket *so, struct mbuf *m, struct sockaddr **psa,
1371     struct uio *uio, struct mbuf **controlp, int *flagsp)
1372 {
1373 	ssize_t len = 0;
1374 	int error;
1375 
1376 	so->so_rcv.uxdg_peeked = m;
1377 	so->so_rcv.uxdg_cc += m->m_pkthdr.len;
1378 	so->so_rcv.uxdg_ctl += m->m_pkthdr.ctllen;
1379 	so->so_rcv.uxdg_mbcnt += m->m_pkthdr.memlen;
1380 	SOCK_RECVBUF_UNLOCK(so);
1381 
1382 	KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
1383 	if (psa != NULL)
1384 		*psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
1385 
1386 	m = m->m_next;
1387 	KASSERT(m, ("%s: no data or control after soname", __func__));
1388 
1389 	/*
1390 	 * With MSG_PEEK the control isn't executed, just copied.
1391 	 */
1392 	while (m != NULL && m->m_type == MT_CONTROL) {
1393 		if (controlp != NULL) {
1394 			*controlp = m_copym(m, 0, m->m_len, M_WAITOK);
1395 			controlp = &(*controlp)->m_next;
1396 		}
1397 		m = m->m_next;
1398 	}
1399 	KASSERT(m == NULL || m->m_type == MT_DATA,
1400 	    ("%s: not MT_DATA mbuf %p", __func__, m));
1401 	while (m != NULL && uio->uio_resid > 0) {
1402 		len = uio->uio_resid;
1403 		if (len > m->m_len)
1404 			len = m->m_len;
1405 		error = uiomove(mtod(m, char *), (int)len, uio);
1406 		if (error) {
1407 			SOCK_IO_RECV_UNLOCK(so);
1408 			return (error);
1409 		}
1410 		if (len == m->m_len)
1411 			m = m->m_next;
1412 	}
1413 	SOCK_IO_RECV_UNLOCK(so);
1414 
1415 	if (flagsp != NULL) {
1416 		if (m != NULL) {
1417 			if (*flagsp & MSG_TRUNC) {
1418 				/* Report real length of the packet */
1419 				uio->uio_resid -= m_length(m, NULL) - len;
1420 			}
1421 			*flagsp |= MSG_TRUNC;
1422 		} else
1423 			*flagsp &= ~MSG_TRUNC;
1424 	}
1425 
1426 	return (0);
1427 }
1428 
/*
 * PF_UNIX/SOCK_DGRAM receive
 *
 * Dequeues one datagram and delivers it: the sender's address is duplicated
 * into '*psa', control mbufs are passed through unp_externalize() and the
 * data is copied out through 'uio'.  'mp0' must be NULL.
 *
 * A datagram that was previously peeked at is served first, then the first
 * buffer on so_rcv.uxdg_conns, then so_rcv.uxdg_mb.  With MSG_PEEK the
 * work is handed over to uipc_peek_dgram().
 *
 * When nothing is queued the function returns a pending so_error, returns 0
 * if SBS_CANTRCVMORE is set or 'uio' has no room, returns EWOULDBLOCK for a
 * non-blocking request, and otherwise sleeps in sbwait().
 *
 * On return MSG_TRUNC in '*flagsp' tells whether part of the datagram did
 * not fit; if MSG_TRUNC was passed in, uio_resid is adjusted to report the
 * real length.  Data that did not fit is discarded.
 */
static int
uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct sockbuf *sb = NULL;
	struct mbuf *m;
	int flags, error;
	ssize_t len = 0;
	bool nonblock;

	MPASS(mp0 == NULL);

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;

	flags = flagsp != NULL ? *flagsp : 0;
	nonblock = (so->so_state & SS_NBIO) ||
	    (flags & (MSG_DONTWAIT | MSG_NBIO));

	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (__predict_false(error))
		return (error);

	/*
	 * Loop blocking while waiting for a datagram.  Prioritize connected
	 * peers over unconnected sends.  Set sb to selected socket buffer
	 * containing an mbuf on exit from the wait loop.  A datagram that
	 * had already been peeked at has top priority.
	 */
	SOCK_RECVBUF_LOCK(so);
	while ((m = so->so_rcv.uxdg_peeked) == NULL &&
	    (sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) == NULL &&
	    (m = STAILQ_FIRST(&so->so_rcv.uxdg_mb)) == NULL) {
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			return (error);
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			return (0);
		}
		if (nonblock) {
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			return (EWOULDBLOCK);
		}
		error = sbwait(so, SO_RCV);
		if (error) {
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			return (error);
		}
	}

	/*
	 * Sort out which of the three sources ended the loop:
	 * - sb == NULL: the record sits on our own receive buffer (peeked
	 *   before any connected buffer was looked at, or taken from
	 *   so_rcv.uxdg_mb);
	 * - sb != NULL, m == NULL: take the head of the connected buffer;
	 * - otherwise 'm' must be the peeked record.
	 */
	if (sb == NULL)
		sb = &so->so_rcv;
	else if (m == NULL)
		m = STAILQ_FIRST(&sb->uxdg_mb);
	else
		MPASS(m == so->so_rcv.uxdg_peeked);

	MPASS(sb->uxdg_cc > 0);
	M_ASSERTPKTHDR(m);
	KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	if (__predict_true(m != so->so_rcv.uxdg_peeked)) {
		STAILQ_REMOVE_HEAD(&sb->uxdg_mb, m_stailqpkt);
		/* A drained connected buffer leaves the receiver's list. */
		if (STAILQ_EMPTY(&sb->uxdg_mb) && sb != &so->so_rcv)
			TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist);
	} else
		so->so_rcv.uxdg_peeked = NULL;

	sb->uxdg_cc -= m->m_pkthdr.len;
	sb->uxdg_ctl -= m->m_pkthdr.ctllen;
	sb->uxdg_mbcnt -= m->m_pkthdr.memlen;

	/*
	 * uipc_peek_dgram() adds the uxdg_* counters back on so_rcv and
	 * drops both locks; the sb_* totals below stay untouched for a peek.
	 */
	if (__predict_false(flags & MSG_PEEK))
		return (uipc_peek_dgram(so, m, psa, uio, controlp, flagsp));

	so->so_rcv.sb_acc -= m->m_pkthdr.len;
	so->so_rcv.sb_ccc -= m->m_pkthdr.len;
	so->so_rcv.sb_ctl -= m->m_pkthdr.ctllen;
	so->so_rcv.sb_mbcnt -= m->m_pkthdr.memlen;
	SOCK_RECVBUF_UNLOCK(so);

	if (psa != NULL)
		*psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
	/* Drop the MT_SONAME mbuf; 'm' now heads the control/data part. */
	m = m_free(m);
	KASSERT(m, ("%s: no data or control after soname", __func__));

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  We call into the
	 * unp_externalize() to perform externalization (or freeing if
	 * controlp == NULL). In some cases there can be only MT_CONTROL mbufs
	 * without MT_DATA mbufs.
	 */
	while (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm;

		/* XXXGL: unp_externalize() is also dom_externalize() KBI and
		 * it frees whole chain, so we must disconnect the mbuf.
		 */
		cm = m; m = m->m_next; cm->m_next = NULL;
		error = unp_externalize(cm, controlp, flags);
		if (error != 0) {
			SOCK_IO_RECV_UNLOCK(so);
			/*
			 * Apply unp_freerights to the rest of the record
			 * before freeing it.
			 */
			unp_scan(m, unp_freerights);
			m_freem(m);
			return (error);
		}
		/* Advance to the tail so the next message is appended. */
		if (controlp != NULL) {
			while (*controlp != NULL)
				controlp = &(*controlp)->m_next;
		}
	}
	KASSERT(m == NULL || m->m_type == MT_DATA,
	    ("%s: not MT_DATA mbuf %p", __func__, m));
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			SOCK_IO_RECV_UNLOCK(so);
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			/*
			 * Partially consumed mbuf: trim the copied part so
			 * that m_length() below counts only what is left.
			 */
			m->m_data += len;
			m->m_len -= len;
		}
	}
	SOCK_IO_RECV_UNLOCK(so);

	if (m != NULL) {
		if (flagsp != NULL) {
			if (flags & MSG_TRUNC) {
				/* Report real length of the packet */
				uio->uio_resid -= m_length(m, NULL);
			}
			*flagsp |= MSG_TRUNC;
		}
		m_freem(m);
	} else if (flagsp != NULL)
		*flagsp &= ~MSG_TRUNC;

	return (0);
}
1596 
1597 static bool
1598 uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
1599 {
1600 	struct mbuf *mb, *n;
1601 	struct sockbuf *sb;
1602 
1603 	SOCK_LOCK(so);
1604 	if (SOLISTENING(so)) {
1605 		SOCK_UNLOCK(so);
1606 		return (false);
1607 	}
1608 	mb = NULL;
1609 	sb = &so->so_rcv;
1610 	SOCKBUF_LOCK(sb);
1611 	if (sb->sb_fnrdy != NULL) {
1612 		for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) {
1613 			if (mb == m) {
1614 				*errorp = sbready(sb, m, count);
1615 				break;
1616 			}
1617 			mb = mb->m_next;
1618 			if (mb == NULL) {
1619 				mb = n;
1620 				if (mb != NULL)
1621 					n = mb->m_nextpkt;
1622 			}
1623 		}
1624 	}
1625 	SOCKBUF_UNLOCK(sb);
1626 	SOCK_UNLOCK(so);
1627 	return (mb != NULL);
1628 }
1629 
1630 static int
1631 uipc_ready(struct socket *so, struct mbuf *m, int count)
1632 {
1633 	struct unpcb *unp, *unp2;
1634 	struct socket *so2;
1635 	int error, i;
1636 
1637 	unp = sotounpcb(so);
1638 
1639 	KASSERT(so->so_type == SOCK_STREAM,
1640 	    ("%s: unexpected socket type for %p", __func__, so));
1641 
1642 	UNP_PCB_LOCK(unp);
1643 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
1644 		UNP_PCB_UNLOCK(unp);
1645 		so2 = unp2->unp_socket;
1646 		SOCKBUF_LOCK(&so2->so_rcv);
1647 		if ((error = sbready(&so2->so_rcv, m, count)) == 0)
1648 			sorwakeup_locked(so2);
1649 		else
1650 			SOCKBUF_UNLOCK(&so2->so_rcv);
1651 		UNP_PCB_UNLOCK(unp2);
1652 		return (error);
1653 	}
1654 	UNP_PCB_UNLOCK(unp);
1655 
1656 	/*
1657 	 * The receiving socket has been disconnected, but may still be valid.
1658 	 * In this case, the now-ready mbufs are still present in its socket
1659 	 * buffer, so perform an exhaustive search before giving up and freeing
1660 	 * the mbufs.
1661 	 */
1662 	UNP_LINK_RLOCK();
1663 	LIST_FOREACH(unp, &unp_shead, unp_link) {
1664 		if (uipc_ready_scan(unp->unp_socket, m, count, &error))
1665 			break;
1666 	}
1667 	UNP_LINK_RUNLOCK();
1668 
1669 	if (unp == NULL) {
1670 		for (i = 0; i < count; i++)
1671 			m = m_free(m);
1672 		error = ECONNRESET;
1673 	}
1674 	return (error);
1675 }
1676 
1677 static int
1678 uipc_sense(struct socket *so, struct stat *sb)
1679 {
1680 	struct unpcb *unp;
1681 
1682 	unp = sotounpcb(so);
1683 	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
1684 
1685 	sb->st_blksize = so->so_snd.sb_hiwat;
1686 	sb->st_dev = NODEV;
1687 	sb->st_ino = unp->unp_ino;
1688 	return (0);
1689 }
1690 
1691 static int
1692 uipc_shutdown(struct socket *so)
1693 {
1694 	struct unpcb *unp;
1695 
1696 	unp = sotounpcb(so);
1697 	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
1698 
1699 	UNP_PCB_LOCK(unp);
1700 	socantsendmore(so);
1701 	unp_shutdown(unp);
1702 	UNP_PCB_UNLOCK(unp);
1703 	return (0);
1704 }
1705 
1706 static int
1707 uipc_sockaddr(struct socket *so, struct sockaddr **nam)
1708 {
1709 	struct unpcb *unp;
1710 	const struct sockaddr *sa;
1711 
1712 	unp = sotounpcb(so);
1713 	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
1714 
1715 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
1716 	UNP_PCB_LOCK(unp);
1717 	if (unp->unp_addr != NULL)
1718 		sa = (struct sockaddr *) unp->unp_addr;
1719 	else
1720 		sa = &sun_noname;
1721 	bcopy(sa, *nam, sa->sa_len);
1722 	UNP_PCB_UNLOCK(unp);
1723 	return (0);
1724 }
1725 
/*
 * Socket option handler for SOL_LOCAL level options; any other level
 * yields EINVAL.
 *
 * SOPT_GET:
 *   LOCAL_PEERCRED copies out the cached peer credentials.  Without
 *   UNP_HAVEPC it fails with ENOTCONN on SOCK_STREAM and EINVAL otherwise.
 *   LOCAL_CREDS, LOCAL_CREDS_PERSISTENT and LOCAL_CONNWAIT report, as 0/1,
 *   whether the matching unp_flags bit is set.
 *   Any other name yields EOPNOTSUPP.
 *
 * SOPT_SET:
 *   LOCAL_CREDS, LOCAL_CREDS_PERSISTENT and LOCAL_CONNWAIT set or clear the
 *   matching unp_flags bit according to the integer option value.
 *   LOCAL_CREDS and LOCAL_CREDS_PERSISTENT exclude each other: enabling one
 *   while the other is set yields EINVAL.
 *   Any other name yields ENOPROTOOPT.
 *
 * Any other direction yields EOPNOTSUPP.
 */
static int
uipc_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct unpcb *unp;
	struct xucred xu;
	int error, optval;

	if (sopt->sopt_level != SOL_LOCAL)
		return (EINVAL);

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case LOCAL_PEERCRED:
			/*
			 * Snapshot the credentials under the PCB lock and
			 * copy them out after the lock is dropped.
			 */
			UNP_PCB_LOCK(unp);
			if (unp->unp_flags & UNP_HAVEPC)
				xu = unp->unp_peercred;
			else {
				if (so->so_type == SOCK_STREAM)
					error = ENOTCONN;
				else
					error = EINVAL;
			}
			UNP_PCB_UNLOCK(unp);
			if (error == 0)
				error = sooptcopyout(sopt, &xu, sizeof(xu));
			break;

		case LOCAL_CREDS:
			/* Unlocked read. */
			optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case LOCAL_CREDS_PERSISTENT:
			/* Unlocked read. */
			optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case LOCAL_CONNWAIT:
			/* Unlocked read. */
			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		default:
			error = EOPNOTSUPP;
			break;
		}
		break;

	case SOPT_SET:
		switch (sopt->sopt_name) {
		case LOCAL_CREDS:
		case LOCAL_CREDS_PERSISTENT:
		case LOCAL_CONNWAIT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
					    sizeof(optval));
			if (error)
				break;

/*
 * Set or clear 'bit' in unp_flags under the PCB lock, depending on optval.
 * Setting is refused with EINVAL while any 'exclusive' bit is set.  The
 * 'break' inside the macro leaves only the do/while wrapper, which skips
 * the trailing unlock after the PCB was already unlocked on that path.
 */
#define	OPTSET(bit, exclusive) do {					\
	UNP_PCB_LOCK(unp);						\
	if (optval) {							\
		if ((unp->unp_flags & (exclusive)) != 0) {		\
			UNP_PCB_UNLOCK(unp);				\
			error = EINVAL;					\
			break;						\
		}							\
		unp->unp_flags |= (bit);				\
	} else								\
		unp->unp_flags &= ~(bit);				\
	UNP_PCB_UNLOCK(unp);						\
} while (0)

			switch (sopt->sopt_name) {
			case LOCAL_CREDS:
				OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS);
				break;

			case LOCAL_CREDS_PERSISTENT:
				OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT);
				break;

			case LOCAL_CONNWAIT:
				OPTSET(UNP_CONNWAIT, 0);
				break;

			default:
				break;
			}
			break;
#undef	OPTSET
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}
1835 
1836 static int
1837 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1838 {
1839 
1840 	return (unp_connectat(AT_FDCWD, so, nam, td, false));
1841 }
1842 
/*
 * Connect socket 'so' to the PF_UNIX socket bound at the path in 'nam',
 * looked up relative to descriptor 'fd'.
 *
 * The address must be AF_UNIX (else EAFNOSUPPORT), no longer than
 * struct sockaddr_un and with a non-empty path (else EINVAL).  The call
 * fails with EOPNOTSUPP on a listening socket, EISCONN when a peer is
 * already attached and EALREADY when another connect is in progress.
 * The path must resolve to a VSOCK vnode (else ENOTSOCK) that passes the
 * VWRITE access check.  ECONNREFUSED is returned when no socket is bound
 * to the vnode, and EPROTOTYPE when the socket types differ.
 *
 * For protocols with PR_CONNREQUIRED the target must be listening; a new
 * socket is created with sonewconn() and becomes the peer, otherwise the
 * result is ECONNREFUSED.  For other protocols the bound socket itself is
 * the peer.
 *
 * UNP_CONNECTING is set on the PCB for the duration of the attempt and is
 * cleared again on both success and failure.
 *
 * With 'return_locked' true, a successful call returns with the PCBs of
 * both sockets locked; otherwise they are unlocked before returning.
 */
static int
unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
    struct thread *td, bool return_locked)
{
	struct mtx *vplock;
	struct sockaddr_un *soun;
	struct vnode *vp;
	struct socket *so2;
	struct unpcb *unp, *unp2, *unp3;
	struct nameidata nd;
	char buf[SOCK_MAXADDRLEN];
	struct sockaddr *sa;
	cap_rights_t rights;
	int error, len;
	bool connreq;

	if (nam->sa_family != AF_UNIX)
		return (EAFNOSUPPORT);
	if (nam->sa_len > sizeof(struct sockaddr_un))
		return (EINVAL);
	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
	if (len <= 0)
		return (EINVAL);
	/* Take a NUL-terminated private copy of the path for namei(). */
	soun = (struct sockaddr_un *)nam;
	bcopy(soun->sun_path, buf, len);
	buf[len] = 0;

	error = 0;
	unp = sotounpcb(so);
	UNP_PCB_LOCK(unp);
	for (;;) {
		/*
		 * Wait for connection state to stabilize.  If a connection
		 * already exists, give up.  For datagram sockets, which permit
		 * multiple consecutive connect(2) calls, upper layers are
		 * responsible for disconnecting in advance of a subsequent
		 * connect(2), but this is not synchronized with PCB connection
		 * state.
		 *
		 * Also make sure that no threads are currently attempting to
		 * lock the peer socket, to ensure that unp_conn cannot
		 * transition between two valid sockets while locks are dropped.
		 */
		if (SOLISTENING(so))
			error = EOPNOTSUPP;
		else if (unp->unp_conn != NULL)
			error = EISCONN;
		else if ((unp->unp_flags & UNP_CONNECTING) != 0) {
			error = EALREADY;
		}
		if (error != 0) {
			UNP_PCB_UNLOCK(unp);
			return (error);
		}
		if (unp->unp_pairbusy > 0) {
			/* Sleep on the PCB, then re-check everything. */
			unp->unp_flags |= UNP_WAITING;
			mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0);
			continue;
		}
		break;
	}
	unp->unp_flags |= UNP_CONNECTING;
	UNP_PCB_UNLOCK(unp);

	/*
	 * For connection-oriented protocols pre-allocate the buffer that
	 * may receive a copy of the listener's address further below.
	 */
	connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0;
	if (connreq)
		sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
	else
		sa = NULL;
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
	    UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT));
	error = namei(&nd);
	if (error)
		vp = NULL;
	else
		vp = nd.ni_vp;
	ASSERT_VOP_LOCKED(vp, "unp_connect");
	if (error)
		goto bad;
	NDFREE_PNBUF(&nd);

	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
#ifdef MAC
	error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
	if (error)
		goto bad;
#endif
	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
	if (error)
		goto bad;

	unp = sotounpcb(so);
	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));

	/* The pool mutex keyed by the vnode is held until "bad2". */
	vplock = mtx_pool_find(mtxpool_sleep, vp);
	mtx_lock(vplock);
	VOP_UNP_CONNECT(vp, &unp2);
	if (unp2 == NULL) {
		error = ECONNREFUSED;
		goto bad2;
	}
	so2 = unp2->unp_socket;
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto bad2;
	}
	if (connreq) {
		if (SOLISTENING(so2)) {
			CURVNET_SET(so2->so_vnet);
			so2 = sonewconn(so2, 0);
			CURVNET_RESTORE();
		} else
			so2 = NULL;
		if (so2 == NULL) {
			error = ECONNREFUSED;
			goto bad2;
		}
		/*
		 * unp2 is the listener, unp3 the PCB of the socket just
		 * created by sonewconn().
		 */
		unp3 = sotounpcb(so2);
		unp_pcb_lock_pair(unp2, unp3);
		if (unp2->unp_addr != NULL) {
			/* The new socket gets its own copy of the address. */
			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
			unp3->unp_addr = (struct sockaddr_un *) sa;
			sa = NULL;
		}

		unp_copy_peercred(td, unp3, unp, unp2);

		/* From here on the peer is the new socket. */
		UNP_PCB_UNLOCK(unp2);
		unp2 = unp3;

		/*
		 * It is safe to block on the PCB lock here since unp2 is
		 * nascent and cannot be connected to any other sockets.
		 */
		UNP_PCB_LOCK(unp);
#ifdef MAC
		mac_socketpeer_set_from_socket(so, so2);
		mac_socketpeer_set_from_socket(so2, so);
#endif
	} else {
		unp_pcb_lock_pair(unp, unp2);
	}
	KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
	    sotounpcb(so2) == unp2,
	    ("%s: unp2 %p so2 %p", __func__, unp2, so2));
	unp_connect2(so, so2, PRU_CONNECT);
	KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
	    ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
	unp->unp_flags &= ~UNP_CONNECTING;
	if (!return_locked)
		unp_pcb_unlock_pair(unp, unp2);
bad2:
	mtx_unlock(vplock);
bad:
	if (vp != NULL) {
		/*
		 * If we are returning locked (called via uipc_sosend_dgram()),
		 * we need to be sure that vput() won't sleep.  This is
		 * guaranteed by VOP_UNP_CONNECT() call above and unp2 lock.
		 * SOCK_STREAM/SEQPACKET can't request return_locked (yet).
		 */
		MPASS(!(return_locked && connreq));
		vput(vp);
	}
	/* 'sa' is NULL here if it was handed over to unp3 above. */
	free(sa, M_SONAME);
	if (__predict_false(error)) {
		/*
		 * Failure paths jump here with the PCB unlocked, so take
		 * the lock again to clear the in-progress marker.
		 */
		UNP_PCB_LOCK(unp);
		KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
		    ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
		unp->unp_flags &= ~UNP_CONNECTING;
		UNP_PCB_UNLOCK(unp);
	}
	return (error);
}
2020 
2021 /*
2022  * Set socket peer credentials at connection time.
2023  *
2024  * The client's PCB credentials are copied from its process structure.  The
2025  * server's PCB credentials are copied from the socket on which it called
2026  * listen(2).  uipc_listen cached that process's credentials at the time.
2027  */
2028 void
2029 unp_copy_peercred(struct thread *td, struct unpcb *client_unp,
2030     struct unpcb *server_unp, struct unpcb *listen_unp)
2031 {
2032 	cru2xt(td, &client_unp->unp_peercred);
2033 	client_unp->unp_flags |= UNP_HAVEPC;
2034 
2035 	memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred,
2036 	    sizeof(server_unp->unp_peercred));
2037 	server_unp->unp_flags |= UNP_HAVEPC;
2038 	client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK);
2039 }
2040 
2041 static void
2042 unp_connect2(struct socket *so, struct socket *so2, conn2_how req)
2043 {
2044 	struct unpcb *unp;
2045 	struct unpcb *unp2;
2046 
2047 	MPASS(so2->so_type == so->so_type);
2048 	unp = sotounpcb(so);
2049 	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
2050 	unp2 = sotounpcb(so2);
2051 	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
2052 
2053 	UNP_PCB_LOCK_ASSERT(unp);
2054 	UNP_PCB_LOCK_ASSERT(unp2);
2055 	KASSERT(unp->unp_conn == NULL,
2056 	    ("%s: socket %p is already connected", __func__, unp));
2057 
2058 	unp->unp_conn = unp2;
2059 	unp_pcb_hold(unp2);
2060 	unp_pcb_hold(unp);
2061 	switch (so->so_type) {
2062 	case SOCK_DGRAM:
2063 		UNP_REF_LIST_LOCK();
2064 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
2065 		UNP_REF_LIST_UNLOCK();
2066 		soisconnected(so);
2067 		break;
2068 
2069 	case SOCK_STREAM:
2070 	case SOCK_SEQPACKET:
2071 		KASSERT(unp2->unp_conn == NULL,
2072 		    ("%s: socket %p is already connected", __func__, unp2));
2073 		unp2->unp_conn = unp;
2074 		if (req == PRU_CONNECT &&
2075 		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
2076 			soisconnecting(so);
2077 		else
2078 			soisconnected(so);
2079 		soisconnected(so2);
2080 		break;
2081 
2082 	default:
2083 		panic("unp_connect2");
2084 	}
2085 }
2086 
2087 static void
2088 unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
2089 {
2090 	struct socket *so, *so2;
2091 	struct mbuf *m = NULL;
2092 #ifdef INVARIANTS
2093 	struct unpcb *unptmp;
2094 #endif
2095 
2096 	UNP_PCB_LOCK_ASSERT(unp);
2097 	UNP_PCB_LOCK_ASSERT(unp2);
2098 	KASSERT(unp->unp_conn == unp2,
2099 	    ("%s: unpcb %p is not connected to %p", __func__, unp, unp2));
2100 
2101 	unp->unp_conn = NULL;
2102 	so = unp->unp_socket;
2103 	so2 = unp2->unp_socket;
2104 	switch (unp->unp_socket->so_type) {
2105 	case SOCK_DGRAM:
2106 		/*
2107 		 * Remove our send socket buffer from the peer's receive buffer.
2108 		 * Move the data to the receive buffer only if it is empty.
2109 		 * This is a protection against a scenario where a peer
2110 		 * connects, floods and disconnects, effectively blocking
2111 		 * sendto() from unconnected sockets.
2112 		 */
2113 		SOCK_RECVBUF_LOCK(so2);
2114 		if (!STAILQ_EMPTY(&so->so_snd.uxdg_mb)) {
2115 			TAILQ_REMOVE(&so2->so_rcv.uxdg_conns, &so->so_snd,
2116 			    uxdg_clist);
2117 			if (__predict_true((so2->so_rcv.sb_state &
2118 			    SBS_CANTRCVMORE) == 0) &&
2119 			    STAILQ_EMPTY(&so2->so_rcv.uxdg_mb)) {
2120 				STAILQ_CONCAT(&so2->so_rcv.uxdg_mb,
2121 				    &so->so_snd.uxdg_mb);
2122 				so2->so_rcv.uxdg_cc += so->so_snd.uxdg_cc;
2123 				so2->so_rcv.uxdg_ctl += so->so_snd.uxdg_ctl;
2124 				so2->so_rcv.uxdg_mbcnt += so->so_snd.uxdg_mbcnt;
2125 			} else {
2126 				m = STAILQ_FIRST(&so->so_snd.uxdg_mb);
2127 				STAILQ_INIT(&so->so_snd.uxdg_mb);
2128 				so2->so_rcv.sb_acc -= so->so_snd.uxdg_cc;
2129 				so2->so_rcv.sb_ccc -= so->so_snd.uxdg_cc;
2130 				so2->so_rcv.sb_ctl -= so->so_snd.uxdg_ctl;
2131 				so2->so_rcv.sb_mbcnt -= so->so_snd.uxdg_mbcnt;
2132 			}
2133 			/* Note: so may reconnect. */
2134 			so->so_snd.uxdg_cc = 0;
2135 			so->so_snd.uxdg_ctl = 0;
2136 			so->so_snd.uxdg_mbcnt = 0;
2137 		}
2138 		SOCK_RECVBUF_UNLOCK(so2);
2139 		UNP_REF_LIST_LOCK();
2140 #ifdef INVARIANTS
2141 		LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) {
2142 			if (unptmp == unp)
2143 				break;
2144 		}
2145 		KASSERT(unptmp != NULL,
2146 		    ("%s: %p not found in reflist of %p", __func__, unp, unp2));
2147 #endif
2148 		LIST_REMOVE(unp, unp_reflink);
2149 		UNP_REF_LIST_UNLOCK();
2150 		if (so) {
2151 			SOCK_LOCK(so);
2152 			so->so_state &= ~SS_ISCONNECTED;
2153 			SOCK_UNLOCK(so);
2154 		}
2155 		break;
2156 
2157 	case SOCK_STREAM:
2158 	case SOCK_SEQPACKET:
2159 		if (so)
2160 			soisdisconnected(so);
2161 		MPASS(unp2->unp_conn == unp);
2162 		unp2->unp_conn = NULL;
2163 		if (so2)
2164 			soisdisconnected(so2);
2165 		break;
2166 	}
2167 
2168 	if (unp == unp2) {
2169 		unp_pcb_rele_notlast(unp);
2170 		if (!unp_pcb_rele(unp))
2171 			UNP_PCB_UNLOCK(unp);
2172 	} else {
2173 		if (!unp_pcb_rele(unp))
2174 			UNP_PCB_UNLOCK(unp);
2175 		if (!unp_pcb_rele(unp2))
2176 			UNP_PCB_UNLOCK(unp2);
2177 	}
2178 
2179 	if (m != NULL) {
2180 		unp_scan(m, unp_freerights);
2181 		m_freem(m);
2182 	}
2183 }
2184 
2185 /*
2186  * unp_pcblist() walks the global list of struct unpcb's to generate a
2187  * pointer list, bumping the refcount on each unpcb.  It then copies them out
2188  * sequentially, validating the generation number on each to see if it has
2189  * been detached.  All of this is necessary because copyout() may sleep on
2190  * disk I/O.
2191  */
2192 static int
2193 unp_pcblist(SYSCTL_HANDLER_ARGS)
2194 {
2195 	struct unpcb *unp, **unp_list;
2196 	unp_gen_t gencnt;
2197 	struct xunpgen *xug;
2198 	struct unp_head *head;
2199 	struct xunpcb *xu;
2200 	u_int i;
2201 	int error, n;
2202 
2203 	switch ((intptr_t)arg1) {
2204 	case SOCK_STREAM:
2205 		head = &unp_shead;
2206 		break;
2207 
2208 	case SOCK_DGRAM:
2209 		head = &unp_dhead;
2210 		break;
2211 
2212 	case SOCK_SEQPACKET:
2213 		head = &unp_sphead;
2214 		break;
2215 
2216 	default:
2217 		panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
2218 	}
2219 
2220 	/*
2221 	 * The process of preparing the PCB list is too time-consuming and
2222 	 * resource-intensive to repeat twice on every request.
2223 	 */
2224 	if (req->oldptr == NULL) {
2225 		n = unp_count;
2226 		req->oldidx = 2 * (sizeof *xug)
2227 			+ (n + n/8) * sizeof(struct xunpcb);
2228 		return (0);
2229 	}
2230 
2231 	if (req->newptr != NULL)
2232 		return (EPERM);
2233 
2234 	/*
2235 	 * OK, now we're committed to doing something.
2236 	 */
2237 	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO);
2238 	UNP_LINK_RLOCK();
2239 	gencnt = unp_gencnt;
2240 	n = unp_count;
2241 	UNP_LINK_RUNLOCK();
2242 
2243 	xug->xug_len = sizeof *xug;
2244 	xug->xug_count = n;
2245 	xug->xug_gen = gencnt;
2246 	xug->xug_sogen = so_gencnt;
2247 	error = SYSCTL_OUT(req, xug, sizeof *xug);
2248 	if (error) {
2249 		free(xug, M_TEMP);
2250 		return (error);
2251 	}
2252 
2253 	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
2254 
2255 	UNP_LINK_RLOCK();
2256 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
2257 	     unp = LIST_NEXT(unp, unp_link)) {
2258 		UNP_PCB_LOCK(unp);
2259 		if (unp->unp_gencnt <= gencnt) {
2260 			if (cr_cansee(req->td->td_ucred,
2261 			    unp->unp_socket->so_cred)) {
2262 				UNP_PCB_UNLOCK(unp);
2263 				continue;
2264 			}
2265 			unp_list[i++] = unp;
2266 			unp_pcb_hold(unp);
2267 		}
2268 		UNP_PCB_UNLOCK(unp);
2269 	}
2270 	UNP_LINK_RUNLOCK();
2271 	n = i;			/* In case we lost some during malloc. */
2272 
2273 	error = 0;
2274 	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
2275 	for (i = 0; i < n; i++) {
2276 		unp = unp_list[i];
2277 		UNP_PCB_LOCK(unp);
2278 		if (unp_pcb_rele(unp))
2279 			continue;
2280 
2281 		if (unp->unp_gencnt <= gencnt) {
2282 			xu->xu_len = sizeof *xu;
2283 			xu->xu_unpp = (uintptr_t)unp;
2284 			/*
2285 			 * XXX - need more locking here to protect against
2286 			 * connect/disconnect races for SMP.
2287 			 */
2288 			if (unp->unp_addr != NULL)
2289 				bcopy(unp->unp_addr, &xu->xu_addr,
2290 				      unp->unp_addr->sun_len);
2291 			else
2292 				bzero(&xu->xu_addr, sizeof(xu->xu_addr));
2293 			if (unp->unp_conn != NULL &&
2294 			    unp->unp_conn->unp_addr != NULL)
2295 				bcopy(unp->unp_conn->unp_addr,
2296 				      &xu->xu_caddr,
2297 				      unp->unp_conn->unp_addr->sun_len);
2298 			else
2299 				bzero(&xu->xu_caddr, sizeof(xu->xu_caddr));
2300 			xu->unp_vnode = (uintptr_t)unp->unp_vnode;
2301 			xu->unp_conn = (uintptr_t)unp->unp_conn;
2302 			xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs);
2303 			xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink);
2304 			xu->unp_gencnt = unp->unp_gencnt;
2305 			sotoxsocket(unp->unp_socket, &xu->xu_socket);
2306 			UNP_PCB_UNLOCK(unp);
2307 			error = SYSCTL_OUT(req, xu, sizeof *xu);
2308 		} else {
2309 			UNP_PCB_UNLOCK(unp);
2310 		}
2311 	}
2312 	free(xu, M_TEMP);
2313 	if (!error) {
2314 		/*
2315 		 * Give the user an updated idea of our state.  If the
2316 		 * generation differs from what we told her before, she knows
2317 		 * that something happened while we were processing this
2318 		 * request, and it might be necessary to retry.
2319 		 */
2320 		xug->xug_gen = unp_gencnt;
2321 		xug->xug_sogen = so_gencnt;
2322 		xug->xug_count = unp_count;
2323 		error = SYSCTL_OUT(req, xug, sizeof *xug);
2324 	}
2325 	free(unp_list, M_TEMP);
2326 	free(xug, M_TEMP);
2327 	return (error);
2328 }
2329 
2330 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist,
2331     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
2332     (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
2333     "List of active local datagram sockets");
2334 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist,
2335     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
2336     (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
2337     "List of active local stream sockets");
2338 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
2339     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
2340     (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
2341     "List of active local seqpacket sockets");
2342 
2343 static void
2344 unp_shutdown(struct unpcb *unp)
2345 {
2346 	struct unpcb *unp2;
2347 	struct socket *so;
2348 
2349 	UNP_PCB_LOCK_ASSERT(unp);
2350 
2351 	unp2 = unp->unp_conn;
2352 	if ((unp->unp_socket->so_type == SOCK_STREAM ||
2353 	    (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) {
2354 		so = unp2->unp_socket;
2355 		if (so != NULL)
2356 			socantrcvmore(so);
2357 	}
2358 }
2359 
2360 static void
2361 unp_drop(struct unpcb *unp)
2362 {
2363 	struct socket *so;
2364 	struct unpcb *unp2;
2365 
2366 	/*
2367 	 * Regardless of whether the socket's peer dropped the connection
2368 	 * with this socket by aborting or disconnecting, POSIX requires
2369 	 * that ECONNRESET is returned.
2370 	 */
2371 
2372 	UNP_PCB_LOCK(unp);
2373 	so = unp->unp_socket;
2374 	if (so)
2375 		so->so_error = ECONNRESET;
2376 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
2377 		/* Last reference dropped in unp_disconnect(). */
2378 		unp_pcb_rele_notlast(unp);
2379 		unp_disconnect(unp, unp2);
2380 	} else if (!unp_pcb_rele(unp)) {
2381 		UNP_PCB_UNLOCK(unp);
2382 	}
2383 }
2384 
2385 static void
2386 unp_freerights(struct filedescent **fdep, int fdcount)
2387 {
2388 	struct file *fp;
2389 	int i;
2390 
2391 	KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
2392 
2393 	for (i = 0; i < fdcount; i++) {
2394 		fp = fdep[i]->fde_file;
2395 		filecaps_free(&fdep[i]->fde_caps);
2396 		unp_discard(fp);
2397 	}
2398 	free(fdep[0], M_FILECAPS);
2399 }
2400 
2401 static int
2402 unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
2403 {
2404 	struct thread *td = curthread;		/* XXX */
2405 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
2406 	int i;
2407 	int *fdp;
2408 	struct filedesc *fdesc = td->td_proc->p_fd;
2409 	struct filedescent **fdep;
2410 	void *data;
2411 	socklen_t clen = control->m_len, datalen;
2412 	int error, newfds;
2413 	u_int newlen;
2414 
2415 	UNP_LINK_UNLOCK_ASSERT();
2416 
2417 	error = 0;
2418 	if (controlp != NULL) /* controlp == NULL => free control messages */
2419 		*controlp = NULL;
2420 	while (cm != NULL) {
2421 		MPASS(clen >= sizeof(*cm) && clen >= cm->cmsg_len);
2422 
2423 		data = CMSG_DATA(cm);
2424 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
2425 		if (cm->cmsg_level == SOL_SOCKET
2426 		    && cm->cmsg_type == SCM_RIGHTS) {
2427 			newfds = datalen / sizeof(*fdep);
2428 			if (newfds == 0)
2429 				goto next;
2430 			fdep = data;
2431 
2432 			/* If we're not outputting the descriptors free them. */
2433 			if (error || controlp == NULL) {
2434 				unp_freerights(fdep, newfds);
2435 				goto next;
2436 			}
2437 			FILEDESC_XLOCK(fdesc);
2438 
2439 			/*
2440 			 * Now change each pointer to an fd in the global
2441 			 * table to an integer that is the index to the local
2442 			 * fd table entry that we set up to point to the
2443 			 * global one we are transferring.
2444 			 */
2445 			newlen = newfds * sizeof(int);
2446 			*controlp = sbcreatecontrol(NULL, newlen,
2447 			    SCM_RIGHTS, SOL_SOCKET, M_WAITOK);
2448 
2449 			fdp = (int *)
2450 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2451 			if ((error = fdallocn(td, 0, fdp, newfds))) {
2452 				FILEDESC_XUNLOCK(fdesc);
2453 				unp_freerights(fdep, newfds);
2454 				m_freem(*controlp);
2455 				*controlp = NULL;
2456 				goto next;
2457 			}
2458 			for (i = 0; i < newfds; i++, fdp++) {
2459 				_finstall(fdesc, fdep[i]->fde_file, *fdp,
2460 				    (flags & MSG_CMSG_CLOEXEC) != 0 ? O_CLOEXEC : 0,
2461 				    &fdep[i]->fde_caps);
2462 				unp_externalize_fp(fdep[i]->fde_file);
2463 			}
2464 
2465 			/*
2466 			 * The new type indicates that the mbuf data refers to
2467 			 * kernel resources that may need to be released before
2468 			 * the mbuf is freed.
2469 			 */
2470 			m_chtype(*controlp, MT_EXTCONTROL);
2471 			FILEDESC_XUNLOCK(fdesc);
2472 			free(fdep[0], M_FILECAPS);
2473 		} else {
2474 			/* We can just copy anything else across. */
2475 			if (error || controlp == NULL)
2476 				goto next;
2477 			*controlp = sbcreatecontrol(NULL, datalen,
2478 			    cm->cmsg_type, cm->cmsg_level, M_WAITOK);
2479 			bcopy(data,
2480 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
2481 			    datalen);
2482 		}
2483 		controlp = &(*controlp)->m_next;
2484 
2485 next:
2486 		if (CMSG_SPACE(datalen) < clen) {
2487 			clen -= CMSG_SPACE(datalen);
2488 			cm = (struct cmsghdr *)
2489 			    ((caddr_t)cm + CMSG_SPACE(datalen));
2490 		} else {
2491 			clen = 0;
2492 			cm = NULL;
2493 		}
2494 	}
2495 
2496 	m_freem(control);
2497 	return (error);
2498 }
2499 
2500 static void
2501 unp_zone_change(void *tag)
2502 {
2503 
2504 	uma_zone_set_max(unp_zone, maxsockets);
2505 }
2506 
#ifdef INVARIANTS
/*
 * Zone destructor used only in INVARIANTS kernels: assert that a unpcb
 * being freed no longer references anything (no referrers, socket, vnode,
 * peer or bound address).
 */
static void
unp_zdtor(void *mem, int size __unused, void *arg __unused)
{
	struct unpcb *unp = mem;

	KASSERT(LIST_EMPTY(&unp->unp_refs),
	    ("%s: unpcb %p has lingering refs", __func__, unp));
	KASSERT(unp->unp_socket == NULL,
	    ("%s: unpcb %p has socket backpointer", __func__, unp));
	KASSERT(unp->unp_vnode == NULL,
	    ("%s: unpcb %p has vnode references", __func__, unp));
	KASSERT(unp->unp_conn == NULL,
	    ("%s: unpcb %p is still connected", __func__, unp));
	KASSERT(unp->unp_addr == NULL,
	    ("%s: unpcb %p has leaked addr", __func__, unp));
}
#endif
2527 
2528 static void
2529 unp_init(void *arg __unused)
2530 {
2531 	uma_dtor dtor;
2532 
2533 #ifdef INVARIANTS
2534 	dtor = unp_zdtor;
2535 #else
2536 	dtor = NULL;
2537 #endif
2538 	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, dtor,
2539 	    NULL, NULL, UMA_ALIGN_CACHE, 0);
2540 	uma_zone_set_max(unp_zone, maxsockets);
2541 	uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
2542 	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
2543 	    NULL, EVENTHANDLER_PRI_ANY);
2544 	LIST_INIT(&unp_dhead);
2545 	LIST_INIT(&unp_shead);
2546 	LIST_INIT(&unp_sphead);
2547 	SLIST_INIT(&unp_defers);
2548 	TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
2549 	TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
2550 	UNP_LINK_LOCK_INIT();
2551 	UNP_DEFERRED_LOCK_INIT();
2552 }
2553 SYSINIT(unp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, unp_init, NULL);
2554 
2555 static void
2556 unp_internalize_cleanup_rights(struct mbuf *control)
2557 {
2558 	struct cmsghdr *cp;
2559 	struct mbuf *m;
2560 	void *data;
2561 	socklen_t datalen;
2562 
2563 	for (m = control; m != NULL; m = m->m_next) {
2564 		cp = mtod(m, struct cmsghdr *);
2565 		if (cp->cmsg_level != SOL_SOCKET ||
2566 		    cp->cmsg_type != SCM_RIGHTS)
2567 			continue;
2568 		data = CMSG_DATA(cp);
2569 		datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data;
2570 		unp_freerights(data, datalen / sizeof(struct filedesc *));
2571 	}
2572 }
2573 
2574 static int
2575 unp_internalize(struct mbuf **controlp, struct thread *td,
2576     struct mbuf **clast, u_int *space, u_int *mbcnt)
2577 {
2578 	struct mbuf *control, **initial_controlp;
2579 	struct proc *p;
2580 	struct filedesc *fdesc;
2581 	struct bintime *bt;
2582 	struct cmsghdr *cm;
2583 	struct cmsgcred *cmcred;
2584 	struct filedescent *fde, **fdep, *fdev;
2585 	struct file *fp;
2586 	struct timeval *tv;
2587 	struct timespec *ts;
2588 	void *data;
2589 	socklen_t clen, datalen;
2590 	int i, j, error, *fdp, oldfds;
2591 	u_int newlen;
2592 
2593 	MPASS((*controlp)->m_next == NULL); /* COMPAT_OLDSOCK may violate */
2594 	UNP_LINK_UNLOCK_ASSERT();
2595 
2596 	p = td->td_proc;
2597 	fdesc = p->p_fd;
2598 	error = 0;
2599 	control = *controlp;
2600 	*controlp = NULL;
2601 	initial_controlp = controlp;
2602 	for (clen = control->m_len, cm = mtod(control, struct cmsghdr *),
2603 	    data = CMSG_DATA(cm);
2604 
2605 	    clen >= sizeof(*cm) && cm->cmsg_level == SOL_SOCKET &&
2606 	    clen >= cm->cmsg_len && cm->cmsg_len >= sizeof(*cm) &&
2607 	    (char *)cm + cm->cmsg_len >= (char *)data;
2608 
2609 	    clen -= min(CMSG_SPACE(datalen), clen),
2610 	    cm = (struct cmsghdr *) ((char *)cm + CMSG_SPACE(datalen)),
2611 	    data = CMSG_DATA(cm)) {
2612 		datalen = (char *)cm + cm->cmsg_len - (char *)data;
2613 		switch (cm->cmsg_type) {
2614 		case SCM_CREDS:
2615 			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
2616 			    SCM_CREDS, SOL_SOCKET, M_WAITOK);
2617 			cmcred = (struct cmsgcred *)
2618 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2619 			cmcred->cmcred_pid = p->p_pid;
2620 			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
2621 			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
2622 			cmcred->cmcred_euid = td->td_ucred->cr_uid;
2623 			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
2624 			    CMGROUP_MAX);
2625 			for (i = 0; i < cmcred->cmcred_ngroups; i++)
2626 				cmcred->cmcred_groups[i] =
2627 				    td->td_ucred->cr_groups[i];
2628 			break;
2629 
2630 		case SCM_RIGHTS:
2631 			oldfds = datalen / sizeof (int);
2632 			if (oldfds == 0)
2633 				continue;
2634 			/* On some machines sizeof pointer is bigger than
2635 			 * sizeof int, so we need to check if data fits into
2636 			 * single mbuf.  We could allocate several mbufs, and
2637 			 * unp_externalize() should even properly handle that.
2638 			 * But it is not worth to complicate the code for an
2639 			 * insane scenario of passing over 200 file descriptors
2640 			 * at once.
2641 			 */
2642 			newlen = oldfds * sizeof(fdep[0]);
2643 			if (CMSG_SPACE(newlen) > MCLBYTES) {
2644 				error = EMSGSIZE;
2645 				goto out;
2646 			}
2647 			/*
2648 			 * Check that all the FDs passed in refer to legal
2649 			 * files.  If not, reject the entire operation.
2650 			 */
2651 			fdp = data;
2652 			FILEDESC_SLOCK(fdesc);
2653 			for (i = 0; i < oldfds; i++, fdp++) {
2654 				fp = fget_noref(fdesc, *fdp);
2655 				if (fp == NULL) {
2656 					FILEDESC_SUNLOCK(fdesc);
2657 					error = EBADF;
2658 					goto out;
2659 				}
2660 				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
2661 					FILEDESC_SUNLOCK(fdesc);
2662 					error = EOPNOTSUPP;
2663 					goto out;
2664 				}
2665 			}
2666 
2667 			/*
2668 			 * Now replace the integer FDs with pointers to the
2669 			 * file structure and capability rights.
2670 			 */
2671 			*controlp = sbcreatecontrol(NULL, newlen,
2672 			    SCM_RIGHTS, SOL_SOCKET, M_WAITOK);
2673 			fdp = data;
2674 			for (i = 0; i < oldfds; i++, fdp++) {
2675 				if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) {
2676 					fdp = data;
2677 					for (j = 0; j < i; j++, fdp++) {
2678 						fdrop(fdesc->fd_ofiles[*fdp].
2679 						    fde_file, td);
2680 					}
2681 					FILEDESC_SUNLOCK(fdesc);
2682 					error = EBADF;
2683 					goto out;
2684 				}
2685 			}
2686 			fdp = data;
2687 			fdep = (struct filedescent **)
2688 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2689 			fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
2690 			    M_WAITOK);
2691 			for (i = 0; i < oldfds; i++, fdev++, fdp++) {
2692 				fde = &fdesc->fd_ofiles[*fdp];
2693 				fdep[i] = fdev;
2694 				fdep[i]->fde_file = fde->fde_file;
2695 				filecaps_copy(&fde->fde_caps,
2696 				    &fdep[i]->fde_caps, true);
2697 				unp_internalize_fp(fdep[i]->fde_file);
2698 			}
2699 			FILEDESC_SUNLOCK(fdesc);
2700 			break;
2701 
2702 		case SCM_TIMESTAMP:
2703 			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
2704 			    SCM_TIMESTAMP, SOL_SOCKET, M_WAITOK);
2705 			tv = (struct timeval *)
2706 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2707 			microtime(tv);
2708 			break;
2709 
2710 		case SCM_BINTIME:
2711 			*controlp = sbcreatecontrol(NULL, sizeof(*bt),
2712 			    SCM_BINTIME, SOL_SOCKET, M_WAITOK);
2713 			bt = (struct bintime *)
2714 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2715 			bintime(bt);
2716 			break;
2717 
2718 		case SCM_REALTIME:
2719 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
2720 			    SCM_REALTIME, SOL_SOCKET, M_WAITOK);
2721 			ts = (struct timespec *)
2722 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2723 			nanotime(ts);
2724 			break;
2725 
2726 		case SCM_MONOTONIC:
2727 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
2728 			    SCM_MONOTONIC, SOL_SOCKET, M_WAITOK);
2729 			ts = (struct timespec *)
2730 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2731 			nanouptime(ts);
2732 			break;
2733 
2734 		default:
2735 			error = EINVAL;
2736 			goto out;
2737 		}
2738 
2739 		if (space != NULL) {
2740 			*space += (*controlp)->m_len;
2741 			*mbcnt += MSIZE;
2742 			if ((*controlp)->m_flags & M_EXT)
2743 				*mbcnt += (*controlp)->m_ext.ext_size;
2744 			*clast = *controlp;
2745 		}
2746 		controlp = &(*controlp)->m_next;
2747 	}
2748 	if (clen > 0)
2749 		error = EINVAL;
2750 
2751 out:
2752 	if (error != 0 && initial_controlp != NULL)
2753 		unp_internalize_cleanup_rights(*initial_controlp);
2754 	m_freem(control);
2755 	return (error);
2756 }
2757 
2758 static struct mbuf *
2759 unp_addsockcred(struct thread *td, struct mbuf *control, int mode,
2760     struct mbuf **clast, u_int *space, u_int *mbcnt)
2761 {
2762 	struct mbuf *m, *n, *n_prev;
2763 	const struct cmsghdr *cm;
2764 	int ngroups, i, cmsgtype;
2765 	size_t ctrlsz;
2766 
2767 	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
2768 	if (mode & UNP_WANTCRED_ALWAYS) {
2769 		ctrlsz = SOCKCRED2SIZE(ngroups);
2770 		cmsgtype = SCM_CREDS2;
2771 	} else {
2772 		ctrlsz = SOCKCREDSIZE(ngroups);
2773 		cmsgtype = SCM_CREDS;
2774 	}
2775 
2776 	m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET, M_NOWAIT);
2777 	if (m == NULL)
2778 		return (control);
2779 	MPASS((m->m_flags & M_EXT) == 0 && m->m_next == NULL);
2780 
2781 	if (mode & UNP_WANTCRED_ALWAYS) {
2782 		struct sockcred2 *sc;
2783 
2784 		sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
2785 		sc->sc_version = 0;
2786 		sc->sc_pid = td->td_proc->p_pid;
2787 		sc->sc_uid = td->td_ucred->cr_ruid;
2788 		sc->sc_euid = td->td_ucred->cr_uid;
2789 		sc->sc_gid = td->td_ucred->cr_rgid;
2790 		sc->sc_egid = td->td_ucred->cr_gid;
2791 		sc->sc_ngroups = ngroups;
2792 		for (i = 0; i < sc->sc_ngroups; i++)
2793 			sc->sc_groups[i] = td->td_ucred->cr_groups[i];
2794 	} else {
2795 		struct sockcred *sc;
2796 
2797 		sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
2798 		sc->sc_uid = td->td_ucred->cr_ruid;
2799 		sc->sc_euid = td->td_ucred->cr_uid;
2800 		sc->sc_gid = td->td_ucred->cr_rgid;
2801 		sc->sc_egid = td->td_ucred->cr_gid;
2802 		sc->sc_ngroups = ngroups;
2803 		for (i = 0; i < sc->sc_ngroups; i++)
2804 			sc->sc_groups[i] = td->td_ucred->cr_groups[i];
2805 	}
2806 
2807 	/*
2808 	 * Unlink SCM_CREDS control messages (struct cmsgcred), since just
2809 	 * created SCM_CREDS control message (struct sockcred) has another
2810 	 * format.
2811 	 */
2812 	if (control != NULL && cmsgtype == SCM_CREDS)
2813 		for (n = control, n_prev = NULL; n != NULL;) {
2814 			cm = mtod(n, struct cmsghdr *);
2815     			if (cm->cmsg_level == SOL_SOCKET &&
2816 			    cm->cmsg_type == SCM_CREDS) {
2817     				if (n_prev == NULL)
2818 					control = n->m_next;
2819 				else
2820 					n_prev->m_next = n->m_next;
2821 				if (space != NULL) {
2822 					MPASS(*space >= n->m_len);
2823 					*space -= n->m_len;
2824 					MPASS(*mbcnt >= MSIZE);
2825 					*mbcnt -= MSIZE;
2826 					if (n->m_flags & M_EXT) {
2827 						MPASS(*mbcnt >=
2828 						    n->m_ext.ext_size);
2829 						*mbcnt -= n->m_ext.ext_size;
2830 					}
2831 					MPASS(clast);
2832 					if (*clast == n) {
2833 						MPASS(n->m_next == NULL);
2834 						if (n_prev == NULL)
2835 							*clast = m;
2836 						else
2837 							*clast = n_prev;
2838 					}
2839 				}
2840 				n = m_free(n);
2841 			} else {
2842 				n_prev = n;
2843 				n = n->m_next;
2844 			}
2845 		}
2846 
2847 	/* Prepend it to the head. */
2848 	m->m_next = control;
2849 	if (space != NULL) {
2850 		*space += m->m_len;
2851 		*mbcnt += MSIZE;
2852 		if (control == NULL)
2853 			*clast = m;
2854 	}
2855 	return (m);
2856 }
2857 
2858 static struct unpcb *
2859 fptounp(struct file *fp)
2860 {
2861 	struct socket *so;
2862 
2863 	if (fp->f_type != DTYPE_SOCKET)
2864 		return (NULL);
2865 	if ((so = fp->f_data) == NULL)
2866 		return (NULL);
2867 	if (so->so_proto->pr_domain != &localdomain)
2868 		return (NULL);
2869 	return sotounpcb(so);
2870 }
2871 
2872 static void
2873 unp_discard(struct file *fp)
2874 {
2875 	struct unp_defer *dr;
2876 
2877 	if (unp_externalize_fp(fp)) {
2878 		dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK);
2879 		dr->ud_fp = fp;
2880 		UNP_DEFERRED_LOCK();
2881 		SLIST_INSERT_HEAD(&unp_defers, dr, ud_link);
2882 		UNP_DEFERRED_UNLOCK();
2883 		atomic_add_int(&unp_defers_count, 1);
2884 		taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
2885 	} else
2886 		closef_nothread(fp);
2887 }
2888 
2889 static void
2890 unp_process_defers(void *arg __unused, int pending)
2891 {
2892 	struct unp_defer *dr;
2893 	SLIST_HEAD(, unp_defer) drl;
2894 	int count;
2895 
2896 	SLIST_INIT(&drl);
2897 	for (;;) {
2898 		UNP_DEFERRED_LOCK();
2899 		if (SLIST_FIRST(&unp_defers) == NULL) {
2900 			UNP_DEFERRED_UNLOCK();
2901 			break;
2902 		}
2903 		SLIST_SWAP(&unp_defers, &drl, unp_defer);
2904 		UNP_DEFERRED_UNLOCK();
2905 		count = 0;
2906 		while ((dr = SLIST_FIRST(&drl)) != NULL) {
2907 			SLIST_REMOVE_HEAD(&drl, ud_link);
2908 			closef_nothread(dr->ud_fp);
2909 			free(dr, M_TEMP);
2910 			count++;
2911 		}
2912 		atomic_add_int(&unp_defers_count, -count);
2913 	}
2914 }
2915 
2916 static void
2917 unp_internalize_fp(struct file *fp)
2918 {
2919 	struct unpcb *unp;
2920 
2921 	UNP_LINK_WLOCK();
2922 	if ((unp = fptounp(fp)) != NULL) {
2923 		unp->unp_file = fp;
2924 		unp->unp_msgcount++;
2925 	}
2926 	unp_rights++;
2927 	UNP_LINK_WUNLOCK();
2928 }
2929 
2930 static int
2931 unp_externalize_fp(struct file *fp)
2932 {
2933 	struct unpcb *unp;
2934 	int ret;
2935 
2936 	UNP_LINK_WLOCK();
2937 	if ((unp = fptounp(fp)) != NULL) {
2938 		unp->unp_msgcount--;
2939 		ret = 1;
2940 	} else
2941 		ret = 0;
2942 	unp_rights--;
2943 	UNP_LINK_WUNLOCK();
2944 	return (ret);
2945 }
2946 
2947 /*
 * unp_marked indicates whether additional work has been deferred for a future
 * pass through unp_gc().  It is thread local and does not require explicit
 * synchronization.
2951  */
2952 static int	unp_marked;
2953 
2954 static void
2955 unp_remove_dead_ref(struct filedescent **fdep, int fdcount)
2956 {
2957 	struct unpcb *unp;
2958 	struct file *fp;
2959 	int i;
2960 
2961 	/*
2962 	 * This function can only be called from the gc task.
2963 	 */
2964 	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
2965 	    ("%s: not on gc callout", __func__));
2966 	UNP_LINK_LOCK_ASSERT();
2967 
2968 	for (i = 0; i < fdcount; i++) {
2969 		fp = fdep[i]->fde_file;
2970 		if ((unp = fptounp(fp)) == NULL)
2971 			continue;
2972 		if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
2973 			continue;
2974 		unp->unp_gcrefs--;
2975 	}
2976 }
2977 
2978 static void
2979 unp_restore_undead_ref(struct filedescent **fdep, int fdcount)
2980 {
2981 	struct unpcb *unp;
2982 	struct file *fp;
2983 	int i;
2984 
2985 	/*
2986 	 * This function can only be called from the gc task.
2987 	 */
2988 	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
2989 	    ("%s: not on gc callout", __func__));
2990 	UNP_LINK_LOCK_ASSERT();
2991 
2992 	for (i = 0; i < fdcount; i++) {
2993 		fp = fdep[i]->fde_file;
2994 		if ((unp = fptounp(fp)) == NULL)
2995 			continue;
2996 		if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
2997 			continue;
2998 		unp->unp_gcrefs++;
2999 		unp_marked++;
3000 	}
3001 }
3002 
3003 static void
3004 unp_scan_socket(struct socket *so, void (*op)(struct filedescent **, int))
3005 {
3006 	struct sockbuf *sb;
3007 
3008 	SOCK_LOCK_ASSERT(so);
3009 
3010 	if (sotounpcb(so)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
3011 		return;
3012 
3013 	SOCK_RECVBUF_LOCK(so);
3014 	switch (so->so_type) {
3015 	case SOCK_DGRAM:
3016 		unp_scan(STAILQ_FIRST(&so->so_rcv.uxdg_mb), op);
3017 		unp_scan(so->so_rcv.uxdg_peeked, op);
3018 		TAILQ_FOREACH(sb, &so->so_rcv.uxdg_conns, uxdg_clist)
3019 			unp_scan(STAILQ_FIRST(&sb->uxdg_mb), op);
3020 		break;
3021 	case SOCK_STREAM:
3022 	case SOCK_SEQPACKET:
3023 		unp_scan(so->so_rcv.sb_mb, op);
3024 		break;
3025 	}
3026 	SOCK_RECVBUF_UNLOCK(so);
3027 }
3028 
3029 static void
3030 unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int))
3031 {
3032 	struct socket *so, *soa;
3033 
3034 	so = unp->unp_socket;
3035 	SOCK_LOCK(so);
3036 	if (SOLISTENING(so)) {
3037 		/*
3038 		 * Mark all sockets in our accept queue.
3039 		 */
3040 		TAILQ_FOREACH(soa, &so->sol_comp, so_list)
3041 			unp_scan_socket(soa, op);
3042 	} else {
3043 		/*
3044 		 * Mark all sockets we reference with RIGHTS.
3045 		 */
3046 		unp_scan_socket(so, op);
3047 	}
3048 	SOCK_UNLOCK(so);
3049 }
3050 
3051 static int unp_recycled;
3052 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
3053     "Number of unreachable sockets claimed by the garbage collector.");
3054 
3055 static int unp_taskcount;
3056 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
3057     "Number of times the garbage collector has run.");
3058 
3059 SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0,
3060     "Number of active local sockets.");
3061 
3062 static void
3063 unp_gc(__unused void *arg, int pending)
3064 {
3065 	struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
3066 				    NULL };
3067 	struct unp_head **head;
3068 	struct unp_head unp_deadhead;	/* List of potentially-dead sockets. */
3069 	struct file *f, **unref;
3070 	struct unpcb *unp, *unptmp;
3071 	int i, total, unp_unreachable;
3072 
3073 	LIST_INIT(&unp_deadhead);
3074 	unp_taskcount++;
3075 	UNP_LINK_RLOCK();
3076 	/*
3077 	 * First determine which sockets may be in cycles.
3078 	 */
3079 	unp_unreachable = 0;
3080 
3081 	for (head = heads; *head != NULL; head++)
3082 		LIST_FOREACH(unp, *head, unp_link) {
3083 			KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0,
3084 			    ("%s: unp %p has unexpected gc flags 0x%x",
3085 			    __func__, unp, (unsigned int)unp->unp_gcflag));
3086 
3087 			f = unp->unp_file;
3088 
3089 			/*
3090 			 * Check for an unreachable socket potentially in a
3091 			 * cycle.  It must be in a queue as indicated by
3092 			 * msgcount, and this must equal the file reference
3093 			 * count.  Note that when msgcount is 0 the file is
3094 			 * NULL.
3095 			 */
3096 			if (f != NULL && unp->unp_msgcount != 0 &&
3097 			    refcount_load(&f->f_count) == unp->unp_msgcount) {
3098 				LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead);
3099 				unp->unp_gcflag |= UNPGC_DEAD;
3100 				unp->unp_gcrefs = unp->unp_msgcount;
3101 				unp_unreachable++;
3102 			}
3103 		}
3104 
3105 	/*
3106 	 * Scan all sockets previously marked as potentially being in a cycle
3107 	 * and remove the references each socket holds on any UNPGC_DEAD
3108 	 * sockets in its queue.  After this step, all remaining references on
3109 	 * sockets marked UNPGC_DEAD should not be part of any cycle.
3110 	 */
3111 	LIST_FOREACH(unp, &unp_deadhead, unp_dead)
3112 		unp_gc_scan(unp, unp_remove_dead_ref);
3113 
3114 	/*
3115 	 * If a socket still has a non-negative refcount, it cannot be in a
3116 	 * cycle.  In this case increment refcount of all children iteratively.
3117 	 * Stop the scan once we do a complete loop without discovering
3118 	 * a new reachable socket.
3119 	 */
3120 	do {
3121 		unp_marked = 0;
3122 		LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp)
3123 			if (unp->unp_gcrefs > 0) {
3124 				unp->unp_gcflag &= ~UNPGC_DEAD;
3125 				LIST_REMOVE(unp, unp_dead);
3126 				KASSERT(unp_unreachable > 0,
3127 				    ("%s: unp_unreachable underflow.",
3128 				    __func__));
3129 				unp_unreachable--;
3130 				unp_gc_scan(unp, unp_restore_undead_ref);
3131 			}
3132 	} while (unp_marked);
3133 
3134 	UNP_LINK_RUNLOCK();
3135 
3136 	if (unp_unreachable == 0)
3137 		return;
3138 
3139 	/*
3140 	 * Allocate space for a local array of dead unpcbs.
3141 	 * TODO: can this path be simplified by instead using the local
3142 	 * dead list at unp_deadhead, after taking out references
3143 	 * on the file object and/or unpcb and dropping the link lock?
3144 	 */
3145 	unref = malloc(unp_unreachable * sizeof(struct file *),
3146 	    M_TEMP, M_WAITOK);
3147 
3148 	/*
3149 	 * Iterate looking for sockets which have been specifically marked
3150 	 * as unreachable and store them locally.
3151 	 */
3152 	UNP_LINK_RLOCK();
3153 	total = 0;
3154 	LIST_FOREACH(unp, &unp_deadhead, unp_dead) {
3155 		KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0,
3156 		    ("%s: unp %p not marked UNPGC_DEAD", __func__, unp));
3157 		unp->unp_gcflag &= ~UNPGC_DEAD;
3158 		f = unp->unp_file;
3159 		if (unp->unp_msgcount == 0 || f == NULL ||
3160 		    refcount_load(&f->f_count) != unp->unp_msgcount ||
3161 		    !fhold(f))
3162 			continue;
3163 		unref[total++] = f;
3164 		KASSERT(total <= unp_unreachable,
3165 		    ("%s: incorrect unreachable count.", __func__));
3166 	}
3167 	UNP_LINK_RUNLOCK();
3168 
3169 	/*
3170 	 * Now flush all sockets, free'ing rights.  This will free the
3171 	 * struct files associated with these sockets but leave each socket
3172 	 * with one remaining ref.
3173 	 */
3174 	for (i = 0; i < total; i++) {
3175 		struct socket *so;
3176 
3177 		so = unref[i]->f_data;
3178 		CURVNET_SET(so->so_vnet);
3179 		sorflush(so);
3180 		CURVNET_RESTORE();
3181 	}
3182 
3183 	/*
3184 	 * And finally release the sockets so they can be reclaimed.
3185 	 */
3186 	for (i = 0; i < total; i++)
3187 		fdrop(unref[i], NULL);
3188 	unp_recycled += total;
3189 	free(unref, M_TEMP);
3190 }
3191 
3192 /*
3193  * Synchronize against unp_gc, which can trip over data as we are freeing it.
3194  */
3195 static void
3196 unp_dispose(struct socket *so)
3197 {
3198 	struct sockbuf *sb;
3199 	struct unpcb *unp;
3200 	struct mbuf *m;
3201 
3202 	MPASS(!SOLISTENING(so));
3203 
3204 	unp = sotounpcb(so);
3205 	UNP_LINK_WLOCK();
3206 	unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
3207 	UNP_LINK_WUNLOCK();
3208 
3209 	/*
3210 	 * Grab our special mbufs before calling sbrelease().
3211 	 */
3212 	SOCK_RECVBUF_LOCK(so);
3213 	switch (so->so_type) {
3214 	case SOCK_DGRAM:
3215 		while ((sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) != NULL) {
3216 			STAILQ_CONCAT(&so->so_rcv.uxdg_mb, &sb->uxdg_mb);
3217 			TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist);
3218 			/* Note: socket of sb may reconnect. */
3219 			sb->uxdg_cc = sb->uxdg_ctl = sb->uxdg_mbcnt = 0;
3220 		}
3221 		sb = &so->so_rcv;
3222 		if (sb->uxdg_peeked != NULL) {
3223 			STAILQ_INSERT_HEAD(&sb->uxdg_mb, sb->uxdg_peeked,
3224 			    m_stailqpkt);
3225 			sb->uxdg_peeked = NULL;
3226 		}
3227 		m = STAILQ_FIRST(&sb->uxdg_mb);
3228 		STAILQ_INIT(&sb->uxdg_mb);
3229 		/* XXX: our shortened sbrelease() */
3230 		(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
3231 		    RLIM_INFINITY);
3232 		/*
3233 		 * XXXGL Mark sb with SBS_CANTRCVMORE.  This is needed to
3234 		 * prevent uipc_sosend_dgram() or unp_disconnect() adding more
3235 		 * data to the socket.
3236 		 * We are now in dom_dispose and it could be a call from
3237 		 * soshutdown() or from the final sofree().  The sofree() case
3238 		 * is simple as it guarantees that no more sends will happen,
3239 		 * however we can race with unp_disconnect() from our peer.
3240 		 * The shutdown(2) case is more exotic.  It would call into
3241 		 * dom_dispose() only if socket is SS_ISCONNECTED.  This is
3242 		 * possible if we did connect(2) on this socket and we also
3243 		 * had it bound with bind(2) and receive connections from other
3244 		 * sockets.  Because soshutdown() violates POSIX (see comment
3245 		 * there) we will end up here shutting down our receive side.
3246 		 * Of course this will have affect not only on the peer we
3247 		 * connect(2)ed to, but also on all of the peers who had
3248 		 * connect(2)ed to us.  Their sends would end up with ENOBUFS.
3249 		 */
3250 		sb->sb_state |= SBS_CANTRCVMORE;
3251 		break;
3252 	case SOCK_STREAM:
3253 	case SOCK_SEQPACKET:
3254 		sb = &so->so_rcv;
3255 		m = sbcut_locked(sb, sb->sb_ccc);
3256 		KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
3257 		    ("%s: ccc %u mb %p mbcnt %u", __func__,
3258 		    sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
3259 		sbrelease_locked(so, SO_RCV);
3260 		break;
3261 	}
3262 	SOCK_RECVBUF_UNLOCK(so);
3263 	if (SOCK_IO_RECV_OWNED(so))
3264 		SOCK_IO_RECV_UNLOCK(so);
3265 
3266 	if (m != NULL) {
3267 		unp_scan(m, unp_freerights);
3268 		m_freem(m);
3269 	}
3270 }
3271 
3272 static void
3273 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
3274 {
3275 	struct mbuf *m;
3276 	struct cmsghdr *cm;
3277 	void *data;
3278 	socklen_t clen, datalen;
3279 
3280 	while (m0 != NULL) {
3281 		for (m = m0; m; m = m->m_next) {
3282 			if (m->m_type != MT_CONTROL)
3283 				continue;
3284 
3285 			cm = mtod(m, struct cmsghdr *);
3286 			clen = m->m_len;
3287 
3288 			while (cm != NULL) {
3289 				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
3290 					break;
3291 
3292 				data = CMSG_DATA(cm);
3293 				datalen = (caddr_t)cm + cm->cmsg_len
3294 				    - (caddr_t)data;
3295 
3296 				if (cm->cmsg_level == SOL_SOCKET &&
3297 				    cm->cmsg_type == SCM_RIGHTS) {
3298 					(*op)(data, datalen /
3299 					    sizeof(struct filedescent *));
3300 				}
3301 
3302 				if (CMSG_SPACE(datalen) < clen) {
3303 					clen -= CMSG_SPACE(datalen);
3304 					cm = (struct cmsghdr *)
3305 					    ((caddr_t)cm + CMSG_SPACE(datalen));
3306 				} else {
3307 					clen = 0;
3308 					cm = NULL;
3309 				}
3310 			}
3311 		}
3312 		m0 = m0->m_nextpkt;
3313 	}
3314 }
3315 
3316 /*
3317  * Definitions of protocols supported in the LOCAL domain.
3318  */
3319 static struct protosw streamproto = {
3320 	.pr_type =		SOCK_STREAM,
3321 	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS|
3322 				    PR_CAPATTACH,
3323 	.pr_ctloutput =		&uipc_ctloutput,
3324 	.pr_abort = 		uipc_abort,
3325 	.pr_accept =		uipc_accept,
3326 	.pr_attach =		uipc_attach,
3327 	.pr_bind =		uipc_bind,
3328 	.pr_bindat =		uipc_bindat,
3329 	.pr_connect =		uipc_connect,
3330 	.pr_connectat =		uipc_connectat,
3331 	.pr_connect2 =		uipc_connect2,
3332 	.pr_detach =		uipc_detach,
3333 	.pr_disconnect =	uipc_disconnect,
3334 	.pr_listen =		uipc_listen,
3335 	.pr_peeraddr =		uipc_peeraddr,
3336 	.pr_rcvd =		uipc_rcvd,
3337 	.pr_send =		uipc_send,
3338 	.pr_ready =		uipc_ready,
3339 	.pr_sense =		uipc_sense,
3340 	.pr_shutdown =		uipc_shutdown,
3341 	.pr_sockaddr =		uipc_sockaddr,
3342 	.pr_soreceive =		soreceive_generic,
3343 	.pr_close =		uipc_close,
3344 };
3345 
3346 static struct protosw dgramproto = {
3347 	.pr_type =		SOCK_DGRAM,
3348 	.pr_flags =		PR_ATOMIC | PR_ADDR |PR_RIGHTS | PR_CAPATTACH |
3349 				    PR_SOCKBUF,
3350 	.pr_ctloutput =		&uipc_ctloutput,
3351 	.pr_abort = 		uipc_abort,
3352 	.pr_accept =		uipc_accept,
3353 	.pr_attach =		uipc_attach,
3354 	.pr_bind =		uipc_bind,
3355 	.pr_bindat =		uipc_bindat,
3356 	.pr_connect =		uipc_connect,
3357 	.pr_connectat =		uipc_connectat,
3358 	.pr_connect2 =		uipc_connect2,
3359 	.pr_detach =		uipc_detach,
3360 	.pr_disconnect =	uipc_disconnect,
3361 	.pr_peeraddr =		uipc_peeraddr,
3362 	.pr_sosend =		uipc_sosend_dgram,
3363 	.pr_sense =		uipc_sense,
3364 	.pr_shutdown =		uipc_shutdown,
3365 	.pr_sockaddr =		uipc_sockaddr,
3366 	.pr_soreceive =		uipc_soreceive_dgram,
3367 	.pr_close =		uipc_close,
3368 };
3369 
3370 static struct protosw seqpacketproto = {
3371 	.pr_type =		SOCK_SEQPACKET,
3372 	/*
3373 	 * XXXRW: For now, PR_ADDR because soreceive will bump into them
3374 	 * due to our use of sbappendaddr.  A new sbappend variants is needed
3375 	 * that supports both atomic record writes and control data.
3376 	 */
3377 	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|
3378 				    PR_WANTRCVD|PR_RIGHTS|PR_CAPATTACH,
3379 	.pr_ctloutput =		&uipc_ctloutput,
3380 	.pr_abort =		uipc_abort,
3381 	.pr_accept =		uipc_accept,
3382 	.pr_attach =		uipc_attach,
3383 	.pr_bind =		uipc_bind,
3384 	.pr_bindat =		uipc_bindat,
3385 	.pr_connect =		uipc_connect,
3386 	.pr_connectat =		uipc_connectat,
3387 	.pr_connect2 =		uipc_connect2,
3388 	.pr_detach =		uipc_detach,
3389 	.pr_disconnect =	uipc_disconnect,
3390 	.pr_listen =		uipc_listen,
3391 	.pr_peeraddr =		uipc_peeraddr,
3392 	.pr_rcvd =		uipc_rcvd,
3393 	.pr_send =		uipc_send,
3394 	.pr_sense =		uipc_sense,
3395 	.pr_shutdown =		uipc_shutdown,
3396 	.pr_sockaddr =		uipc_sockaddr,
3397 	.pr_soreceive =		soreceive_generic,	/* XXX: or...? */
3398 	.pr_close =		uipc_close,
3399 };
3400 
3401 static struct domain localdomain = {
3402 	.dom_family =		AF_LOCAL,
3403 	.dom_name =		"local",
3404 	.dom_externalize =	unp_externalize,
3405 	.dom_dispose =		unp_dispose,
3406 	.dom_nprotosw =		3,
3407 	.dom_protosw =		{
3408 		&streamproto,
3409 		&dgramproto,
3410 		&seqpacketproto,
3411 	}
3412 };
3413 DOMAIN_SET(local);
3414 
3415 /*
3416  * A helper function called by VFS before socket-type vnode reclamation.
3417  * For an active vnode it clears unp_vnode pointer and decrements unp_vnode
3418  * use count.
3419  */
3420 void
3421 vfs_unp_reclaim(struct vnode *vp)
3422 {
3423 	struct unpcb *unp;
3424 	int active;
3425 	struct mtx *vplock;
3426 
3427 	ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
3428 	KASSERT(vp->v_type == VSOCK,
3429 	    ("vfs_unp_reclaim: vp->v_type != VSOCK"));
3430 
3431 	active = 0;
3432 	vplock = mtx_pool_find(mtxpool_sleep, vp);
3433 	mtx_lock(vplock);
3434 	VOP_UNP_CONNECT(vp, &unp);
3435 	if (unp == NULL)
3436 		goto done;
3437 	UNP_PCB_LOCK(unp);
3438 	if (unp->unp_vnode == vp) {
3439 		VOP_UNP_DETACH(vp);
3440 		unp->unp_vnode = NULL;
3441 		active = 1;
3442 	}
3443 	UNP_PCB_UNLOCK(unp);
3444  done:
3445 	mtx_unlock(vplock);
3446 	if (active)
3447 		vunref(vp);
3448 }
3449 
3450 #ifdef DDB
/*
 * Emit `indent' spaces on the debugger console; a zero or negative count
 * prints nothing.
 */
static void
db_print_indent(int indent)
{

	for (; indent > 0; indent--)
		db_printf(" ");
}
3459 
3460 static void
3461 db_print_unpflags(int unp_flags)
3462 {
3463 	int comma;
3464 
3465 	comma = 0;
3466 	if (unp_flags & UNP_HAVEPC) {
3467 		db_printf("%sUNP_HAVEPC", comma ? ", " : "");
3468 		comma = 1;
3469 	}
3470 	if (unp_flags & UNP_WANTCRED_ALWAYS) {
3471 		db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : "");
3472 		comma = 1;
3473 	}
3474 	if (unp_flags & UNP_WANTCRED_ONESHOT) {
3475 		db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : "");
3476 		comma = 1;
3477 	}
3478 	if (unp_flags & UNP_CONNWAIT) {
3479 		db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
3480 		comma = 1;
3481 	}
3482 	if (unp_flags & UNP_CONNECTING) {
3483 		db_printf("%sUNP_CONNECTING", comma ? ", " : "");
3484 		comma = 1;
3485 	}
3486 	if (unp_flags & UNP_BINDING) {
3487 		db_printf("%sUNP_BINDING", comma ? ", " : "");
3488 		comma = 1;
3489 	}
3490 }
3491 
3492 static void
3493 db_print_xucred(int indent, struct xucred *xu)
3494 {
3495 	int comma, i;
3496 
3497 	db_print_indent(indent);
3498 	db_printf("cr_version: %u   cr_uid: %u   cr_pid: %d   cr_ngroups: %d\n",
3499 	    xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups);
3500 	db_print_indent(indent);
3501 	db_printf("cr_groups: ");
3502 	comma = 0;
3503 	for (i = 0; i < xu->cr_ngroups; i++) {
3504 		db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
3505 		comma = 1;
3506 	}
3507 	db_printf("\n");
3508 }
3509 
3510 static void
3511 db_print_unprefs(int indent, struct unp_head *uh)
3512 {
3513 	struct unpcb *unp;
3514 	int counter;
3515 
3516 	counter = 0;
3517 	LIST_FOREACH(unp, uh, unp_reflink) {
3518 		if (counter % 4 == 0)
3519 			db_print_indent(indent);
3520 		db_printf("%p  ", unp);
3521 		if (counter % 4 == 3)
3522 			db_printf("\n");
3523 		counter++;
3524 	}
3525 	if (counter != 0 && counter % 4 != 0)
3526 		db_printf("\n");
3527 }
3528 
3529 DB_SHOW_COMMAND(unpcb, db_show_unpcb)
3530 {
3531 	struct unpcb *unp;
3532 
3533         if (!have_addr) {
3534                 db_printf("usage: show unpcb <addr>\n");
3535                 return;
3536         }
3537         unp = (struct unpcb *)addr;
3538 
3539 	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
3540 	    unp->unp_vnode);
3541 
3542 	db_printf("unp_ino: %ju   unp_conn: %p\n", (uintmax_t)unp->unp_ino,
3543 	    unp->unp_conn);
3544 
3545 	db_printf("unp_refs:\n");
3546 	db_print_unprefs(2, &unp->unp_refs);
3547 
3548 	/* XXXRW: Would be nice to print the full address, if any. */
3549 	db_printf("unp_addr: %p\n", unp->unp_addr);
3550 
3551 	db_printf("unp_gencnt: %llu\n",
3552 	    (unsigned long long)unp->unp_gencnt);
3553 
3554 	db_printf("unp_flags: %x (", unp->unp_flags);
3555 	db_print_unpflags(unp->unp_flags);
3556 	db_printf(")\n");
3557 
3558 	db_printf("unp_peercred:\n");
3559 	db_print_xucred(2, &unp->unp_peercred);
3560 
3561 	db_printf("unp_refcount: %u\n", unp->unp_refcount);
3562 }
3563 #endif
3564