1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1989, 1991, 1993
5 * The Regents of the University of California. All Rights Reserved.
6 * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
7 * Copyright (c) 2018 Matthew Macy
8 * Copyright (c) 2022-2025 Gleb Smirnoff <glebius@FreeBSD.org>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 /*
36 * UNIX Domain (Local) Sockets
37 *
38 * This is an implementation of UNIX (local) domain sockets. Each socket has
39 * an associated struct unpcb (UNIX protocol control block). Stream sockets
40 * may be connected to 0 or 1 other socket. Datagram sockets may be
41 * connected to 0, 1, or many other sockets. Sockets may be created and
42 * connected in pairs (socketpair(2)), or bound/connected to using the file
43 * system name space. For most purposes, only the receive socket buffer is
44 * used, as sending on one socket delivers directly to the receive socket
45 * buffer of a second socket.
46 *
47 * The implementation is substantially complicated by the fact that
48 * "ancillary data", such as file descriptors or credentials, may be passed
49 * across UNIX domain sockets. The potential for passing UNIX domain sockets
50 * over other UNIX domain sockets requires the implementation of a simple
51 * garbage collector to find and tear down cycles of disconnected sockets.
52 *
53 * TODO:
54 * RDM
55 * rethink name space problems
56 * need a proper out-of-band
57 */
58
59 #include "opt_ddb.h"
60
61 #include <sys/param.h>
62 #include <sys/capsicum.h>
63 #include <sys/domain.h>
64 #include <sys/eventhandler.h>
65 #include <sys/fcntl.h>
66 #include <sys/file.h>
67 #include <sys/filedesc.h>
68 #include <sys/jail.h>
69 #include <sys/kernel.h>
70 #include <sys/lock.h>
71 #include <sys/malloc.h>
72 #include <sys/mbuf.h>
73 #include <sys/mount.h>
74 #include <sys/mutex.h>
75 #include <sys/namei.h>
76 #include <sys/poll.h>
77 #include <sys/proc.h>
78 #include <sys/protosw.h>
79 #include <sys/queue.h>
80 #include <sys/resourcevar.h>
81 #include <sys/rwlock.h>
82 #include <sys/socket.h>
83 #include <sys/socketvar.h>
84 #include <sys/signalvar.h>
85 #include <sys/stat.h>
86 #include <sys/sx.h>
87 #include <sys/sysctl.h>
88 #include <sys/systm.h>
89 #include <sys/taskqueue.h>
90 #include <sys/un.h>
91 #include <sys/unpcb.h>
92 #include <sys/vnode.h>
93
94 #include <net/vnet.h>
95
96 #ifdef DDB
97 #include <ddb/ddb.h>
98 #endif
99
100 #include <security/mac/mac_framework.h>
101
102 #include <vm/uma.h>
103
104 MALLOC_DECLARE(M_FILECAPS);
105
106 static struct domain localdomain;
107
108 static uma_zone_t unp_zone;
109 static unp_gen_t unp_gencnt; /* (l) */
110 static u_int unp_count; /* (l) Count of local sockets. */
111 static ino_t unp_ino; /* Prototype for fake inode numbers. */
112 static int unp_rights; /* (g) File descriptors in flight. */
113 static struct unp_head unp_shead; /* (l) List of stream sockets. */
114 static struct unp_head unp_dhead; /* (l) List of datagram sockets. */
115 static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. */
116 static struct mtx_pool *unp_vp_mtxpool;
117
118 struct unp_defer {
119 SLIST_ENTRY(unp_defer) ud_link;
120 struct file *ud_fp;
121 };
122 static SLIST_HEAD(, unp_defer) unp_defers;
123 static int unp_defers_count;
124
125 static const struct sockaddr sun_noname = {
126 .sa_len = sizeof(sun_noname),
127 .sa_family = AF_LOCAL,
128 };
129
130 /*
131 * Garbage collection of cyclic file descriptor/socket references occurs
132 * asynchronously in a taskqueue context in order to avoid recursion and
133 * reentrance in the UNIX domain socket, file descriptor, and socket layer
134 * code. See unp_gc() for a full description.
135 */
136 static struct timeout_task unp_gc_task;
137
138 /*
139 * Closing UNIX domain sockets attached as SCM_RIGHTS is postponed
140 * to the taskqueue, to avoid arbitrary recursion depth. The attached
141 * sockets may themselves have other sockets attached.
142 */
143 static struct task unp_defer_task;
144
145 /*
146 * SOCK_STREAM and SOCK_SEQPACKET unix(4) sockets fully bypass the send
147 * buffer; however, the notion of a send buffer still makes sense with them.
148 * Its size is the amount of space that a send(2) syscall may copyin(9)
149 * before checking against the receive buffer of the peer. Although that
150 * data is not linked anywhere yet and is pointed to only by a stack
151 * variable, it is effectively a buffer that needs to be sized.
152 *
153 * SOCK_DGRAM sockets use the sendspace as the maximum datagram size and do
154 * not actually reserve the sendspace. Their recvspace should be large
155 * enough for at least one max-size datagram plus address.
156 */
157 #ifndef PIPSIZ
158 #define PIPSIZ 8192
159 #endif
160 static u_long unpst_sendspace = PIPSIZ;
161 static u_long unpst_recvspace = PIPSIZ;
162 static u_long unpdg_maxdgram = 8*1024; /* support 8KB syslog msgs */
163 static u_long unpdg_recvspace = 16*1024;
164 static u_long unpsp_sendspace = PIPSIZ;
165 static u_long unpsp_recvspace = PIPSIZ;
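/*
 * A minimal userland sketch (not compiled as part of this file) of the
 * unpdg_maxdgram limit above: a datagram larger than net.local.dgram.maxdgram
 * is rejected with EMSGSIZE.  The descriptor pair sv[] and the 9 KB size are
 * illustrative assumptions only.
 */
#if 0
#include <sys/socket.h>
#include <err.h>
#include <errno.h>
#include <string.h>

static void
maxdgram_example(void)
{
	char big[9 * 1024];		/* larger than the 8 KB default */
	int sv[2];

	memset(big, 0, sizeof(big));
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) == -1)
		err(1, "socketpair");
	if (send(sv[0], big, sizeof(big), 0) == -1 && errno == EMSGSIZE)
		warnx("datagram exceeds net.local.dgram.maxdgram");
}
#endif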
166
167 static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
168 "Local domain");
169 static SYSCTL_NODE(_net_local, SOCK_STREAM, stream,
170 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
171 "SOCK_STREAM");
172 static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram,
173 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
174 "SOCK_DGRAM");
175 static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket,
176 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
177 "SOCK_SEQPACKET");
178
179 SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
180 &unpst_sendspace, 0, "Default stream send space.");
181 SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
182 &unpst_recvspace, 0, "Default stream receive space.");
183 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
184 &unpdg_maxdgram, 0, "Maximum datagram size.");
185 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
186 &unpdg_recvspace, 0, "Default datagram receive space.");
187 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
188 &unpsp_sendspace, 0, "Default seqpacket send space.");
189 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
190 &unpsp_recvspace, 0, "Default seqpacket receive space.");
191 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
192 "File descriptors in flight.");
193 SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
194 &unp_defers_count, 0,
195 "File descriptors deferred to taskqueue for close.");
196
197 /*
198 * Locking and synchronization:
199 *
200 * Several types of locks exist in the local domain socket implementation:
201 * - a global linkage lock
202 * - a global connection list lock
203 * - the mtxpool lock
204 * - per-unpcb mutexes
205 *
206 * The linkage lock protects the global socket lists, the generation number
207 * counter and garbage collector state.
208 *
209 * The connection list lock protects the list of referring sockets in a datagram
210 * socket PCB. This lock is also overloaded to protect a global list of
211 * sockets whose buffers contain socket references in the form of SCM_RIGHTS
212 * messages. To avoid recursion, such references are released by a dedicated
213 * thread.
214 *
215 * The mtxpool lock protects the vnode from being modified while referenced.
216 * Lock ordering rules require that it be acquired before any PCB locks.
217 *
218 * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the
219 * unpcb. This includes the unp_conn field, which either links two connected
220 * PCBs together (for connected socket types) or points at the destination
221 * socket (for connectionless socket types). The operations of creating or
222 * destroying a connection therefore involve locking multiple PCBs. To avoid
223 * lock order reversals, in some cases this involves dropping a PCB lock and
224 * using a reference counter to maintain liveness.
225 *
226 * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
227 * allocated in pr_attach() and freed in pr_detach(). The validity of that
228 * pointer is an invariant, so no lock is required to dereference the so_pcb
229 * pointer if a valid socket reference is held by the caller. In practice,
230 * this is always true during operations performed on a socket. Each unpcb
231 * has a back-pointer to its socket, unp_socket, which will be stable under
232 * the same circumstances.
233 *
234 * This pointer may only be safely dereferenced as long as a valid reference
235 * to the unpcb is held. Typically, this reference will be from the socket,
236 * or from another unpcb when the referring unpcb's lock is held (in order
237 * that the reference not be invalidated during use). For example, to follow
238 * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
239 * that detach is not run clearing unp_socket.
240 *
241 * Blocking with UNIX domain sockets is a tricky issue: unlike most network
242 * protocols, bind() is a non-atomic operation, and connect() may sleep in
243 * the protocol because it can wait on local or distributed file systems.
244 * We try to separate "lookup" operations, which may sleep, and the IPC
245 * operations themselves, which typically can occur with relative atomicity
246 * as locks can be held over the entire operation.
247 *
248 * Another tricky issue is simultaneous multi-threaded or multi-process
249 * access to a single UNIX domain socket. These are handled by the flags
250 * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
251 * binding, both of which involve dropping UNIX domain socket locks in order
252 * to perform namei() and other file system operations.
253 */
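/*
 * A minimal sketch (not compiled) of the rule described above for following
 * unp->unp_conn->unp_socket: the peer's lock must be held so that a
 * concurrent detach cannot clear unp_socket underneath us.  'unp' is assumed
 * to be a valid, referenced unpcb; all names here are illustrative only.
 */
#if 0
	struct unpcb *unp2;
	struct socket *so2;

	UNP_PCB_LOCK(unp);
	unp2 = unp_pcb_lock_peer(unp);	/* also locks unp_conn, if any */
	if (unp2 != NULL) {
		so2 = unp2->unp_socket;	/* stable while unp2 stays locked */
		/* ... use so2 ... */
		unp_pcb_unlock_pair(unp, unp2);
	} else
		UNP_PCB_UNLOCK(unp);
#endif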
254 static struct rwlock unp_link_rwlock;
255 static struct mtx unp_defers_lock;
256
257 #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \
258 "unp_link_rwlock")
259
260 #define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \
261 RA_LOCKED)
262 #define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
263 RA_UNLOCKED)
264
265 #define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock)
266 #define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock)
267 #define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock)
268 #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock)
269 #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
270 RA_WLOCKED)
271 #define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock)
272
273 #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \
274 "unp_defer", NULL, MTX_DEF)
275 #define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock)
276 #define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock)
277
278 #define UNP_REF_LIST_LOCK() UNP_DEFERRED_LOCK();
279 #define UNP_REF_LIST_UNLOCK() UNP_DEFERRED_UNLOCK();
280
281 #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \
282 "unp", "unp", \
283 MTX_DUPOK|MTX_DEF)
284 #define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx)
285 #define UNP_PCB_LOCKPTR(unp) (&(unp)->unp_mtx)
286 #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx)
287 #define UNP_PCB_TRYLOCK(unp) mtx_trylock(&(unp)->unp_mtx)
288 #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx)
289 #define UNP_PCB_OWNED(unp) mtx_owned(&(unp)->unp_mtx)
290 #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED)
291 #define UNP_PCB_UNLOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)
292
293 static int uipc_connect2(struct socket *, struct socket *);
294 static int uipc_ctloutput(struct socket *, struct sockopt *);
295 static int unp_connect(struct socket *, struct sockaddr *,
296 struct thread *);
297 static int unp_connectat(int, struct socket *, struct sockaddr *,
298 struct thread *, bool);
299 static void unp_connect2(struct socket *, struct socket *, bool);
300 static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
301 static void unp_dispose(struct socket *so);
302 static void unp_drop(struct unpcb *);
303 static void unp_gc(__unused void *, int);
304 static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
305 static void unp_discard(struct file *);
306 static void unp_freerights(struct filedescent **, int);
307 static int unp_internalize(struct mbuf *, struct mchain *,
308 struct thread *);
309 static void unp_internalize_fp(struct file *);
310 static int unp_externalize(struct mbuf *, struct mbuf **, int);
311 static int unp_externalize_fp(struct file *);
312 static void unp_addsockcred(struct thread *, struct mchain *, int);
313 static void unp_process_defers(void * __unused, int);
314
315 static void uipc_wrknl_lock(void *);
316 static void uipc_wrknl_unlock(void *);
317 static void uipc_wrknl_assert_lock(void *, int);
318
319 static void
320 unp_pcb_hold(struct unpcb *unp)
321 {
322 u_int old __unused;
323
324 old = refcount_acquire(&unp->unp_refcount);
325 KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp));
326 }
327
328 static __result_use_check bool
329 unp_pcb_rele(struct unpcb *unp)
330 {
331 bool ret;
332
333 UNP_PCB_LOCK_ASSERT(unp);
334
335 if ((ret = refcount_release(&unp->unp_refcount))) {
336 UNP_PCB_UNLOCK(unp);
337 UNP_PCB_LOCK_DESTROY(unp);
338 uma_zfree(unp_zone, unp);
339 }
340 return (ret);
341 }
342
343 static void
344 unp_pcb_rele_notlast(struct unpcb *unp)
345 {
346 bool ret __unused;
347
348 ret = refcount_release(&unp->unp_refcount);
349 KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp));
350 }
351
352 static void
353 unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2)
354 {
355 UNP_PCB_UNLOCK_ASSERT(unp);
356 UNP_PCB_UNLOCK_ASSERT(unp2);
357
358 if (unp == unp2) {
359 UNP_PCB_LOCK(unp);
360 } else if ((uintptr_t)unp2 > (uintptr_t)unp) {
361 UNP_PCB_LOCK(unp);
362 UNP_PCB_LOCK(unp2);
363 } else {
364 UNP_PCB_LOCK(unp2);
365 UNP_PCB_LOCK(unp);
366 }
367 }
368
369 static void
370 unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2)
371 {
372 UNP_PCB_UNLOCK(unp);
373 if (unp != unp2)
374 UNP_PCB_UNLOCK(unp2);
375 }
376
377 /*
378 * Try to lock the connected peer of an already locked socket. In some cases
379 * this requires that we unlock the current socket. The pairbusy counter is
380 * used to block concurrent connection attempts while the lock is dropped. The
381 * caller must be careful to revalidate PCB state.
382 */
383 static struct unpcb *
384 unp_pcb_lock_peer(struct unpcb *unp)
385 {
386 struct unpcb *unp2;
387
388 UNP_PCB_LOCK_ASSERT(unp);
389 unp2 = unp->unp_conn;
390 if (unp2 == NULL)
391 return (NULL);
392 if (__predict_false(unp == unp2))
393 return (unp);
394
395 UNP_PCB_UNLOCK_ASSERT(unp2);
396
397 if (__predict_true(UNP_PCB_TRYLOCK(unp2)))
398 return (unp2);
399 if ((uintptr_t)unp2 > (uintptr_t)unp) {
400 UNP_PCB_LOCK(unp2);
401 return (unp2);
402 }
403 unp->unp_pairbusy++;
404 unp_pcb_hold(unp2);
405 UNP_PCB_UNLOCK(unp);
406
407 UNP_PCB_LOCK(unp2);
408 UNP_PCB_LOCK(unp);
409 KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL,
410 ("%s: socket %p was reconnected", __func__, unp));
411 if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) {
412 unp->unp_flags &= ~UNP_WAITING;
413 wakeup(unp);
414 }
415 if (unp_pcb_rele(unp2)) {
416 /* unp2 is unlocked. */
417 return (NULL);
418 }
419 if (unp->unp_conn == NULL) {
420 UNP_PCB_UNLOCK(unp2);
421 return (NULL);
422 }
423 return (unp2);
424 }
425
426 /*
427 * Try to lock peer of our socket for purposes of sending data to it.
428 */
429 static int
430 uipc_lock_peer(struct socket *so, struct unpcb **unp2)
431 {
432 struct unpcb *unp;
433 int error;
434
435 unp = sotounpcb(so);
436 UNP_PCB_LOCK(unp);
437 *unp2 = unp_pcb_lock_peer(unp);
438 if (__predict_false(so->so_error != 0)) {
439 error = so->so_error;
440 so->so_error = 0;
441 UNP_PCB_UNLOCK(unp);
442 if (*unp2 != NULL)
443 UNP_PCB_UNLOCK(*unp2);
444 return (error);
445 }
446 if (__predict_false(*unp2 == NULL)) {
447 /*
448 * Return a different error code for a previously connected socket
449 * and a never-connected one. SS_ISDISCONNECTED is set in
450 * unp_soisdisconnected() and is synchronized by the pcb lock.
451 */
452 error = so->so_state & SS_ISDISCONNECTED ? EPIPE : ENOTCONN;
453 UNP_PCB_UNLOCK(unp);
454 return (error);
455 }
456 UNP_PCB_UNLOCK(unp);
457
458 return (0);
459 }
460
461 static void
462 uipc_abort(struct socket *so)
463 {
464 struct unpcb *unp, *unp2;
465
466 unp = sotounpcb(so);
467 KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
468 UNP_PCB_UNLOCK_ASSERT(unp);
469
470 UNP_PCB_LOCK(unp);
471 unp2 = unp->unp_conn;
472 if (unp2 != NULL) {
473 unp_pcb_hold(unp2);
474 UNP_PCB_UNLOCK(unp);
475 unp_drop(unp2);
476 } else
477 UNP_PCB_UNLOCK(unp);
478 }
479
480 static int
481 uipc_attach(struct socket *so, int proto, struct thread *td)
482 {
483 u_long sendspace, recvspace;
484 struct unpcb *unp;
485 int error;
486 bool locked;
487
488 KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
489 switch (so->so_type) {
490 case SOCK_DGRAM:
491 STAILQ_INIT(&so->so_rcv.uxdg_mb);
492 STAILQ_INIT(&so->so_snd.uxdg_mb);
493 TAILQ_INIT(&so->so_rcv.uxdg_conns);
494 /*
495 * Since the send buffer is either bypassed or is part
496 * of the one-to-many receive buffer, we assign both
497 * space limits to unpdg_recvspace.
498 */
499 sendspace = recvspace = unpdg_recvspace;
500 break;
501
502 case SOCK_STREAM:
503 sendspace = unpst_sendspace;
504 recvspace = unpst_recvspace;
505 goto common;
506
507 case SOCK_SEQPACKET:
508 sendspace = unpsp_sendspace;
509 recvspace = unpsp_recvspace;
510 common:
511 /*
512 * XXXGL: we need to initialize the mutex with MTX_DUPOK.
513 * Ideally, protocols that have PR_SOCKBUF should be
514 * officially responsible for mutex initialization, and then
515 * this ugliness with mtx_destroy(); mtx_init(); would go away.
516 */
517 mtx_destroy(&so->so_rcv_mtx);
518 mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF | MTX_DUPOK);
519 knlist_init(&so->so_wrsel.si_note, so, uipc_wrknl_lock,
520 uipc_wrknl_unlock, uipc_wrknl_assert_lock);
521 STAILQ_INIT(&so->so_rcv.uxst_mbq);
522 break;
523 default:
524 panic("uipc_attach");
525 }
526 error = soreserve(so, sendspace, recvspace);
527 if (error)
528 return (error);
529 unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
530 if (unp == NULL)
531 return (ENOBUFS);
532 LIST_INIT(&unp->unp_refs);
533 UNP_PCB_LOCK_INIT(unp);
534 unp->unp_socket = so;
535 so->so_pcb = unp;
536 refcount_init(&unp->unp_refcount, 1);
537 unp->unp_mode = ACCESSPERMS;
538
539 if ((locked = UNP_LINK_WOWNED()) == false)
540 UNP_LINK_WLOCK();
541
542 unp->unp_gencnt = ++unp_gencnt;
543 unp->unp_ino = ++unp_ino;
544 unp_count++;
545 switch (so->so_type) {
546 case SOCK_STREAM:
547 LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
548 break;
549
550 case SOCK_DGRAM:
551 LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
552 break;
553
554 case SOCK_SEQPACKET:
555 LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
556 break;
557
558 default:
559 panic("uipc_attach");
560 }
561
562 if (locked == false)
563 UNP_LINK_WUNLOCK();
564
565 return (0);
566 }
567
568 static int
569 uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
570 {
571 struct sockaddr_un *soun = (struct sockaddr_un *)nam;
572 struct vattr vattr;
573 int error, namelen;
574 struct nameidata nd;
575 struct unpcb *unp;
576 struct vnode *vp;
577 struct mount *mp;
578 cap_rights_t rights;
579 char *buf;
580 mode_t mode;
581
582 if (nam->sa_family != AF_UNIX)
583 return (EAFNOSUPPORT);
584
585 unp = sotounpcb(so);
586 KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
587
588 if (soun->sun_len > sizeof(struct sockaddr_un))
589 return (EINVAL);
590 namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
591 if (namelen <= 0)
592 return (EINVAL);
593
594 /*
595 * We don't allow simultaneous bind() calls on a single UNIX domain
596 * socket, so flag in-progress operations, and return an error if an
597 * operation is already in progress.
598 *
599 * Historically, we have not allowed a socket to be rebound, so this
600 * also returns an error. Not allowing re-binding simplifies the
601 * implementation and avoids a great many possible failure modes.
602 */
603 UNP_PCB_LOCK(unp);
604 if (unp->unp_vnode != NULL) {
605 UNP_PCB_UNLOCK(unp);
606 return (EINVAL);
607 }
608 if (unp->unp_flags & UNP_BINDING) {
609 UNP_PCB_UNLOCK(unp);
610 return (EALREADY);
611 }
612 unp->unp_flags |= UNP_BINDING;
613 mode = unp->unp_mode & ~td->td_proc->p_pd->pd_cmask;
614 UNP_PCB_UNLOCK(unp);
615
616 buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
617 bcopy(soun->sun_path, buf, namelen);
618 buf[namelen] = 0;
619
620 restart:
621 NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | NOCACHE,
622 UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT));
623 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
624 error = namei(&nd);
625 if (error)
626 goto error;
627 vp = nd.ni_vp;
628 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
629 NDFREE_PNBUF(&nd);
630 if (nd.ni_dvp == vp)
631 vrele(nd.ni_dvp);
632 else
633 vput(nd.ni_dvp);
634 if (vp != NULL) {
635 vrele(vp);
636 error = EADDRINUSE;
637 goto error;
638 }
639 error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH);
640 if (error)
641 goto error;
642 goto restart;
643 }
644 VATTR_NULL(&vattr);
645 vattr.va_type = VSOCK;
646 vattr.va_mode = mode;
647 #ifdef MAC
648 error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
649 &vattr);
650 #endif
651 if (error == 0) {
652 /*
653 * The prior lookup may have left LK_SHARED in cn_lkflags,
654 * and VOP_CREATE technically only requires the new vnode to
655 * be locked shared. Most filesystems will return the new vnode
656 * locked exclusive regardless, but we should explicitly
657 * specify that here since we require it and assert to that
658 * effect below.
659 */
660 nd.ni_cnd.cn_lkflags = (nd.ni_cnd.cn_lkflags & ~LK_SHARED) |
661 LK_EXCLUSIVE;
662 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
663 }
664 NDFREE_PNBUF(&nd);
665 if (error) {
666 VOP_VPUT_PAIR(nd.ni_dvp, NULL, true);
667 vn_finished_write(mp);
668 if (error == ERELOOKUP)
669 goto restart;
670 goto error;
671 }
672 vp = nd.ni_vp;
673 ASSERT_VOP_ELOCKED(vp, "uipc_bind");
674 soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
675
676 UNP_PCB_LOCK(unp);
677 VOP_UNP_BIND(vp, unp);
678 unp->unp_vnode = vp;
679 unp->unp_addr = soun;
680 unp->unp_flags &= ~UNP_BINDING;
681 UNP_PCB_UNLOCK(unp);
682 vref(vp);
683 VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
684 vn_finished_write(mp);
685 free(buf, M_TEMP);
686 return (0);
687
688 error:
689 UNP_PCB_LOCK(unp);
690 unp->unp_flags &= ~UNP_BINDING;
691 UNP_PCB_UNLOCK(unp);
692 free(buf, M_TEMP);
693 return (error);
694 }
695
696 static int
697 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
698 {
699
700 return (uipc_bindat(AT_FDCWD, so, nam, td));
701 }
702
703 static int
704 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
705 {
706 int error;
707
708 KASSERT(td == curthread, ("uipc_connect: td != curthread"));
709 error = unp_connect(so, nam, td);
710 return (error);
711 }
712
713 static int
714 uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
715 struct thread *td)
716 {
717 int error;
718
719 KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
720 error = unp_connectat(fd, so, nam, td, false);
721 return (error);
722 }
723
724 static void
725 uipc_close(struct socket *so)
726 {
727 struct unpcb *unp, *unp2;
728 struct vnode *vp = NULL;
729 struct mtx *vplock;
730
731 unp = sotounpcb(so);
732 KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
733
734 vplock = NULL;
735 if ((vp = unp->unp_vnode) != NULL) {
736 vplock = mtx_pool_find(unp_vp_mtxpool, vp);
737 mtx_lock(vplock);
738 }
739 UNP_PCB_LOCK(unp);
740 if (vp && unp->unp_vnode == NULL) {
741 mtx_unlock(vplock);
742 vp = NULL;
743 }
744 if (vp != NULL) {
745 VOP_UNP_DETACH(vp);
746 unp->unp_vnode = NULL;
747 }
748 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
749 unp_disconnect(unp, unp2);
750 else
751 UNP_PCB_UNLOCK(unp);
752 if (vp) {
753 mtx_unlock(vplock);
754 vrele(vp);
755 }
756 }
757
758 static int
759 uipc_chmod(struct socket *so, mode_t mode, struct ucred *cred __unused,
760 struct thread *td __unused)
761 {
762 struct unpcb *unp;
763 int error;
764
765 if ((mode & ~ACCESSPERMS) != 0)
766 return (EINVAL);
767
768 error = 0;
769 unp = sotounpcb(so);
770 UNP_PCB_LOCK(unp);
771 if (unp->unp_vnode != NULL || (unp->unp_flags & UNP_BINDING) != 0)
772 error = EINVAL;
773 else
774 unp->unp_mode = mode;
775 UNP_PCB_UNLOCK(unp);
776 return (error);
777 }
778
779 static int
780 uipc_connect2(struct socket *so1, struct socket *so2)
781 {
782 struct unpcb *unp, *unp2;
783
784 if (so1->so_type != so2->so_type)
785 return (EPROTOTYPE);
786
787 unp = so1->so_pcb;
788 KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
789 unp2 = so2->so_pcb;
790 KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
791 unp_pcb_lock_pair(unp, unp2);
792 unp_connect2(so1, so2, false);
793 unp_pcb_unlock_pair(unp, unp2);
794
795 return (0);
796 }
797
798 static void
799 uipc_detach(struct socket *so)
800 {
801 struct unpcb *unp, *unp2;
802 struct mtx *vplock;
803 struct vnode *vp;
804 int local_unp_rights;
805
806 unp = sotounpcb(so);
807 KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
808
809 vp = NULL;
810 vplock = NULL;
811
812 if (!SOLISTENING(so))
813 unp_dispose(so);
814
815 UNP_LINK_WLOCK();
816 LIST_REMOVE(unp, unp_link);
817 if (unp->unp_gcflag & UNPGC_DEAD)
818 LIST_REMOVE(unp, unp_dead);
819 unp->unp_gencnt = ++unp_gencnt;
820 --unp_count;
821 UNP_LINK_WUNLOCK();
822
823 UNP_PCB_UNLOCK_ASSERT(unp);
824 restart:
825 if ((vp = unp->unp_vnode) != NULL) {
826 vplock = mtx_pool_find(unp_vp_mtxpool, vp);
827 mtx_lock(vplock);
828 }
829 UNP_PCB_LOCK(unp);
830 if (unp->unp_vnode != vp && unp->unp_vnode != NULL) {
831 if (vplock)
832 mtx_unlock(vplock);
833 UNP_PCB_UNLOCK(unp);
834 goto restart;
835 }
836 if ((vp = unp->unp_vnode) != NULL) {
837 VOP_UNP_DETACH(vp);
838 unp->unp_vnode = NULL;
839 }
840 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
841 unp_disconnect(unp, unp2);
842 else
843 UNP_PCB_UNLOCK(unp);
844
845 UNP_REF_LIST_LOCK();
846 while (!LIST_EMPTY(&unp->unp_refs)) {
847 struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
848
849 unp_pcb_hold(ref);
850 UNP_REF_LIST_UNLOCK();
851
852 MPASS(ref != unp);
853 UNP_PCB_UNLOCK_ASSERT(ref);
854 unp_drop(ref);
855 UNP_REF_LIST_LOCK();
856 }
857 UNP_REF_LIST_UNLOCK();
858
859 UNP_PCB_LOCK(unp);
860 local_unp_rights = unp_rights;
861 unp->unp_socket->so_pcb = NULL;
862 unp->unp_socket = NULL;
863 free(unp->unp_addr, M_SONAME);
864 unp->unp_addr = NULL;
865 if (!unp_pcb_rele(unp))
866 UNP_PCB_UNLOCK(unp);
867 if (vp) {
868 mtx_unlock(vplock);
869 vrele(vp);
870 }
871 if (local_unp_rights)
872 taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
873
874 switch (so->so_type) {
875 case SOCK_STREAM:
876 case SOCK_SEQPACKET:
877 MPASS(SOLISTENING(so) || (STAILQ_EMPTY(&so->so_rcv.uxst_mbq) &&
878 so->so_rcv.uxst_peer == NULL));
879 break;
880 case SOCK_DGRAM:
881 /*
882 * Everything should have been unlinked/freed by unp_dispose()
883 * and/or unp_disconnect().
884 */
885 MPASS(so->so_rcv.uxdg_peeked == NULL);
886 MPASS(STAILQ_EMPTY(&so->so_rcv.uxdg_mb));
887 MPASS(TAILQ_EMPTY(&so->so_rcv.uxdg_conns));
888 MPASS(STAILQ_EMPTY(&so->so_snd.uxdg_mb));
889 }
890 }
891
892 static int
893 uipc_disconnect(struct socket *so)
894 {
895 struct unpcb *unp, *unp2;
896
897 unp = sotounpcb(so);
898 KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
899
900 UNP_PCB_LOCK(unp);
901 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
902 unp_disconnect(unp, unp2);
903 else
904 UNP_PCB_UNLOCK(unp);
905 return (0);
906 }
907
908 static int
909 uipc_listen(struct socket *so, int backlog, struct thread *td)
910 {
911 struct unpcb *unp;
912 int error;
913
914 MPASS(so->so_type != SOCK_DGRAM);
915
916 /*
917 * Synchronize with concurrent connection attempts.
918 */
919 error = 0;
920 unp = sotounpcb(so);
921 UNP_PCB_LOCK(unp);
922 if (unp->unp_conn != NULL || (unp->unp_flags & UNP_CONNECTING) != 0)
923 error = EINVAL;
924 else if (unp->unp_vnode == NULL)
925 error = EDESTADDRREQ;
926 if (error != 0) {
927 UNP_PCB_UNLOCK(unp);
928 return (error);
929 }
930
931 SOCK_LOCK(so);
932 error = solisten_proto_check(so);
933 if (error == 0) {
934 cru2xt(td, &unp->unp_peercred);
935 if (!SOLISTENING(so)) {
936 (void)chgsbsize(so->so_cred->cr_uidinfo,
937 &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
938 (void)chgsbsize(so->so_cred->cr_uidinfo,
939 &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
940 }
941 solisten_proto(so, backlog);
942 }
943 SOCK_UNLOCK(so);
944 UNP_PCB_UNLOCK(unp);
945 return (error);
946 }
947
948 static int
949 uipc_peeraddr(struct socket *so, struct sockaddr *ret)
950 {
951 struct unpcb *unp, *unp2;
952 const struct sockaddr *sa;
953
954 unp = sotounpcb(so);
955 KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
956
957 UNP_PCB_LOCK(unp);
958 unp2 = unp_pcb_lock_peer(unp);
959 if (unp2 != NULL) {
960 if (unp2->unp_addr != NULL)
961 sa = (struct sockaddr *)unp2->unp_addr;
962 else
963 sa = &sun_noname;
964 bcopy(sa, ret, sa->sa_len);
965 unp_pcb_unlock_pair(unp, unp2);
966 } else {
967 UNP_PCB_UNLOCK(unp);
968 sa = &sun_noname;
969 bcopy(sa, ret, sa->sa_len);
970 }
971 return (0);
972 }
973
974 /*
975 * A pr_sosend() call that passes an mbuf instead of a uio comes from a
976 * kernel thread: NFS, netgraph(4) and other subsystems can call into socket
977 * code. This function conditions the mbuf so that it can safely be put onto
978 * a socket buffer, and calculates its char count and mbuf count.
979 *
980 * Note: we don't support receiving control data from a kernel thread. Our
981 * pr_sosend methods have MPASS() to check that. This may change.
982 */
983 static void
984 uipc_reset_kernel_mbuf(struct mbuf *m, struct mchain *mc)
985 {
986
987 M_ASSERTPKTHDR(m);
988
989 m_clrprotoflags(m);
990 m_tag_delete_chain(m, NULL);
991 m->m_pkthdr.rcvif = NULL;
992 m->m_pkthdr.flowid = 0;
993 m->m_pkthdr.csum_flags = 0;
994 m->m_pkthdr.fibnum = 0;
995 m->m_pkthdr.rsstype = 0;
996
997 mc_init_m(mc, m);
998 MPASS(m->m_pkthdr.len == mc->mc_len);
999 }
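/*
 * A minimal sketch (not compiled) of the kernel-thread case described above:
 * a subsystem hands a pre-built mbuf chain with a pkthdr to sosend(), with no
 * uio and no control.  The socket 'so' is assumed to be a connected PF_LOCAL
 * stream socket owned by the caller; the payload is illustrative only.
 */
#if 0
	static const char msg[] = "hello";
	struct mbuf *m;
	int error;

	m = m_gethdr(M_WAITOK, MT_DATA);
	memcpy(mtod(m, char *), msg, sizeof(msg) - 1);
	m->m_len = m->m_pkthdr.len = sizeof(msg) - 1;
	error = sosend(so, NULL, NULL, m, NULL, 0, curthread);
#endif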
1000
1001 #ifdef SOCKBUF_DEBUG
1002 static inline void
1003 uipc_stream_sbcheck(struct sockbuf *sb)
1004 {
1005 struct mbuf *d;
1006 u_int dacc, dccc, dctl, dmbcnt;
1007 bool notready = false;
1008
1009 dacc = dccc = dctl = dmbcnt = 0;
1010 STAILQ_FOREACH(d, &sb->uxst_mbq, m_stailq) {
1011 if (d == sb->uxst_fnrdy) {
1012 MPASS(d->m_flags & M_NOTREADY);
1013 notready = true;
1014 }
1015 if (d->m_type == MT_CONTROL)
1016 dctl += d->m_len;
1017 else if (d->m_type == MT_DATA) {
1018 dccc += d->m_len;
1019 if (!notready)
1020 dacc += d->m_len;
1021 } else
1022 MPASS(0);
1023 dmbcnt += MSIZE;
1024 if (d->m_flags & M_EXT)
1025 dmbcnt += d->m_ext.ext_size;
1026 if (d->m_stailq.stqe_next == NULL)
1027 MPASS(sb->uxst_mbq.stqh_last == &d->m_stailq.stqe_next);
1028 }
1029 MPASS(sb->uxst_fnrdy == NULL || notready);
1030 MPASS(dacc == sb->sb_acc);
1031 MPASS(dccc == sb->sb_ccc);
1032 MPASS(dctl == sb->sb_ctl);
1033 MPASS(dmbcnt == sb->sb_mbcnt);
1034 (void)STAILQ_EMPTY(&sb->uxst_mbq);
1035 }
1036 #define UIPC_STREAM_SBCHECK(sb) uipc_stream_sbcheck(sb)
1037 #else
1038 #define UIPC_STREAM_SBCHECK(sb) do {} while (0)
1039 #endif
1040
1041 /*
1042 * uipc_stream_sbspace() returns how much a writer can send, limited by char
1043 * count or mbuf memory use, whichever limit is hit first.
1044 *
1045 * An obvious and legitimate reason for a socket having more data than allowed
1046 * is lowering the limit with setsockopt(SO_RCVBUF) on an already full buffer.
1047 * Also, sb_mbcnt may overcommit sb_mbmax if a previous write observed
1048 * 'space < mbspace', but the mchain allocated to hold 'space' bytes of data
1049 * ended up with 'mc_mlen > mbspace'. A typical scenario is a full buffer with
1050 * a writer trying to push in a large write and a slow reader that reads just
1051 * a few bytes at a time. In that case the writer will keep creating new mbufs
1052 * with mc_split(). These mbufs carry few chars, but all point at the same
1053 * cluster, each adding the cluster size to sb_mbcnt. This means we may count
1054 * the same cluster many times, potentially underutilizing the socket buffer.
1055 * We aren't optimizing towards ineffective readers. The classic socket buffer
1056 * had the same "feature".
1057 */
1058 static inline u_int
1059 uipc_stream_sbspace(struct sockbuf *sb)
1060 {
1061 u_int space, mbspace;
1062
1063 if (__predict_true(sb->sb_hiwat >= sb->sb_ccc + sb->sb_ctl))
1064 space = sb->sb_hiwat - sb->sb_ccc - sb->sb_ctl;
1065 else
1066 return (0);
1067 if (__predict_true(sb->sb_mbmax >= sb->sb_mbcnt))
1068 mbspace = sb->sb_mbmax - sb->sb_mbcnt;
1069 else
1070 return (0);
1071
1072 return (min(space, mbspace));
1073 }
1074
1075 static int
1076 uipc_sosend_stream_or_seqpacket(struct socket *so, struct sockaddr *addr,
1077 struct uio *uio0, struct mbuf *m, struct mbuf *c, int flags,
1078 struct thread *td)
1079 {
1080 struct unpcb *unp2;
1081 struct socket *so2;
1082 struct sockbuf *sb;
1083 struct uio *uio;
1084 struct mchain mc, cmc;
1085 size_t resid, sent;
1086 bool nonblock, eor, aio;
1087 int error;
1088
1089 MPASS((uio0 != NULL && m == NULL) || (m != NULL && uio0 == NULL));
1090 MPASS(m == NULL || c == NULL);
1091
1092 if (__predict_false(flags & MSG_OOB))
1093 return (EOPNOTSUPP);
1094
1095 nonblock = (so->so_state & SS_NBIO) ||
1096 (flags & (MSG_DONTWAIT | MSG_NBIO));
1097 eor = flags & MSG_EOR;
1098
1099 mc = MCHAIN_INITIALIZER(&mc);
1100 cmc = MCHAIN_INITIALIZER(&cmc);
1101 sent = 0;
1102 aio = false;
1103
1104 if (m == NULL) {
1105 if (c != NULL && (error = unp_internalize(c, &cmc, td)))
1106 goto out;
1107 /*
1108 * This function may read more data from the uio than it would
1109 * then place on the socket, which would leave the uio inconsistent
1110 * upon return. Normally the uio is allocated on the stack of the
1111 * syscall thread and we don't care about leaving it consistent.
1112 * However, aio(9) will allocate a uio as part of the job and will
1113 * use it to track progress. We detect aio(9) by checking the
1114 * SB_AIO_RUNNING flag. It is safe to check it without a lock
1115 * because it is set and cleared in the same taskqueue thread.
1116 *
1117 * This check can also produce a false positive: there is an
1118 * aio(9) job and also a syscall that we are serving now.
1119 * No sane software does that; it would lead to a mess in
1120 * the socket buffer, as aio(9) doesn't grab the I/O sx(9).
1121 * But syzkaller can create this mess. For such a false positive
1122 * our goal is simply not to panic or leak memory.
1123 */
1124 if (__predict_false(so->so_snd.sb_flags & SB_AIO_RUNNING)) {
1125 uio = cloneuio(uio0);
1126 aio = true;
1127 } else {
1128 uio = uio0;
1129 resid = uio->uio_resid;
1130 }
1131 /*
1132 * Optimization for the case when our send fits into the receive
1133 * buffer: do the copyin before taking any locks, sized to our
1134 * send buffer. Later copyins will also take into account
1135 * space in the peer's receive buffer.
1136 */
1137 error = mc_uiotomc(&mc, uio, so->so_snd.sb_hiwat, 0, M_WAITOK,
1138 eor ? M_EOR : 0);
1139 if (__predict_false(error))
1140 goto out2;
1141 } else
1142 uipc_reset_kernel_mbuf(m, &mc);
1143
1144 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
1145 if (error)
1146 goto out2;
1147
1148 if (__predict_false((error = uipc_lock_peer(so, &unp2)) != 0))
1149 goto out3;
1150
1151 if (unp2->unp_flags & UNP_WANTCRED_MASK) {
1152 /*
1153 * Credentials are passed only once on SOCK_STREAM and
1154 * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
1155 * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
1156 */
1157 unp_addsockcred(td, &cmc, unp2->unp_flags);
1158 unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
1159 }
1160
1161 /*
1162 * Cycle through the data to send and available space in the peer's
1163 * receive buffer. Put a reference on the peer socket, so that it
1164 * doesn't get freed while we sbwait(). If peer goes away, we will
1165 * observe the SBS_CANTRCVMORE and our sorele() will finalize peer's
1166 * socket destruction.
1167 */
1168 so2 = unp2->unp_socket;
1169 soref(so2);
1170 UNP_PCB_UNLOCK(unp2);
1171 sb = &so2->so_rcv;
1172 while (mc.mc_len + cmc.mc_len > 0) {
1173 struct mchain mcnext = MCHAIN_INITIALIZER(&mcnext);
1174 u_int space;
1175
1176 SOCK_RECVBUF_LOCK(so2);
1177 restart:
1178 UIPC_STREAM_SBCHECK(sb);
1179 if (__predict_false(cmc.mc_len > sb->sb_hiwat)) {
1180 SOCK_RECVBUF_UNLOCK(so2);
1181 error = EMSGSIZE;
1182 goto out4;
1183 }
1184 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
1185 SOCK_RECVBUF_UNLOCK(so2);
1186 error = EPIPE;
1187 goto out4;
1188 }
1189 /*
1190 * Wait on the peer socket receive buffer until we have enough
1191 * space to put at least control. The data is a stream and can
1192 * be put partially, but control is really a datagram.
1193 */
1194 space = uipc_stream_sbspace(sb);
1195 if (space < sb->sb_lowat || space < cmc.mc_len) {
1196 if (nonblock) {
1197 if (aio)
1198 sb->uxst_flags |= UXST_PEER_AIO;
1199 SOCK_RECVBUF_UNLOCK(so2);
1200 if (aio) {
1201 SOCK_SENDBUF_LOCK(so);
1202 so->so_snd.sb_ccc =
1203 so->so_snd.sb_hiwat - space;
1204 SOCK_SENDBUF_UNLOCK(so);
1205 }
1206 error = EWOULDBLOCK;
1207 goto out4;
1208 }
1209 if ((error = sbwait(so2, SO_RCV)) != 0) {
1210 SOCK_RECVBUF_UNLOCK(so2);
1211 goto out4;
1212 } else
1213 goto restart;
1214 }
1215 MPASS(space >= cmc.mc_len);
1216 space -= cmc.mc_len;
1217 if (space == 0) {
1218 /* There is space only to send control. */
1219 MPASS(!STAILQ_EMPTY(&cmc.mc_q));
1220 mcnext = mc;
1221 mc = MCHAIN_INITIALIZER(&mc);
1222 } else if (space < mc.mc_len) {
1223 /* Not enough space. */
1224 if (__predict_false(mc_split(&mc, &mcnext, space,
1225 M_NOWAIT) == ENOMEM)) {
1226 /*
1227 * If allocation failed use M_WAITOK and merge
1228 * the chain back. Next time mc_split() will
1229 * easily split at the same place. Only if we
1230 * race with setsockopt(SO_RCVBUF) shrinking
1231 * sb_hiwat can this happen more than once.
1232 */
1233 SOCK_RECVBUF_UNLOCK(so2);
1234 (void)mc_split(&mc, &mcnext, space, M_WAITOK);
1235 mc_concat(&mc, &mcnext);
1236 SOCK_RECVBUF_LOCK(so2);
1237 goto restart;
1238 }
1239 MPASS(mc.mc_len == space);
1240 }
1241 if (!STAILQ_EMPTY(&cmc.mc_q)) {
1242 STAILQ_CONCAT(&sb->uxst_mbq, &cmc.mc_q);
1243 sb->sb_ctl += cmc.mc_len;
1244 sb->sb_mbcnt += cmc.mc_mlen;
1245 cmc.mc_len = 0;
1246 }
1247 sent += mc.mc_len;
1248 if (sb->uxst_fnrdy == NULL)
1249 sb->sb_acc += mc.mc_len;
1250 sb->sb_ccc += mc.mc_len;
1251 sb->sb_mbcnt += mc.mc_mlen;
1252 STAILQ_CONCAT(&sb->uxst_mbq, &mc.mc_q);
1253 UIPC_STREAM_SBCHECK(sb);
1254 space = uipc_stream_sbspace(sb);
1255 sorwakeup_locked(so2);
1256 if (!STAILQ_EMPTY(&mcnext.mc_q)) {
1257 /*
1258 * Such an assignment is unsafe in general, but it is
1259 * safe with !STAILQ_EMPTY(&mcnext.mc_q). In C++ we
1260 * could overload = for STAILQs :)
1261 */
1262 mc = mcnext;
1263 } else if (uio != NULL && uio->uio_resid > 0) {
1264 /*
1265 * Copyin the sum of the peer's receive buffer space and
1266 * our sb_hiwat, which is our virtual send buffer size.
1267 * See the comment above the unpst_sendspace declaration.
1268 * We are reading sb_hiwat locklessly because a) we don't
1269 * care about an application that does send(2) and
1270 * setsockopt(2) racing internally, and b) for an
1271 * application that does this in sequence we will see the
1272 * correct value, since sbsetopt() uses the buffer lock
1273 * and we have also already acquired it at least once.
1274 */
1275 error = mc_uiotomc(&mc, uio, space +
1276 atomic_load_int(&so->so_snd.sb_hiwat), 0, M_WAITOK,
1277 eor ? M_EOR : 0);
1278 if (__predict_false(error))
1279 goto out4;
1280 } else
1281 mc = MCHAIN_INITIALIZER(&mc);
1282 }
1283
1284 MPASS(STAILQ_EMPTY(&mc.mc_q));
1285
1286 td->td_ru.ru_msgsnd++;
1287 out4:
1288 sorele(so2);
1289 out3:
1290 SOCK_IO_SEND_UNLOCK(so);
1291 out2:
1292 if (aio) {
1293 freeuio(uio);
1294 uioadvance(uio0, sent);
1295 } else if (uio != NULL)
1296 uio->uio_resid = resid - sent;
1297 if (!mc_empty(&cmc))
1298 unp_scan(mc_first(&cmc), unp_freerights);
1299 out:
1300 mc_freem(&mc);
1301 mc_freem(&cmc);
1302
1303 return (error);
1304 }
1305
1306 /*
1307 * Wakeup a writer, used by recv(2) and shutdown(2).
1308 *
1309 * @param so Points to a connected stream socket with receive buffer locked
1310 *
1311 * In blocking mode the peer is sleeping on our receive buffer, and we just
1312 * need to wakeup(9) on it. But to wake up various event engines, we need to
1313 * reach over to the peer's selinfo. This can be done safely as the socket
1314 * buffer receive lock protects us from the peer going away.
1315 */
1316 static void
1317 uipc_wakeup_writer(struct socket *so)
1318 {
1319 struct sockbuf *sb = &so->so_rcv;
1320 struct selinfo *sel;
1321
1322 SOCK_RECVBUF_LOCK_ASSERT(so);
1323 MPASS(sb->uxst_peer != NULL);
1324
1325 sel = &sb->uxst_peer->so_wrsel;
1326
1327 if (sb->uxst_flags & UXST_PEER_SEL) {
1328 selwakeuppri(sel, PSOCK);
1329 /*
1330 * XXXGL: sowakeup() does SEL_WAITING() without locks.
1331 */
1332 if (!SEL_WAITING(sel))
1333 sb->uxst_flags &= ~UXST_PEER_SEL;
1334 }
1335 if (sb->sb_flags & SB_WAIT) {
1336 sb->sb_flags &= ~SB_WAIT;
1337 wakeup(&sb->sb_acc);
1338 }
1339 KNOTE_LOCKED(&sel->si_note, 0);
1340 SOCK_RECVBUF_UNLOCK(so);
1341 }
1342
1343 static void
1344 uipc_cantrcvmore(struct socket *so)
1345 {
1346
1347 SOCK_RECVBUF_LOCK(so);
1348 so->so_rcv.sb_state |= SBS_CANTRCVMORE;
1349 selwakeuppri(&so->so_rdsel, PSOCK);
1350 KNOTE_LOCKED(&so->so_rdsel.si_note, 0);
1351 if (so->so_rcv.uxst_peer != NULL)
1352 uipc_wakeup_writer(so);
1353 else
1354 SOCK_RECVBUF_UNLOCK(so);
1355 }
1356
1357 static int
1358 uipc_soreceive_stream_or_seqpacket(struct socket *so, struct sockaddr **psa,
1359 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1360 {
1361 struct sockbuf *sb = &so->so_rcv;
1362 struct mbuf *control, *m, *first, *last, *next;
1363 u_int ctl, space, datalen, mbcnt, lastlen;
1364 int error, flags;
1365 bool nonblock, waitall, peek;
1366
1367 MPASS(mp0 == NULL);
1368
1369 if (psa != NULL)
1370 *psa = NULL;
1371 if (controlp != NULL)
1372 *controlp = NULL;
1373
1374 flags = flagsp != NULL ? *flagsp : 0;
1375 nonblock = (so->so_state & SS_NBIO) ||
1376 (flags & (MSG_DONTWAIT | MSG_NBIO));
1377 peek = flags & MSG_PEEK;
1378 waitall = (flags & MSG_WAITALL) && !peek;
1379
1380 /*
1381 * This check may fail only on a socket that never went through
1382 * connect(2). We can check this locklessly because: a) for a newborn
1383 * socket we don't care about applications that may race internally
1384 * between connect(2) and recv(2), and b) for a dying socket, if we
1385 * miss an update by unp_soisdisconnected(), we would still get the
1386 * check right. For a dying socket we would observe SBS_CANTRCVMORE later.
1387 */
1388 if (__predict_false((atomic_load_short(&so->so_state) &
1389 (SS_ISCONNECTED|SS_ISDISCONNECTED)) == 0))
1390 return (ENOTCONN);
1391
1392 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
1393 if (__predict_false(error))
1394 return (error);
1395
1396 restart:
1397 SOCK_RECVBUF_LOCK(so);
1398 UIPC_STREAM_SBCHECK(sb);
1399 while (sb->sb_acc < sb->sb_lowat &&
1400 (sb->sb_ctl == 0 || controlp == NULL)) {
1401 if (so->so_error) {
1402 error = so->so_error;
1403 if (!peek)
1404 so->so_error = 0;
1405 SOCK_RECVBUF_UNLOCK(so);
1406 SOCK_IO_RECV_UNLOCK(so);
1407 return (error);
1408 }
1409 if (sb->sb_state & SBS_CANTRCVMORE) {
1410 SOCK_RECVBUF_UNLOCK(so);
1411 SOCK_IO_RECV_UNLOCK(so);
1412 return (0);
1413 }
1414 if (nonblock) {
1415 SOCK_RECVBUF_UNLOCK(so);
1416 SOCK_IO_RECV_UNLOCK(so);
1417 return (EWOULDBLOCK);
1418 }
1419 error = sbwait(so, SO_RCV);
1420 if (error) {
1421 SOCK_RECVBUF_UNLOCK(so);
1422 SOCK_IO_RECV_UNLOCK(so);
1423 return (error);
1424 }
1425 }
1426
1427 MPASS(STAILQ_FIRST(&sb->uxst_mbq));
1428 MPASS(sb->sb_acc > 0 || sb->sb_ctl > 0);
1429
1430 mbcnt = 0;
1431 ctl = 0;
1432 first = STAILQ_FIRST(&sb->uxst_mbq);
1433 if (first->m_type == MT_CONTROL) {
1434 control = first;
1435 STAILQ_FOREACH_FROM(first, &sb->uxst_mbq, m_stailq) {
1436 if (first->m_type != MT_CONTROL)
1437 break;
1438 ctl += first->m_len;
1439 mbcnt += MSIZE;
1440 if (first->m_flags & M_EXT)
1441 mbcnt += first->m_ext.ext_size;
1442 }
1443 } else
1444 control = NULL;
1445
1446 /*
1447 * Find split point for the next copyout. On exit from the loop:
1448 * last == NULL - socket to be flushed
1449 * last != NULL
1450 * lastlen > last->m_len - uio to be filled, last to be adjusted
1451 * lastlen == 0 - MT_CONTROL, M_EOR or M_NOTREADY encountered
1452 */
1453 space = uio->uio_resid;
1454 datalen = 0;
1455 for (m = first, last = sb->uxst_fnrdy, lastlen = 0;
1456 m != sb->uxst_fnrdy;
1457 m = STAILQ_NEXT(m, m_stailq)) {
1458 if (m->m_type != MT_DATA) {
1459 last = m;
1460 lastlen = 0;
1461 break;
1462 }
1463 if (space >= m->m_len) {
1464 space -= m->m_len;
1465 datalen += m->m_len;
1466 mbcnt += MSIZE;
1467 if (m->m_flags & M_EXT)
1468 mbcnt += m->m_ext.ext_size;
1469 if (m->m_flags & M_EOR) {
1470 last = STAILQ_NEXT(m, m_stailq);
1471 lastlen = 0;
1472 flags |= MSG_EOR;
1473 break;
1474 }
1475 } else {
1476 datalen += space;
1477 last = m;
1478 lastlen = space;
1479 break;
1480 }
1481 }
1482
1483 UIPC_STREAM_SBCHECK(sb);
1484 if (!peek) {
1485 if (last == NULL)
1486 STAILQ_INIT(&sb->uxst_mbq);
1487 else {
1488 STAILQ_FIRST(&sb->uxst_mbq) = last;
1489 MPASS(last->m_len > lastlen);
1490 last->m_len -= lastlen;
1491 last->m_data += lastlen;
1492 }
1493 MPASS(sb->sb_acc >= datalen);
1494 sb->sb_acc -= datalen;
1495 sb->sb_ccc -= datalen;
1496 MPASS(sb->sb_ctl >= ctl);
1497 sb->sb_ctl -= ctl;
1498 MPASS(sb->sb_mbcnt >= mbcnt);
1499 sb->sb_mbcnt -= mbcnt;
1500 UIPC_STREAM_SBCHECK(sb);
1501 if (__predict_true(sb->uxst_peer != NULL)) {
1502 struct unpcb *unp2;
1503 bool aio;
1504
1505 if ((aio = sb->uxst_flags & UXST_PEER_AIO))
1506 sb->uxst_flags &= ~UXST_PEER_AIO;
1507
1508 uipc_wakeup_writer(so);
1509 /*
1510 * XXXGL: we need to go through uipc_lock_peer() after
1511 * the receive buffer lock is dropped, since it was
1512 * protecting us from unp_soisdisconnected(). The aio
1513 * workarounds should be refactored to the aio(4) side.
1514 */
1515 if (aio && uipc_lock_peer(so, &unp2) == 0) {
1516 struct socket *so2 = unp2->unp_socket;
1517
1518 SOCK_SENDBUF_LOCK(so2);
1519 so2->so_snd.sb_ccc -= datalen;
1520 sowakeup_aio(so2, SO_SND);
1521 SOCK_SENDBUF_UNLOCK(so2);
1522 UNP_PCB_UNLOCK(unp2);
1523 }
1524 } else
1525 SOCK_RECVBUF_UNLOCK(so);
1526 } else
1527 SOCK_RECVBUF_UNLOCK(so);
1528
1529 while (control != NULL && control->m_type == MT_CONTROL) {
1530 if (!peek) {
1531 /*
1532 * An unp_externalize() failure must abort the entire
1533 * read(2). Such a failure should also free the problematic
1534 * control, but link the remaining data back to the head
1535 * of the buffer, so that the socket is not left in a state
1536 * where it can't make forward progress with reading.
1537 * The probability of such a failure is really low, so it
1538 * is fine that we need to perform a pretty complex
1539 * operation here to reconstruct the buffer.
1540 */
1541 error = unp_externalize(control, controlp, flags);
1542 control = m_free(control);
1543 if (__predict_false(error && control != NULL)) {
1544 struct mchain cmc;
1545
1546 mc_init_m(&cmc, control);
1547
1548 SOCK_RECVBUF_LOCK(so);
1549 MPASS(!(sb->sb_state & SBS_CANTRCVMORE));
1550
1551 if (__predict_false(cmc.mc_len + sb->sb_ccc +
1552 sb->sb_ctl > sb->sb_hiwat)) {
1553 /*
1554 * Too bad, while unp_externalize() was
1555 * failing, the other side had filled
1556 * the buffer and we can't prepend data
1557 * back. Losing data!
1558 */
1559 SOCK_RECVBUF_UNLOCK(so);
1560 SOCK_IO_RECV_UNLOCK(so);
1561 unp_scan(mc_first(&cmc),
1562 unp_freerights);
1563 mc_freem(&cmc);
1564 return (error);
1565 }
1566
1567 UIPC_STREAM_SBCHECK(sb);
1568 /* XXXGL: STAILQ_PREPEND */
1569 STAILQ_CONCAT(&cmc.mc_q, &sb->uxst_mbq);
1570 STAILQ_SWAP(&cmc.mc_q, &sb->uxst_mbq, mbuf);
1571
1572 sb->sb_ctl = sb->sb_acc = sb->sb_ccc =
1573 sb->sb_mbcnt = 0;
1574 STAILQ_FOREACH(m, &sb->uxst_mbq, m_stailq) {
1575 if (m->m_type == MT_DATA) {
1576 sb->sb_acc += m->m_len;
1577 sb->sb_ccc += m->m_len;
1578 } else {
1579 sb->sb_ctl += m->m_len;
1580 }
1581 sb->sb_mbcnt += MSIZE;
1582 if (m->m_flags & M_EXT)
1583 sb->sb_mbcnt +=
1584 m->m_ext.ext_size;
1585 }
1586 UIPC_STREAM_SBCHECK(sb);
1587 SOCK_RECVBUF_UNLOCK(so);
1588 SOCK_IO_RECV_UNLOCK(so);
1589 return (error);
1590 }
1591 if (controlp != NULL) {
1592 while (*controlp != NULL)
1593 controlp = &(*controlp)->m_next;
1594 }
1595 } else {
1596 /*
1597 * XXXGL
1598 *
1599 * In the MSG_PEEK case control is not externalized. This
1600 * means we are leaking some kernel pointers to
1601 * userland. They are useless to a law-abiding
1602 * application, but may be useful to malware. This
1603 * is what the historical implementation in
1604 * soreceive_generic() did. To be improved?
1605 */
1606 if (controlp != NULL) {
1607 *controlp = m_copym(control, 0, control->m_len,
1608 M_WAITOK);
1609 controlp = &(*controlp)->m_next;
1610 }
1611 control = STAILQ_NEXT(control, m_stailq);
1612 }
1613 }
1614
1615 for (m = first; m != last; m = next) {
1616 next = STAILQ_NEXT(m, m_stailq);
1617 error = uiomove(mtod(m, char *), m->m_len, uio);
1618 if (__predict_false(error)) {
1619 SOCK_IO_RECV_UNLOCK(so);
1620 if (!peek)
1621 for (; m != last; m = next) {
1622 next = STAILQ_NEXT(m, m_stailq);
1623 m_free(m);
1624 }
1625 return (error);
1626 }
1627 if (!peek)
1628 m_free(m);
1629 }
1630 if (last != NULL && lastlen > 0) {
1631 if (!peek) {
1632 MPASS(!(m->m_flags & M_PKTHDR));
1633 MPASS(last->m_data - M_START(last) >= lastlen);
1634 error = uiomove(mtod(last, char *) - lastlen,
1635 lastlen, uio);
1636 } else
1637 error = uiomove(mtod(last, char *), lastlen, uio);
1638 if (__predict_false(error)) {
1639 SOCK_IO_RECV_UNLOCK(so);
1640 return (error);
1641 }
1642 }
1643 if (waitall && !(flags & MSG_EOR) && uio->uio_resid > 0)
1644 goto restart;
1645 SOCK_IO_RECV_UNLOCK(so);
1646
1647 if (flagsp != NULL)
1648 *flagsp |= flags;
1649
1650 uio->uio_td->td_ru.ru_msgrcv++;
1651
1652 return (0);
1653 }
1654
1655 static int
1656 uipc_sopoll_stream_or_seqpacket(struct socket *so, int events,
1657 struct thread *td)
1658 {
1659 struct unpcb *unp = sotounpcb(so);
1660 int revents;
1661
1662 UNP_PCB_LOCK(unp);
1663 if (SOLISTENING(so)) {
1664 /* The above check is safe, since conversion to listening uses
1665 * both protocol and socket lock.
1666 */
1667 SOCK_LOCK(so);
1668 if (!(events & (POLLIN | POLLRDNORM)))
1669 revents = 0;
1670 else if (!TAILQ_EMPTY(&so->sol_comp))
1671 revents = events & (POLLIN | POLLRDNORM);
1672 else if (so->so_error)
1673 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
1674 else {
1675 selrecord(td, &so->so_rdsel);
1676 revents = 0;
1677 }
1678 SOCK_UNLOCK(so);
1679 } else {
1680 if (so->so_state & SS_ISDISCONNECTED)
1681 revents = POLLHUP;
1682 else
1683 revents = 0;
1684 if (events & (POLLIN | POLLRDNORM | POLLRDHUP)) {
1685 SOCK_RECVBUF_LOCK(so);
1686 if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat ||
1687 so->so_error || so->so_rerror)
1688 revents |= events & (POLLIN | POLLRDNORM);
1689 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
1690 revents |= events &
1691 (POLLIN | POLLRDNORM | POLLRDHUP);
1692 if (!(revents & (POLLIN | POLLRDNORM | POLLRDHUP))) {
1693 selrecord(td, &so->so_rdsel);
1694 so->so_rcv.sb_flags |= SB_SEL;
1695 }
1696 SOCK_RECVBUF_UNLOCK(so);
1697 }
1698 if (events & (POLLOUT | POLLWRNORM)) {
1699 struct socket *so2 = so->so_rcv.uxst_peer;
1700
1701 if (so2 != NULL) {
1702 struct sockbuf *sb = &so2->so_rcv;
1703
1704 SOCK_RECVBUF_LOCK(so2);
1705 if (uipc_stream_sbspace(sb) >= sb->sb_lowat)
1706 revents |= events &
1707 (POLLOUT | POLLWRNORM);
1708 if (sb->sb_state & SBS_CANTRCVMORE)
1709 revents |= POLLHUP;
1710 if (!(revents & (POLLOUT | POLLWRNORM))) {
1711 so2->so_rcv.uxst_flags |= UXST_PEER_SEL;
1712 selrecord(td, &so->so_wrsel);
1713 }
1714 SOCK_RECVBUF_UNLOCK(so2);
1715 } else
1716 selrecord(td, &so->so_wrsel);
1717 }
1718 }
1719 UNP_PCB_UNLOCK(unp);
1720 return (revents);
1721 }
1722
1723 static void
1724 uipc_wrknl_lock(void *arg)
1725 {
1726 struct socket *so = arg;
1727 struct unpcb *unp = sotounpcb(so);
1728
1729 retry:
1730 if (SOLISTENING(so)) {
1731 SOLISTEN_LOCK(so);
1732 } else {
1733 UNP_PCB_LOCK(unp);
1734 if (__predict_false(SOLISTENING(so))) {
1735 UNP_PCB_UNLOCK(unp);
1736 goto retry;
1737 }
1738 if (so->so_rcv.uxst_peer != NULL)
1739 SOCK_RECVBUF_LOCK(so->so_rcv.uxst_peer);
1740 }
1741 }
1742
1743 static void
1744 uipc_wrknl_unlock(void *arg)
1745 {
1746 struct socket *so = arg;
1747 struct unpcb *unp = sotounpcb(so);
1748
1749 if (SOLISTENING(so))
1750 SOLISTEN_UNLOCK(so);
1751 else {
1752 if (so->so_rcv.uxst_peer != NULL)
1753 SOCK_RECVBUF_UNLOCK(so->so_rcv.uxst_peer);
1754 UNP_PCB_UNLOCK(unp);
1755 }
1756 }
1757
1758 static void
1759 uipc_wrknl_assert_lock(void *arg, int what)
1760 {
1761 struct socket *so = arg;
1762
1763 if (SOLISTENING(so)) {
1764 if (what == LA_LOCKED)
1765 SOLISTEN_LOCK_ASSERT(so);
1766 else
1767 SOLISTEN_UNLOCK_ASSERT(so);
1768 } else {
1769 /*
1770 * The pr_soreceive method will put a note without owning the
1771 * unp lock, so we can't assert it here. But we can safely
1772 * dereference uxst_peer pointer, since receive buffer lock
1773 * is assumed to be held here.
1774 */
1775 if (what == LA_LOCKED && so->so_rcv.uxst_peer != NULL)
1776 SOCK_RECVBUF_LOCK_ASSERT(so->so_rcv.uxst_peer);
1777 }
1778 }
1779
1780 static void
1781 uipc_filt_sowdetach(struct knote *kn)
1782 {
1783 struct socket *so = kn->kn_fp->f_data;
1784
1785 uipc_wrknl_lock(so);
1786 knlist_remove(&so->so_wrsel.si_note, kn, 1);
1787 uipc_wrknl_unlock(so);
1788 }
1789
1790 static int
uipc_filt_sowrite(struct knote *kn, long hint)
1792 {
1793 struct socket *so = kn->kn_fp->f_data, *so2;
1794 struct unpcb *unp = sotounpcb(so), *unp2 = unp->unp_conn;
1795
1796 if (SOLISTENING(so))
1797 return (0);
1798
1799 if (unp2 == NULL) {
1800 if (so->so_state & SS_ISDISCONNECTED) {
1801 kn->kn_flags |= EV_EOF;
1802 kn->kn_fflags = so->so_error;
1803 return (1);
1804 } else
1805 return (0);
1806 }
1807
1808 so2 = unp2->unp_socket;
1809 SOCK_RECVBUF_LOCK_ASSERT(so2);
1810 kn->kn_data = uipc_stream_sbspace(&so2->so_rcv);
1811
1812 if (so2->so_rcv.sb_state & SBS_CANTRCVMORE) {
1813 /*
1814 * XXXGL: maybe kn->kn_flags |= EV_EOF ?
1815 */
1816 return (1);
1817 } else if (kn->kn_sfflags & NOTE_LOWAT)
1818 return (kn->kn_data >= kn->kn_sdata);
1819 else
1820 return (kn->kn_data >= so2->so_rcv.sb_lowat);
1821 }
1822
1823 static int
uipc_filt_soempty(struct knote *kn, long hint)
1825 {
1826 struct socket *so = kn->kn_fp->f_data, *so2;
1827 struct unpcb *unp = sotounpcb(so), *unp2 = unp->unp_conn;
1828
1829 if (SOLISTENING(so) || unp2 == NULL)
1830 return (1);
1831
1832 so2 = unp2->unp_socket;
1833 SOCK_RECVBUF_LOCK_ASSERT(so2);
1834 kn->kn_data = uipc_stream_sbspace(&so2->so_rcv);
1835
1836 return (kn->kn_data == 0 ? 1 : 0);
1837 }
1838
1839 static const struct filterops uipc_write_filtops = {
1840 .f_isfd = 1,
1841 .f_detach = uipc_filt_sowdetach,
1842 .f_event = uipc_filt_sowrite,
1843 };
1844 static const struct filterops uipc_empty_filtops = {
1845 .f_isfd = 1,
1846 .f_detach = uipc_filt_sowdetach,
1847 .f_event = uipc_filt_soempty,
1848 };
1849
1850 static int
uipc_kqfilter_stream_or_seqpacket(struct socket *so, struct knote *kn)
1852 {
1853 struct unpcb *unp = sotounpcb(so);
1854 struct knlist *knl;
1855
1856 switch (kn->kn_filter) {
1857 case EVFILT_READ:
1858 return (sokqfilter_generic(so, kn));
1859 case EVFILT_WRITE:
1860 kn->kn_fop = &uipc_write_filtops;
1861 break;
1862 case EVFILT_EMPTY:
1863 kn->kn_fop = &uipc_empty_filtops;
1864 break;
1865 default:
1866 return (EINVAL);
1867 }
1868
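	/*
	 * The write-side filters report the state of the peer's receive
	 * buffer, so link the knote under the same locks that
	 * uipc_wrknl_lock() takes: the listen lock for a listening socket,
	 * otherwise the PCB lock plus the peer's receive buffer lock.
	 */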
1869 knl = &so->so_wrsel.si_note;
1870 UNP_PCB_LOCK(unp);
1871 if (SOLISTENING(so)) {
1872 SOLISTEN_LOCK(so);
1873 knlist_add(knl, kn, 1);
1874 SOLISTEN_UNLOCK(so);
1875 } else {
1876 struct socket *so2 = so->so_rcv.uxst_peer;
1877
1878 if (so2 != NULL)
1879 SOCK_RECVBUF_LOCK(so2);
1880 knlist_add(knl, kn, 1);
1881 if (so2 != NULL)
1882 SOCK_RECVBUF_UNLOCK(so2);
1883 }
1884 UNP_PCB_UNLOCK(unp);
1885 return (0);
1886 }
1887
1888 /* PF_UNIX/SOCK_DGRAM version of sbspace() */
1889 static inline bool
uipc_dgram_sbspace(struct sockbuf *sb, u_int cc, u_int mbcnt)
1891 {
1892 u_int bleft, mleft;
1893
	/*
	 * Negative space may happen if send(2) is followed by a
	 * setsockopt(SO_SNDBUF/SO_RCVBUF) that shrinks the maximum.
	 */
1898 if (__predict_false(sb->sb_hiwat < sb->uxdg_cc ||
1899 sb->sb_mbmax < sb->uxdg_mbcnt))
1900 return (false);
1901
1902 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE))
1903 return (false);
1904
1905 bleft = sb->sb_hiwat - sb->uxdg_cc;
1906 mleft = sb->sb_mbmax - sb->uxdg_mbcnt;
1907
1908 return (bleft >= cc && mleft >= mbcnt);
1909 }
1910
1911 /*
1912 * PF_UNIX/SOCK_DGRAM send
1913 *
1914 * Allocate a record consisting of 3 mbufs in the sequence of
1915 * from -> control -> data and append it to the socket buffer.
1916 *
1917 * The first mbuf carries sender's name and is a pkthdr that stores
1918 * overall length of datagram, its memory consumption and control length.
1919 */
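/*
 * ctllen is stored in the pkthdr PH_loc scratch space; the assertion below
 * checks that the chosen scratch word does not overlap the memlen field.
 */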
1920 #define ctllen PH_loc.thirtytwo[1]
1921 _Static_assert(offsetof(struct pkthdr, memlen) + sizeof(u_int) <=
1922 offsetof(struct pkthdr, ctllen), "unix/dgram can not store ctllen");
1923 static int
uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1925 struct mbuf *m, struct mbuf *c, int flags, struct thread *td)
1926 {
1927 struct unpcb *unp, *unp2;
1928 const struct sockaddr *from;
1929 struct socket *so2;
1930 struct sockbuf *sb;
1931 struct mchain cmc = MCHAIN_INITIALIZER(&cmc);
1932 struct mbuf *f;
1933 u_int cc, ctl, mbcnt;
1934 u_int dcc __diagused, dctl __diagused, dmbcnt __diagused;
1935 int error;
1936
1937 MPASS((uio != NULL && m == NULL) || (m != NULL && uio == NULL));
1938
1939 error = 0;
1940 f = NULL;
1941
1942 if (__predict_false(flags & MSG_OOB)) {
1943 error = EOPNOTSUPP;
1944 goto out;
1945 }
1946 if (m == NULL) {
1947 if (__predict_false(uio->uio_resid > unpdg_maxdgram)) {
1948 error = EMSGSIZE;
1949 goto out;
1950 }
1951 m = m_uiotombuf(uio, M_WAITOK, 0, max_hdr, M_PKTHDR);
1952 if (__predict_false(m == NULL)) {
1953 error = EFAULT;
1954 goto out;
1955 }
1956 f = m_gethdr(M_WAITOK, MT_SONAME);
1957 cc = m->m_pkthdr.len;
1958 mbcnt = MSIZE + m->m_pkthdr.memlen;
1959 if (c != NULL && (error = unp_internalize(c, &cmc, td)))
1960 goto out;
1961 } else {
1962 struct mchain mc;
1963
1964 uipc_reset_kernel_mbuf(m, &mc);
1965 cc = mc.mc_len;
1966 mbcnt = mc.mc_mlen;
1967 if (__predict_false(m->m_pkthdr.len > unpdg_maxdgram)) {
1968 error = EMSGSIZE;
1969 goto out;
1970 }
1971 if ((f = m_gethdr(M_NOWAIT, MT_SONAME)) == NULL) {
1972 error = ENOBUFS;
1973 goto out;
1974 }
1975 }
1976
1977 unp = sotounpcb(so);
1978 MPASS(unp);
1979
1980 /*
1981 * XXXGL: would be cool to fully remove so_snd out of the equation
1982 * and avoid this lock, which is not only extraneous, but also being
1983 * released, thus still leaving possibility for a race. We can easily
1984 * handle SBS_CANTSENDMORE/SS_ISCONNECTED complement in unpcb, but it
1985 * is more difficult to invent something to handle so_error.
1986 */
1987 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
1988 if (error)
1989 goto out2;
1990 SOCK_SENDBUF_LOCK(so);
1991 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1992 SOCK_SENDBUF_UNLOCK(so);
1993 error = EPIPE;
1994 goto out3;
1995 }
1996 if (so->so_error != 0) {
1997 error = so->so_error;
1998 so->so_error = 0;
1999 SOCK_SENDBUF_UNLOCK(so);
2000 goto out3;
2001 }
2002 if (((so->so_state & SS_ISCONNECTED) == 0) && addr == NULL) {
2003 SOCK_SENDBUF_UNLOCK(so);
2004 error = EDESTADDRREQ;
2005 goto out3;
2006 }
2007 SOCK_SENDBUF_UNLOCK(so);
2008
2009 if (addr != NULL) {
2010 if ((error = unp_connectat(AT_FDCWD, so, addr, td, true)))
2011 goto out3;
2012 UNP_PCB_LOCK_ASSERT(unp);
2013 unp2 = unp->unp_conn;
2014 UNP_PCB_LOCK_ASSERT(unp2);
2015 } else {
2016 UNP_PCB_LOCK(unp);
2017 unp2 = unp_pcb_lock_peer(unp);
2018 if (unp2 == NULL) {
2019 UNP_PCB_UNLOCK(unp);
2020 error = ENOTCONN;
2021 goto out3;
2022 }
2023 }
2024
2025 if (unp2->unp_flags & UNP_WANTCRED_MASK)
2026 unp_addsockcred(td, &cmc, unp2->unp_flags);
2027 if (unp->unp_addr != NULL)
2028 from = (struct sockaddr *)unp->unp_addr;
2029 else
2030 from = &sun_noname;
2031 f->m_len = from->sa_len;
2032 MPASS(from->sa_len <= MLEN);
2033 bcopy(from, mtod(f, void *), from->sa_len);
2034
2035 /*
2036 * Concatenate mbufs: from -> control -> data.
2037 * Save overall cc and mbcnt in "from" mbuf.
2038 */
2039 if (!STAILQ_EMPTY(&cmc.mc_q)) {
2040 f->m_next = mc_first(&cmc);
2041 mc_last(&cmc)->m_next = m;
2042 /* XXXGL: This is dirty as well as rollback after ENOBUFS. */
2043 STAILQ_INIT(&cmc.mc_q);
2044 } else
2045 f->m_next = m;
2046 m = NULL;
2047 ctl = f->m_len + cmc.mc_len;
2048 mbcnt += cmc.mc_mlen;
2049 #ifdef INVARIANTS
2050 dcc = dctl = dmbcnt = 0;
2051 for (struct mbuf *mb = f; mb != NULL; mb = mb->m_next) {
2052 if (mb->m_type == MT_DATA)
2053 dcc += mb->m_len;
2054 else
2055 dctl += mb->m_len;
2056 dmbcnt += MSIZE;
2057 if (mb->m_flags & M_EXT)
2058 dmbcnt += mb->m_ext.ext_size;
2059 }
2060 MPASS(dcc == cc);
2061 MPASS(dctl == ctl);
2062 MPASS(dmbcnt == mbcnt);
2063 #endif
2064 f->m_pkthdr.len = cc + ctl;
2065 f->m_pkthdr.memlen = mbcnt;
2066 f->m_pkthdr.ctllen = ctl;
2067
	/*
	 * Destination socket buffer selection.
	 *
	 * Unconnected sends, when !(so->so_state & SS_ISCONNECTED) and a
	 * destination address is supplied, create a temporary connection for
	 * the run time of the function (see the call to unp_connectat() above
	 * and to unp_disconnect() below).  We distinguish them by the
	 * (addr != NULL) condition.  We intentionally avoid adding a
	 * 'bool connected' for that condition since, again, for the run time
	 * of this code we are always connected.  For such "unconnected" sends
	 * the destination buffer is the receive buffer of the destination
	 * socket so2.
	 *
	 * For connected sends, data lands on the send buffer of the sender's
	 * socket "so".  Then, if we just added the very first datagram to
	 * this send buffer, we need to add the send buffer to the receiving
	 * socket's buffer list.  We put ourselves at the head of the list.
	 * Such logic gives infrequent senders priority over frequent senders.
	 *
	 * Note on byte count management.  As long as the event methods
	 * kevent(2) and select(2) are not protocol specific (yet), we need to
	 * maintain meaningful values on the receive buffer.  So, the receive
	 * buffer accumulates counters from all connected buffers, potentially
	 * having sb_ccc > sb_hiwat or sb_mbcnt > sb_mbmax.
	 */
2093 so2 = unp2->unp_socket;
2094 sb = (addr == NULL) ? &so->so_snd : &so2->so_rcv;
2095 SOCK_RECVBUF_LOCK(so2);
2096 if (uipc_dgram_sbspace(sb, cc + ctl, mbcnt)) {
2097 if (addr == NULL && STAILQ_EMPTY(&sb->uxdg_mb))
2098 TAILQ_INSERT_HEAD(&so2->so_rcv.uxdg_conns, &so->so_snd,
2099 uxdg_clist);
2100 STAILQ_INSERT_TAIL(&sb->uxdg_mb, f, m_stailqpkt);
2101 sb->uxdg_cc += cc + ctl;
2102 sb->uxdg_ctl += ctl;
2103 sb->uxdg_mbcnt += mbcnt;
2104 so2->so_rcv.sb_acc += cc + ctl;
2105 so2->so_rcv.sb_ccc += cc + ctl;
2106 so2->so_rcv.sb_ctl += ctl;
2107 so2->so_rcv.sb_mbcnt += mbcnt;
2108 sorwakeup_locked(so2);
2109 f = NULL;
2110 } else {
2111 soroverflow_locked(so2);
2112 error = ENOBUFS;
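		/*
		 * If control was attached, detach it back onto cmc so that
		 * the error paths below free the internalized rights; the
		 * name and data mbufs are freed via 'f' at 'out'.
		 */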
2113 if (f->m_next->m_type == MT_CONTROL) {
2114 STAILQ_FIRST(&cmc.mc_q) = f->m_next;
2115 f->m_next = NULL;
2116 }
2117 }
2118
2119 if (addr != NULL)
2120 unp_disconnect(unp, unp2);
2121 else
2122 unp_pcb_unlock_pair(unp, unp2);
2123
2124 td->td_ru.ru_msgsnd++;
2125
2126 out3:
2127 SOCK_IO_SEND_UNLOCK(so);
2128 out2:
2129 if (!mc_empty(&cmc))
2130 unp_scan(mc_first(&cmc), unp_freerights);
2131 out:
2132 if (f)
2133 m_freem(f);
2134 mc_freem(&cmc);
2135 if (m)
2136 m_freem(m);
2137
2138 return (error);
2139 }
2140
/*
 * PF_UNIX/SOCK_DGRAM receive with MSG_PEEK.
 * The mbuf has already been unlinked from the uxdg_mb list of the socket
 * buffer and needs to be linked onto uxdg_peeked of the receive socket
 * buffer.
 */
2146 static int
uipc_peek_dgram(struct socket *so, struct mbuf *m, struct sockaddr **psa,
2148 struct uio *uio, struct mbuf **controlp, int *flagsp)
2149 {
2150 ssize_t len = 0;
2151 int error;
2152
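	/*
	 * The datagram stays on the socket: account it against the receive
	 * buffer, where it will be found first by the next receive.
	 */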
2153 so->so_rcv.uxdg_peeked = m;
2154 so->so_rcv.uxdg_cc += m->m_pkthdr.len;
2155 so->so_rcv.uxdg_ctl += m->m_pkthdr.ctllen;
2156 so->so_rcv.uxdg_mbcnt += m->m_pkthdr.memlen;
2157 SOCK_RECVBUF_UNLOCK(so);
2158
2159 KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
2160 if (psa != NULL)
2161 *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
2162
2163 m = m->m_next;
2164 KASSERT(m, ("%s: no data or control after soname", __func__));
2165
2166 /*
2167 * With MSG_PEEK the control isn't executed, just copied.
2168 */
2169 while (m != NULL && m->m_type == MT_CONTROL) {
2170 if (controlp != NULL) {
2171 *controlp = m_copym(m, 0, m->m_len, M_WAITOK);
2172 controlp = &(*controlp)->m_next;
2173 }
2174 m = m->m_next;
2175 }
2176 KASSERT(m == NULL || m->m_type == MT_DATA,
2177 ("%s: not MT_DATA mbuf %p", __func__, m));
2178 while (m != NULL && uio->uio_resid > 0) {
2179 len = uio->uio_resid;
2180 if (len > m->m_len)
2181 len = m->m_len;
2182 error = uiomove(mtod(m, char *), (int)len, uio);
2183 if (error) {
2184 SOCK_IO_RECV_UNLOCK(so);
2185 return (error);
2186 }
2187 if (len == m->m_len)
2188 m = m->m_next;
2189 }
2190 SOCK_IO_RECV_UNLOCK(so);
2191
2192 if (flagsp != NULL) {
2193 if (m != NULL) {
2194 if (*flagsp & MSG_TRUNC) {
2195 /* Report real length of the packet */
2196 uio->uio_resid -= m_length(m, NULL) - len;
2197 }
2198 *flagsp |= MSG_TRUNC;
2199 } else
2200 *flagsp &= ~MSG_TRUNC;
2201 }
2202
2203 return (0);
2204 }
2205
2206 /*
2207 * PF_UNIX/SOCK_DGRAM receive
2208 */
2209 static int
uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
2211 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2212 {
2213 struct sockbuf *sb = NULL;
2214 struct mbuf *m;
2215 int flags, error;
2216 ssize_t len = 0;
2217 bool nonblock;
2218
2219 MPASS(mp0 == NULL);
2220
2221 if (psa != NULL)
2222 *psa = NULL;
2223 if (controlp != NULL)
2224 *controlp = NULL;
2225
2226 flags = flagsp != NULL ? *flagsp : 0;
2227 nonblock = (so->so_state & SS_NBIO) ||
2228 (flags & (MSG_DONTWAIT | MSG_NBIO));
2229
2230 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
2231 if (__predict_false(error))
2232 return (error);
2233
2234 /*
2235 * Loop blocking while waiting for a datagram. Prioritize connected
2236 * peers over unconnected sends. Set sb to selected socket buffer
2237 * containing an mbuf on exit from the wait loop. A datagram that
2238 * had already been peeked at has top priority.
2239 */
2240 SOCK_RECVBUF_LOCK(so);
2241 while ((m = so->so_rcv.uxdg_peeked) == NULL &&
2242 (sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) == NULL &&
2243 (m = STAILQ_FIRST(&so->so_rcv.uxdg_mb)) == NULL) {
2244 if (so->so_error) {
2245 error = so->so_error;
2246 if (!(flags & MSG_PEEK))
2247 so->so_error = 0;
2248 SOCK_RECVBUF_UNLOCK(so);
2249 SOCK_IO_RECV_UNLOCK(so);
2250 return (error);
2251 }
2252 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
2253 uio->uio_resid == 0) {
2254 SOCK_RECVBUF_UNLOCK(so);
2255 SOCK_IO_RECV_UNLOCK(so);
2256 return (0);
2257 }
2258 if (nonblock) {
2259 SOCK_RECVBUF_UNLOCK(so);
2260 SOCK_IO_RECV_UNLOCK(so);
2261 return (EWOULDBLOCK);
2262 }
2263 error = sbwait(so, SO_RCV);
2264 if (error) {
2265 SOCK_RECVBUF_UNLOCK(so);
2266 SOCK_IO_RECV_UNLOCK(so);
2267 return (error);
2268 }
2269 }
2270
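	/*
	 * Three possible sources, checked in priority order above: a
	 * previously peeked datagram ('m' set, use so_rcv), the first
	 * connected peer's send buffer ('sb' set), or an unconnected send
	 * queued directly on so_rcv.
	 */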
2271 if (sb == NULL)
2272 sb = &so->so_rcv;
2273 else if (m == NULL)
2274 m = STAILQ_FIRST(&sb->uxdg_mb);
2275 else
2276 MPASS(m == so->so_rcv.uxdg_peeked);
2277
2278 MPASS(sb->uxdg_cc > 0);
2279 M_ASSERTPKTHDR(m);
2280 KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
2281
2282 if (uio->uio_td)
2283 uio->uio_td->td_ru.ru_msgrcv++;
2284
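	/*
	 * Unlink the datagram.  If it came from a connected peer's send
	 * buffer that is now empty, drop that buffer from the list of
	 * connected buffers hanging off our receive buffer.
	 */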
2285 if (__predict_true(m != so->so_rcv.uxdg_peeked)) {
2286 STAILQ_REMOVE_HEAD(&sb->uxdg_mb, m_stailqpkt);
2287 if (STAILQ_EMPTY(&sb->uxdg_mb) && sb != &so->so_rcv)
2288 TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist);
2289 } else
2290 so->so_rcv.uxdg_peeked = NULL;
2291
2292 sb->uxdg_cc -= m->m_pkthdr.len;
2293 sb->uxdg_ctl -= m->m_pkthdr.ctllen;
2294 sb->uxdg_mbcnt -= m->m_pkthdr.memlen;
2295
2296 if (__predict_false(flags & MSG_PEEK))
2297 return (uipc_peek_dgram(so, m, psa, uio, controlp, flagsp));
2298
2299 so->so_rcv.sb_acc -= m->m_pkthdr.len;
2300 so->so_rcv.sb_ccc -= m->m_pkthdr.len;
2301 so->so_rcv.sb_ctl -= m->m_pkthdr.ctllen;
2302 so->so_rcv.sb_mbcnt -= m->m_pkthdr.memlen;
2303 SOCK_RECVBUF_UNLOCK(so);
2304
2305 if (psa != NULL)
2306 *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
2307 m = m_free(m);
2308 KASSERT(m, ("%s: no data or control after soname", __func__));
2309
2310 /*
2311 * Packet to copyout() is now in 'm' and it is disconnected from the
2312 * queue.
2313 *
2314 * Process one or more MT_CONTROL mbufs present before any data mbufs
2315 * in the first mbuf chain on the socket buffer. We call into the
2316 * unp_externalize() to perform externalization (or freeing if
2317 * controlp == NULL). In some cases there can be only MT_CONTROL mbufs
2318 * without MT_DATA mbufs.
2319 */
2320 while (m != NULL && m->m_type == MT_CONTROL) {
2321 error = unp_externalize(m, controlp, flags);
2322 m = m_free(m);
2323 if (error != 0) {
2324 SOCK_IO_RECV_UNLOCK(so);
2325 unp_scan(m, unp_freerights);
2326 m_freem(m);
2327 return (error);
2328 }
2329 if (controlp != NULL) {
2330 while (*controlp != NULL)
2331 controlp = &(*controlp)->m_next;
2332 }
2333 }
2334 KASSERT(m == NULL || m->m_type == MT_DATA,
2335 ("%s: not MT_DATA mbuf %p", __func__, m));
2336 while (m != NULL && uio->uio_resid > 0) {
2337 len = uio->uio_resid;
2338 if (len > m->m_len)
2339 len = m->m_len;
2340 error = uiomove(mtod(m, char *), (int)len, uio);
2341 if (error) {
2342 SOCK_IO_RECV_UNLOCK(so);
2343 m_freem(m);
2344 return (error);
2345 }
2346 if (len == m->m_len)
2347 m = m_free(m);
2348 else {
2349 m->m_data += len;
2350 m->m_len -= len;
2351 }
2352 }
2353 SOCK_IO_RECV_UNLOCK(so);
2354
2355 if (m != NULL) {
2356 if (flagsp != NULL) {
2357 if (flags & MSG_TRUNC) {
2358 /* Report real length of the packet */
2359 uio->uio_resid -= m_length(m, NULL);
2360 }
2361 *flagsp |= MSG_TRUNC;
2362 }
2363 m_freem(m);
2364 } else if (flagsp != NULL)
2365 *flagsp &= ~MSG_TRUNC;
2366
2367 return (0);
2368 }
2369
2370 static int
uipc_sendfile_wait(struct socket *so, off_t need, int *space)
2372 {
2373 struct unpcb *unp2;
2374 struct socket *so2;
2375 struct sockbuf *sb;
2376 bool nonblock, sockref;
2377 int error;
2378
2379 MPASS(so->so_type == SOCK_STREAM);
2380 MPASS(need > 0);
2381 MPASS(space != NULL);
2382
2383 nonblock = so->so_state & SS_NBIO;
2384 sockref = false;
2385
2386 if (__predict_false((so->so_state & SS_ISCONNECTED) == 0))
2387 return (ENOTCONN);
2388
2389 if (__predict_false((error = uipc_lock_peer(so, &unp2)) != 0))
2390 return (error);
2391
2392 so2 = unp2->unp_socket;
2393 sb = &so2->so_rcv;
2394 SOCK_RECVBUF_LOCK(so2);
2395 UNP_PCB_UNLOCK(unp2);
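	/*
	 * Wait until the peer's receive buffer can absorb the whole request,
	 * or at least half of our send high-water mark.
	 */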
2396 while ((*space = uipc_stream_sbspace(sb)) < need &&
2397 (*space < so->so_snd.sb_hiwat / 2)) {
2398 UIPC_STREAM_SBCHECK(sb);
2399 if (nonblock) {
2400 SOCK_RECVBUF_UNLOCK(so2);
2401 return (EAGAIN);
2402 }
		if (!sockref) {
			/*
			 * Hold a reference across sbwait(), which drops the
			 * receive buffer lock while sleeping, so that the
			 * peer socket can't be freed under us.
			 */
			soref(so2);
			sockref = true;
		}
2405 error = sbwait(so2, SO_RCV);
2406 if (error == 0 &&
2407 __predict_false(sb->sb_state & SBS_CANTRCVMORE))
2408 error = EPIPE;
2409 if (error) {
2410 SOCK_RECVBUF_UNLOCK(so2);
2411 sorele(so2);
2412 return (error);
2413 }
2414 }
2415 UIPC_STREAM_SBCHECK(sb);
2416 SOCK_RECVBUF_UNLOCK(so2);
2417 if (sockref)
2418 sorele(so2);
2419
2420 return (0);
2421 }
2422
/*
 * Although this is a pr_send method, for unix(4) it is called only via the
 * sendfile(2) path.  This means we can be sure that mbufs are clear of any
 * extra flags and don't require any conditioning.
 */
2428 static int
uipc_sendfile(struct socket *so, int flags, struct mbuf *m,
2430 struct sockaddr *from, struct mbuf *control, struct thread *td)
2431 {
2432 struct mchain mc;
2433 struct unpcb *unp2;
2434 struct socket *so2;
2435 struct sockbuf *sb;
2436 bool notready, wakeup;
2437 int error;
2438
2439 MPASS(so->so_type == SOCK_STREAM);
2440 MPASS(from == NULL && control == NULL);
2441 KASSERT(!(m->m_flags & M_EXTPG),
2442 ("unix(4): TLS sendfile(2) not supported"));
2443
2444 notready = flags & PRUS_NOTREADY;
2445
2446 if (__predict_false((so->so_state & SS_ISCONNECTED) == 0)) {
2447 error = ENOTCONN;
2448 goto out;
2449 }
2450
2451 if (__predict_false((error = uipc_lock_peer(so, &unp2)) != 0))
2452 goto out;
2453
2454 mc_init_m(&mc, m);
2455
2456 so2 = unp2->unp_socket;
2457 sb = &so2->so_rcv;
2458 SOCK_RECVBUF_LOCK(so2);
2459 UNP_PCB_UNLOCK(unp2);
2460 UIPC_STREAM_SBCHECK(sb);
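	/*
	 * sb_ccc accounts for everything appended, while sb_acc only grows by
	 * bytes that are ready for the reader.  uxst_fnrdy points at the
	 * first not-ready mbuf; while it is set, appends do not advance
	 * sb_acc and the reader is not woken up.
	 */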
2461 sb->sb_ccc += mc.mc_len;
2462 sb->sb_mbcnt += mc.mc_mlen;
2463 if (sb->uxst_fnrdy == NULL) {
2464 if (notready) {
2465 wakeup = false;
2466 STAILQ_FOREACH(m, &mc.mc_q, m_stailq) {
2467 if (m->m_flags & M_NOTREADY) {
2468 sb->uxst_fnrdy = m;
2469 break;
2470 } else {
2471 sb->sb_acc += m->m_len;
2472 wakeup = true;
2473 }
2474 }
2475 } else {
2476 wakeup = true;
2477 sb->sb_acc += mc.mc_len;
2478 }
2479 } else {
2480 wakeup = false;
2481 }
2482 STAILQ_CONCAT(&sb->uxst_mbq, &mc.mc_q);
2483 UIPC_STREAM_SBCHECK(sb);
2484 if (wakeup)
2485 sorwakeup_locked(so2);
2486 else
2487 SOCK_RECVBUF_UNLOCK(so2);
2488
2489 return (0);
2490 out:
2491 /*
2492 * In case of not ready data, uipc_ready() is responsible
2493 * for freeing memory.
2494 */
2495 if (m != NULL && !notready)
2496 m_freem(m);
2497
2498 return (error);
2499 }
2500
2501 static int
uipc_sbready(struct sockbuf *sb, struct mbuf *m, int count)
2503 {
2504 bool blocker;
2505
2506 /* assert locked */
2507
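	/*
	 * Mark up to 'count' mbufs starting at 'm' as ready.  If 'm' was the
	 * blocker (the first not-ready mbuf), advance uxst_fnrdy past all
	 * data that is now ready, crediting it to sb_acc; otherwise an
	 * earlier blocker still holds the reader back and we report
	 * EINPROGRESS.
	 */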
2508 blocker = (sb->uxst_fnrdy == m);
2509 STAILQ_FOREACH_FROM(m, &sb->uxst_mbq, m_stailq) {
2510 if (count > 0) {
2511 MPASS(m->m_flags & M_NOTREADY);
2512 m->m_flags &= ~M_NOTREADY;
2513 if (blocker)
2514 sb->sb_acc += m->m_len;
2515 count--;
2516 } else if (m->m_flags & M_NOTREADY)
2517 break;
2518 else if (blocker)
2519 sb->sb_acc += m->m_len;
2520 }
2521 if (blocker) {
2522 sb->uxst_fnrdy = m;
2523 return (0);
2524 } else
2525 return (EINPROGRESS);
2526 }
2527
2528 static bool
uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
2530 {
2531 struct mbuf *mb;
2532 struct sockbuf *sb;
2533
2534 SOCK_LOCK(so);
2535 if (SOLISTENING(so)) {
2536 SOCK_UNLOCK(so);
2537 return (false);
2538 }
2539 mb = NULL;
2540 sb = &so->so_rcv;
2541 SOCK_RECVBUF_LOCK(so);
2542 if (sb->uxst_fnrdy != NULL) {
2543 STAILQ_FOREACH(mb, &sb->uxst_mbq, m_stailq) {
2544 if (mb == m) {
2545 *errorp = uipc_sbready(sb, m, count);
2546 break;
2547 }
2548 }
2549 }
2550 SOCK_RECVBUF_UNLOCK(so);
2551 SOCK_UNLOCK(so);
2552 return (mb != NULL);
2553 }
2554
2555 static int
uipc_ready(struct socket *so, struct mbuf *m, int count)
2557 {
2558 struct unpcb *unp, *unp2;
2559 int error;
2560
2561 MPASS(so->so_type == SOCK_STREAM);
2562
2563 if (__predict_true(uipc_lock_peer(so, &unp2) == 0)) {
2564 struct socket *so2;
2565 struct sockbuf *sb;
2566
2567 so2 = unp2->unp_socket;
2568 sb = &so2->so_rcv;
2569 SOCK_RECVBUF_LOCK(so2);
2570 UNP_PCB_UNLOCK(unp2);
2571 UIPC_STREAM_SBCHECK(sb);
2572 error = uipc_sbready(sb, m, count);
2573 UIPC_STREAM_SBCHECK(sb);
2574 if (error == 0)
2575 sorwakeup_locked(so2);
2576 else
2577 SOCK_RECVBUF_UNLOCK(so2);
2578 } else {
2579 /*
2580 * The receiving socket has been disconnected, but may still
2581 * be valid. In this case, the not-ready mbufs are still
2582 * present in its socket buffer, so perform an exhaustive
2583 * search before giving up and freeing the mbufs.
2584 */
2585 UNP_LINK_RLOCK();
2586 LIST_FOREACH(unp, &unp_shead, unp_link) {
2587 if (uipc_ready_scan(unp->unp_socket, m, count, &error))
2588 break;
2589 }
2590 UNP_LINK_RUNLOCK();
2591
2592 if (unp == NULL) {
2593 for (int i = 0; i < count; i++)
2594 m = m_free(m);
2595 return (ECONNRESET);
2596 }
2597 }
2598 return (error);
2599 }
2600
2601 static int
uipc_sense(struct socket *so, struct stat *sb)
2603 {
2604 struct unpcb *unp;
2605
2606 unp = sotounpcb(so);
2607 KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
2608
2609 sb->st_blksize = so->so_snd.sb_hiwat;
2610 sb->st_dev = NODEV;
2611 sb->st_ino = unp->unp_ino;
2612 return (0);
2613 }
2614
2615 static int
uipc_shutdown(struct socket *so, enum shutdown_how how)
2617 {
2618 struct unpcb *unp = sotounpcb(so);
2619 int error;
2620
2621 SOCK_LOCK(so);
2622 if (SOLISTENING(so)) {
2623 if (how != SHUT_WR) {
2624 so->so_error = ECONNABORTED;
2625 solisten_wakeup(so); /* unlocks so */
2626 } else
2627 SOCK_UNLOCK(so);
2628 return (ENOTCONN);
2629 } else if ((so->so_state &
2630 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
		/*
		 * POSIX mandates us to just return ENOTCONN when shutdown(2)
		 * is invoked on a datagram socket.  However, historically we
		 * would actually tear the socket down.  This is known to be
		 * leveraged by some applications to unblock a process waiting
		 * in recv(2) by another process that shares the socket with
		 * it.  Try to meet both backward-compatibility and POSIX
		 * requirements by forcing ENOTCONN but still flushing buffers
		 * and performing wakeup(9).
		 *
		 * XXXGL: it remains unknown which applications expect this
		 * behavior and whether it is isolated to unix/dgram or
		 * inet/dgram or both.  See: D10351, D3039.
		 */
2644 error = ENOTCONN;
2645 if (so->so_type != SOCK_DGRAM) {
2646 SOCK_UNLOCK(so);
2647 return (error);
2648 }
2649 } else
2650 error = 0;
2651 SOCK_UNLOCK(so);
2652
2653 switch (how) {
2654 case SHUT_RD:
2655 if (so->so_type == SOCK_DGRAM)
2656 socantrcvmore(so);
2657 else
2658 uipc_cantrcvmore(so);
2659 unp_dispose(so);
2660 break;
2661 case SHUT_RDWR:
2662 if (so->so_type == SOCK_DGRAM)
2663 socantrcvmore(so);
2664 else
2665 uipc_cantrcvmore(so);
2666 unp_dispose(so);
2667 /* FALLTHROUGH */
2668 case SHUT_WR:
2669 if (so->so_type == SOCK_DGRAM) {
2670 socantsendmore(so);
2671 } else {
2672 UNP_PCB_LOCK(unp);
2673 if (unp->unp_conn != NULL)
2674 uipc_cantrcvmore(unp->unp_conn->unp_socket);
2675 UNP_PCB_UNLOCK(unp);
2676 }
2677 }
2678 wakeup(&so->so_timeo);
2679
2680 return (error);
2681 }
2682
2683 static int
uipc_sockaddr(struct socket *so, struct sockaddr *ret)
2685 {
2686 struct unpcb *unp;
2687 const struct sockaddr *sa;
2688
2689 unp = sotounpcb(so);
2690 KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
2691
2692 UNP_PCB_LOCK(unp);
2693 if (unp->unp_addr != NULL)
2694 sa = (struct sockaddr *) unp->unp_addr;
2695 else
2696 sa = &sun_noname;
2697 bcopy(sa, ret, sa->sa_len);
2698 UNP_PCB_UNLOCK(unp);
2699 return (0);
2700 }
2701
2702 static int
uipc_ctloutput(struct socket *so, struct sockopt *sopt)
2704 {
2705 struct unpcb *unp;
2706 struct xucred xu;
2707 int error, optval;
2708
2709 if (sopt->sopt_level != SOL_LOCAL)
2710 return (EINVAL);
2711
2712 unp = sotounpcb(so);
2713 KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
2714 error = 0;
2715 switch (sopt->sopt_dir) {
2716 case SOPT_GET:
2717 switch (sopt->sopt_name) {
2718 case LOCAL_PEERCRED:
2719 UNP_PCB_LOCK(unp);
2720 if (unp->unp_flags & UNP_HAVEPC)
2721 xu = unp->unp_peercred;
2722 else {
2723 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
2724 error = ENOTCONN;
2725 else
2726 error = EINVAL;
2727 }
2728 UNP_PCB_UNLOCK(unp);
2729 if (error == 0)
2730 error = sooptcopyout(sopt, &xu, sizeof(xu));
2731 break;
2732
2733 case LOCAL_CREDS:
2734 /* Unlocked read. */
2735 optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0;
2736 error = sooptcopyout(sopt, &optval, sizeof(optval));
2737 break;
2738
2739 case LOCAL_CREDS_PERSISTENT:
2740 /* Unlocked read. */
2741 optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0;
2742 error = sooptcopyout(sopt, &optval, sizeof(optval));
2743 break;
2744
2745 default:
2746 error = EOPNOTSUPP;
2747 break;
2748 }
2749 break;
2750
2751 case SOPT_SET:
2752 switch (sopt->sopt_name) {
2753 case LOCAL_CREDS:
2754 case LOCAL_CREDS_PERSISTENT:
2755 error = sooptcopyin(sopt, &optval, sizeof(optval),
2756 sizeof(optval));
2757 if (error)
2758 break;
2759
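/*
 * OPTSET sets or clears 'bit' under the PCB lock, failing with EINVAL if the
 * mutually 'exclusive' flag is already set, so that LOCAL_CREDS and
 * LOCAL_CREDS_PERSISTENT cannot be enabled at the same time.
 */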
2760 #define OPTSET(bit, exclusive) do { \
2761 UNP_PCB_LOCK(unp); \
2762 if (optval) { \
2763 if ((unp->unp_flags & (exclusive)) != 0) { \
2764 UNP_PCB_UNLOCK(unp); \
2765 error = EINVAL; \
2766 break; \
2767 } \
2768 unp->unp_flags |= (bit); \
2769 } else \
2770 unp->unp_flags &= ~(bit); \
2771 UNP_PCB_UNLOCK(unp); \
2772 } while (0)
2773
2774 switch (sopt->sopt_name) {
2775 case LOCAL_CREDS:
2776 OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS);
2777 break;
2778
2779 case LOCAL_CREDS_PERSISTENT:
2780 OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT);
2781 break;
2782
2783 default:
2784 break;
2785 }
2786 break;
2787 #undef OPTSET
2788 default:
2789 error = ENOPROTOOPT;
2790 break;
2791 }
2792 break;
2793
2794 default:
2795 error = EOPNOTSUPP;
2796 break;
2797 }
2798 return (error);
2799 }
2800
2801 static int
unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
2803 {
2804
2805 return (unp_connectat(AT_FDCWD, so, nam, td, false));
2806 }
2807
2808 static int
unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
2810 struct thread *td, bool return_locked)
2811 {
2812 struct mtx *vplock;
2813 struct sockaddr_un *soun;
2814 struct vnode *vp;
2815 struct socket *so2;
2816 struct unpcb *unp, *unp2, *unp3;
2817 struct nameidata nd;
2818 char buf[SOCK_MAXADDRLEN];
2819 struct sockaddr *sa;
2820 cap_rights_t rights;
2821 int error, len;
2822 bool connreq;
2823
2824 CURVNET_ASSERT_SET();
2825
2826 if (nam->sa_family != AF_UNIX)
2827 return (EAFNOSUPPORT);
2828 if (nam->sa_len > sizeof(struct sockaddr_un))
2829 return (EINVAL);
2830 len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
2831 if (len <= 0)
2832 return (EINVAL);
2833 soun = (struct sockaddr_un *)nam;
2834 bcopy(soun->sun_path, buf, len);
2835 buf[len] = 0;
2836
2837 error = 0;
2838 unp = sotounpcb(so);
2839 UNP_PCB_LOCK(unp);
2840 for (;;) {
2841 /*
2842 * Wait for connection state to stabilize. If a connection
2843 * already exists, give up. For datagram sockets, which permit
2844 * multiple consecutive connect(2) calls, upper layers are
2845 * responsible for disconnecting in advance of a subsequent
2846 * connect(2), but this is not synchronized with PCB connection
2847 * state.
2848 *
2849 * Also make sure that no threads are currently attempting to
2850 * lock the peer socket, to ensure that unp_conn cannot
2851 * transition between two valid sockets while locks are dropped.
2852 */
2853 if (SOLISTENING(so))
2854 error = EOPNOTSUPP;
2855 else if (unp->unp_conn != NULL)
2856 error = EISCONN;
2857 else if ((unp->unp_flags & UNP_CONNECTING) != 0) {
2858 error = EALREADY;
2859 }
2860 if (error != 0) {
2861 UNP_PCB_UNLOCK(unp);
2862 return (error);
2863 }
2864 if (unp->unp_pairbusy > 0) {
2865 unp->unp_flags |= UNP_WAITING;
2866 mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0);
2867 continue;
2868 }
2869 break;
2870 }
2871 unp->unp_flags |= UNP_CONNECTING;
2872 UNP_PCB_UNLOCK(unp);
2873
2874 connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0;
2875 if (connreq)
2876 sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
2877 else
2878 sa = NULL;
2879 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
2880 UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT));
2881 error = namei(&nd);
2882 if (error)
2883 vp = NULL;
2884 else
2885 vp = nd.ni_vp;
2886 ASSERT_VOP_LOCKED(vp, "unp_connect");
2887 if (error)
2888 goto bad;
2889 NDFREE_PNBUF(&nd);
2890
2891 if (vp->v_type != VSOCK) {
2892 error = ENOTSOCK;
2893 goto bad;
2894 }
2895 #ifdef MAC
2896 error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
2897 if (error)
2898 goto bad;
2899 #endif
2900 error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
2901 if (error)
2902 goto bad;
2903
2904 unp = sotounpcb(so);
2905 KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
2906
2907 vplock = mtx_pool_find(unp_vp_mtxpool, vp);
2908 mtx_lock(vplock);
2909 VOP_UNP_CONNECT(vp, &unp2);
2910 if (unp2 == NULL) {
2911 error = ECONNREFUSED;
2912 goto bad2;
2913 }
2914 so2 = unp2->unp_socket;
2915 if (so->so_type != so2->so_type) {
2916 error = EPROTOTYPE;
2917 goto bad2;
2918 }
2919 if (connreq) {
2920 if (SOLISTENING(so2))
2921 so2 = solisten_clone(so2);
2922 else
2923 so2 = NULL;
2924 if (so2 == NULL) {
2925 error = ECONNREFUSED;
2926 goto bad2;
2927 }
2928 if ((error = uipc_attach(so2, 0, NULL)) != 0) {
2929 sodealloc(so2);
2930 goto bad2;
2931 }
2932 unp3 = sotounpcb(so2);
2933 unp_pcb_lock_pair(unp2, unp3);
2934 if (unp2->unp_addr != NULL) {
2935 bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
2936 unp3->unp_addr = (struct sockaddr_un *) sa;
2937 sa = NULL;
2938 }
2939
2940 unp_copy_peercred(td, unp3, unp, unp2);
2941
2942 UNP_PCB_UNLOCK(unp2);
2943 unp2 = unp3;
2944
2945 /*
2946 * It is safe to block on the PCB lock here since unp2 is
2947 * nascent and cannot be connected to any other sockets.
2948 */
2949 UNP_PCB_LOCK(unp);
2950 #ifdef MAC
2951 mac_socketpeer_set_from_socket(so, so2);
2952 mac_socketpeer_set_from_socket(so2, so);
2953 #endif
2954 } else {
2955 unp_pcb_lock_pair(unp, unp2);
2956 }
2957 KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
2958 sotounpcb(so2) == unp2,
2959 ("%s: unp2 %p so2 %p", __func__, unp2, so2));
2960 unp_connect2(so, so2, connreq);
2961 if (connreq)
2962 (void)solisten_enqueue(so2, SS_ISCONNECTED);
2963 KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
2964 ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
2965 unp->unp_flags &= ~UNP_CONNECTING;
2966 if (!return_locked)
2967 unp_pcb_unlock_pair(unp, unp2);
2968 bad2:
2969 mtx_unlock(vplock);
2970 bad:
2971 if (vp != NULL) {
2972 /*
2973 * If we are returning locked (called via uipc_sosend_dgram()),
2974 * we need to be sure that vput() won't sleep. This is
2975 * guaranteed by VOP_UNP_CONNECT() call above and unp2 lock.
2976 * SOCK_STREAM/SEQPACKET can't request return_locked (yet).
2977 */
2978 MPASS(!(return_locked && connreq));
2979 vput(vp);
2980 }
2981 free(sa, M_SONAME);
2982 if (__predict_false(error)) {
2983 UNP_PCB_LOCK(unp);
2984 KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
2985 ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
2986 unp->unp_flags &= ~UNP_CONNECTING;
2987 UNP_PCB_UNLOCK(unp);
2988 }
2989 return (error);
2990 }
2991
2992 /*
2993 * Set socket peer credentials at connection time.
2994 *
2995 * The client's PCB credentials are copied from its process structure. The
2996 * server's PCB credentials are copied from the socket on which it called
2997 * listen(2). uipc_listen cached that process's credentials at the time.
2998 */
2999 void
unp_copy_peercred(struct thread *td, struct unpcb *client_unp,
3001 struct unpcb *server_unp, struct unpcb *listen_unp)
3002 {
3003 cru2xt(td, &client_unp->unp_peercred);
3004 client_unp->unp_flags |= UNP_HAVEPC;
3005
3006 memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred,
3007 sizeof(server_unp->unp_peercred));
3008 server_unp->unp_flags |= UNP_HAVEPC;
3009 client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK);
3010 }
3011
/*
 * unix/stream & unix/seqpacket version of soisconnected().
 *
 * The crucial thing we are doing here is setting up the uxst_peer linkage,
 * holding the unp and receive buffer locks of both sockets.  The disconnect
 * procedure does the same.  This gives us a safe way to access the peer in
 * send(2) and recv(2) during the socket lifetime.
 *
 * The less important thing is event notification of the fact that a socket is
 * now connected.  It is unusual for software to put a socket into an event
 * mechanism before connect(2), but it is supposed to be supported.  Note that
 * there cannot be any sleeping I/O on the socket yet, only presence in
 * select/poll/kevent.
 *
 * This function can be called via two call paths:
 * 1) socketpair(2) - in this case the socket has not yet been reported to
 *    userland and just can't have any event notification mechanisms set up.
 *    The 'wakeup' boolean is always false.
 * 2) connect(2) of an existing socket to a recent clone of a listener:
 *    2.1) The socket that connect(2)s will have 'wakeup' true.  An
 *         application could have already put it into an event mechanism, so
 *         it shall be reported as readable and as writable.
 *    2.2) The socket that was just cloned with solisten_clone().  Same as 1).
 */
3036 static void
unp_soisconnected(struct socket *so, bool wakeup)
3038 {
3039 struct socket *so2 = sotounpcb(so)->unp_conn->unp_socket;
3040 struct sockbuf *sb;
3041
3042 SOCK_LOCK_ASSERT(so);
3043 UNP_PCB_LOCK_ASSERT(sotounpcb(so));
3044 UNP_PCB_LOCK_ASSERT(sotounpcb(so2));
3045 SOCK_RECVBUF_LOCK_ASSERT(so);
3046 SOCK_RECVBUF_LOCK_ASSERT(so2);
3047
3048 MPASS(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET);
3049 MPASS((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
3050 SS_ISDISCONNECTING)) == 0);
3051 MPASS(so->so_qstate == SQ_NONE);
3052
3053 so->so_state &= ~SS_ISDISCONNECTED;
3054 so->so_state |= SS_ISCONNECTED;
3055
3056 sb = &so2->so_rcv;
3057 sb->uxst_peer = so;
3058
3059 if (wakeup) {
3060 KNOTE_LOCKED(&sb->sb_sel->si_note, 0);
3061 sb = &so->so_rcv;
3062 selwakeuppri(sb->sb_sel, PSOCK);
3063 SOCK_SENDBUF_LOCK_ASSERT(so);
3064 sb = &so->so_snd;
3065 selwakeuppri(sb->sb_sel, PSOCK);
3066 SOCK_SENDBUF_UNLOCK(so);
3067 }
3068 }
3069
3070 static void
unp_connect2(struct socket *so, struct socket *so2, bool wakeup)
3072 {
3073 struct unpcb *unp;
3074 struct unpcb *unp2;
3075
3076 MPASS(so2->so_type == so->so_type);
3077 unp = sotounpcb(so);
3078 KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
3079 unp2 = sotounpcb(so2);
3080 KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
3081
3082 UNP_PCB_LOCK_ASSERT(unp);
3083 UNP_PCB_LOCK_ASSERT(unp2);
3084 KASSERT(unp->unp_conn == NULL,
3085 ("%s: socket %p is already connected", __func__, unp));
3086
3087 unp->unp_conn = unp2;
3088 unp_pcb_hold(unp2);
3089 unp_pcb_hold(unp);
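	/*
	 * Both PCBs hold a reference on each other; the references are
	 * dropped in unp_disconnect().
	 */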
3090 switch (so->so_type) {
3091 case SOCK_DGRAM:
3092 UNP_REF_LIST_LOCK();
3093 LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
3094 UNP_REF_LIST_UNLOCK();
3095 soisconnected(so);
3096 break;
3097
3098 case SOCK_STREAM:
3099 case SOCK_SEQPACKET:
3100 KASSERT(unp2->unp_conn == NULL,
3101 ("%s: socket %p is already connected", __func__, unp2));
3102 unp2->unp_conn = unp;
3103 SOCK_LOCK(so);
3104 SOCK_LOCK(so2);
3105 if (wakeup) /* Avoid LOR with receive buffer lock. */
3106 SOCK_SENDBUF_LOCK(so);
3107 SOCK_RECVBUF_LOCK(so);
3108 SOCK_RECVBUF_LOCK(so2);
3109 unp_soisconnected(so, wakeup); /* Will unlock send buffer. */
3110 unp_soisconnected(so2, false);
3111 SOCK_RECVBUF_UNLOCK(so);
3112 SOCK_RECVBUF_UNLOCK(so2);
3113 SOCK_UNLOCK(so);
3114 SOCK_UNLOCK(so2);
3115 break;
3116
3117 default:
3118 panic("unp_connect2");
3119 }
3120 }
3121
3122 static void
unp_soisdisconnected(struct socket *so)
3124 {
3125 SOCK_LOCK_ASSERT(so);
3126 SOCK_RECVBUF_LOCK_ASSERT(so);
3127 MPASS(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET);
3128 MPASS(!SOLISTENING(so));
3129 MPASS((so->so_state & (SS_ISCONNECTING | SS_ISDISCONNECTING |
3130 SS_ISDISCONNECTED)) == 0);
3131 MPASS(so->so_state & SS_ISCONNECTED);
3132
3133 so->so_state |= SS_ISDISCONNECTED;
3134 so->so_state &= ~SS_ISCONNECTED;
3135 so->so_rcv.uxst_peer = NULL;
3136 socantrcvmore_locked(so);
3137 }
3138
3139 static void
unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
3141 {
3142 struct socket *so, *so2;
3143 struct mbuf *m = NULL;
3144 #ifdef INVARIANTS
3145 struct unpcb *unptmp;
3146 #endif
3147
3148 UNP_PCB_LOCK_ASSERT(unp);
3149 UNP_PCB_LOCK_ASSERT(unp2);
3150 KASSERT(unp->unp_conn == unp2,
3151 ("%s: unpcb %p is not connected to %p", __func__, unp, unp2));
3152
3153 unp->unp_conn = NULL;
3154 so = unp->unp_socket;
3155 so2 = unp2->unp_socket;
3156 switch (unp->unp_socket->so_type) {
3157 case SOCK_DGRAM:
3158 /*
3159 * Remove our send socket buffer from the peer's receive buffer.
3160 * Move the data to the receive buffer only if it is empty.
3161 * This is a protection against a scenario where a peer
3162 * connects, floods and disconnects, effectively blocking
3163 * sendto() from unconnected sockets.
3164 */
3165 SOCK_RECVBUF_LOCK(so2);
3166 if (!STAILQ_EMPTY(&so->so_snd.uxdg_mb)) {
3167 TAILQ_REMOVE(&so2->so_rcv.uxdg_conns, &so->so_snd,
3168 uxdg_clist);
3169 if (__predict_true((so2->so_rcv.sb_state &
3170 SBS_CANTRCVMORE) == 0) &&
3171 STAILQ_EMPTY(&so2->so_rcv.uxdg_mb)) {
3172 STAILQ_CONCAT(&so2->so_rcv.uxdg_mb,
3173 &so->so_snd.uxdg_mb);
3174 so2->so_rcv.uxdg_cc += so->so_snd.uxdg_cc;
3175 so2->so_rcv.uxdg_ctl += so->so_snd.uxdg_ctl;
3176 so2->so_rcv.uxdg_mbcnt += so->so_snd.uxdg_mbcnt;
3177 } else {
3178 m = STAILQ_FIRST(&so->so_snd.uxdg_mb);
3179 STAILQ_INIT(&so->so_snd.uxdg_mb);
3180 so2->so_rcv.sb_acc -= so->so_snd.uxdg_cc;
3181 so2->so_rcv.sb_ccc -= so->so_snd.uxdg_cc;
3182 so2->so_rcv.sb_ctl -= so->so_snd.uxdg_ctl;
3183 so2->so_rcv.sb_mbcnt -= so->so_snd.uxdg_mbcnt;
3184 }
3185 /* Note: so may reconnect. */
3186 so->so_snd.uxdg_cc = 0;
3187 so->so_snd.uxdg_ctl = 0;
3188 so->so_snd.uxdg_mbcnt = 0;
3189 }
3190 SOCK_RECVBUF_UNLOCK(so2);
3191 UNP_REF_LIST_LOCK();
3192 #ifdef INVARIANTS
3193 LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) {
3194 if (unptmp == unp)
3195 break;
3196 }
3197 KASSERT(unptmp != NULL,
3198 ("%s: %p not found in reflist of %p", __func__, unp, unp2));
3199 #endif
3200 LIST_REMOVE(unp, unp_reflink);
3201 UNP_REF_LIST_UNLOCK();
3202 if (so) {
3203 SOCK_LOCK(so);
3204 so->so_state &= ~SS_ISCONNECTED;
3205 SOCK_UNLOCK(so);
3206 }
3207 break;
3208
3209 case SOCK_STREAM:
3210 case SOCK_SEQPACKET:
3211 SOCK_LOCK(so);
3212 SOCK_LOCK(so2);
3213 SOCK_RECVBUF_LOCK(so);
3214 SOCK_RECVBUF_LOCK(so2);
3215 unp_soisdisconnected(so);
3216 MPASS(unp2->unp_conn == unp);
3217 unp2->unp_conn = NULL;
3218 unp_soisdisconnected(so2);
3219 SOCK_UNLOCK(so);
3220 SOCK_UNLOCK(so2);
3221 break;
3222 }
3223
3224 if (unp == unp2) {
3225 unp_pcb_rele_notlast(unp);
3226 if (!unp_pcb_rele(unp))
3227 UNP_PCB_UNLOCK(unp);
3228 } else {
3229 if (!unp_pcb_rele(unp))
3230 UNP_PCB_UNLOCK(unp);
3231 if (!unp_pcb_rele(unp2))
3232 UNP_PCB_UNLOCK(unp2);
3233 }
3234
3235 if (m != NULL) {
3236 unp_scan(m, unp_freerights);
3237 m_freemp(m);
3238 }
3239 }
3240
3241 /*
3242 * unp_pcblist() walks the global list of struct unpcb's to generate a
3243 * pointer list, bumping the refcount on each unpcb. It then copies them out
3244 * sequentially, validating the generation number on each to see if it has
3245 * been detached. All of this is necessary because copyout() may sleep on
3246 * disk I/O.
3247 */
3248 static int
unp_pcblist(SYSCTL_HANDLER_ARGS)
3250 {
3251 struct unpcb *unp, **unp_list;
3252 unp_gen_t gencnt;
3253 struct xunpgen *xug;
3254 struct unp_head *head;
3255 struct xunpcb *xu;
3256 u_int i;
3257 int error, n;
3258
3259 switch ((intptr_t)arg1) {
3260 case SOCK_STREAM:
3261 head = &unp_shead;
3262 break;
3263
3264 case SOCK_DGRAM:
3265 head = &unp_dhead;
3266 break;
3267
3268 case SOCK_SEQPACKET:
3269 head = &unp_sphead;
3270 break;
3271
3272 default:
3273 panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
3274 }
3275
3276 /*
3277 * The process of preparing the PCB list is too time-consuming and
3278 * resource-intensive to repeat twice on every request.
3279 */
3280 if (req->oldptr == NULL) {
3281 n = unp_count;
3282 req->oldidx = 2 * (sizeof *xug)
3283 + (n + n/8) * sizeof(struct xunpcb);
3284 return (0);
3285 }
3286
3287 if (req->newptr != NULL)
3288 return (EPERM);
3289
3290 /*
3291 * OK, now we're committed to doing something.
3292 */
3293 xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO);
3294 UNP_LINK_RLOCK();
3295 gencnt = unp_gencnt;
3296 n = unp_count;
3297 UNP_LINK_RUNLOCK();
3298
3299 xug->xug_len = sizeof *xug;
3300 xug->xug_count = n;
3301 xug->xug_gen = gencnt;
3302 xug->xug_sogen = so_gencnt;
3303 error = SYSCTL_OUT(req, xug, sizeof *xug);
3304 if (error) {
3305 free(xug, M_TEMP);
3306 return (error);
3307 }
3308
3309 unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
3310
3311 UNP_LINK_RLOCK();
3312 for (unp = LIST_FIRST(head), i = 0; unp && i < n;
3313 unp = LIST_NEXT(unp, unp_link)) {
3314 UNP_PCB_LOCK(unp);
3315 if (unp->unp_gencnt <= gencnt) {
3316 if (cr_cansee(req->td->td_ucred,
3317 unp->unp_socket->so_cred)) {
3318 UNP_PCB_UNLOCK(unp);
3319 continue;
3320 }
3321 unp_list[i++] = unp;
3322 unp_pcb_hold(unp);
3323 }
3324 UNP_PCB_UNLOCK(unp);
3325 }
3326 UNP_LINK_RUNLOCK();
3327 n = i; /* In case we lost some during malloc. */
3328
3329 error = 0;
3330 xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
3331 for (i = 0; i < n; i++) {
3332 unp = unp_list[i];
3333 UNP_PCB_LOCK(unp);
3334 if (unp_pcb_rele(unp))
3335 continue;
3336
3337 if (unp->unp_gencnt <= gencnt) {
3338 xu->xu_len = sizeof *xu;
3339 xu->xu_unpp = (uintptr_t)unp;
3340 /*
3341 * XXX - need more locking here to protect against
3342 * connect/disconnect races for SMP.
3343 */
3344 if (unp->unp_addr != NULL)
3345 bcopy(unp->unp_addr, &xu->xu_addr,
3346 unp->unp_addr->sun_len);
3347 else
3348 bzero(&xu->xu_addr, sizeof(xu->xu_addr));
3349 if (unp->unp_conn != NULL &&
3350 unp->unp_conn->unp_addr != NULL)
3351 bcopy(unp->unp_conn->unp_addr,
3352 &xu->xu_caddr,
3353 unp->unp_conn->unp_addr->sun_len);
3354 else
3355 bzero(&xu->xu_caddr, sizeof(xu->xu_caddr));
3356 xu->unp_vnode = (uintptr_t)unp->unp_vnode;
3357 xu->unp_conn = (uintptr_t)unp->unp_conn;
3358 xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs);
3359 xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink);
3360 xu->unp_gencnt = unp->unp_gencnt;
3361 sotoxsocket(unp->unp_socket, &xu->xu_socket);
3362 UNP_PCB_UNLOCK(unp);
3363 error = SYSCTL_OUT(req, xu, sizeof *xu);
3364 } else {
3365 UNP_PCB_UNLOCK(unp);
3366 }
3367 }
3368 free(xu, M_TEMP);
3369 if (!error) {
3370 /*
3371 * Give the user an updated idea of our state. If the
3372 * generation differs from what we told her before, she knows
3373 * that something happened while we were processing this
3374 * request, and it might be necessary to retry.
3375 */
3376 xug->xug_gen = unp_gencnt;
3377 xug->xug_sogen = so_gencnt;
3378 xug->xug_count = unp_count;
3379 error = SYSCTL_OUT(req, xug, sizeof *xug);
3380 }
3381 free(unp_list, M_TEMP);
3382 free(xug, M_TEMP);
3383 return (error);
3384 }
3385
3386 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist,
3387 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
3388 (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
3389 "List of active local datagram sockets");
3390 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist,
3391 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
3392 (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
3393 "List of active local stream sockets");
3394 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
3395 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
3396 (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
3397 "List of active local seqpacket sockets");
3398
3399 static void
unp_drop(struct unpcb *unp)
3401 {
3402 struct socket *so;
3403 struct unpcb *unp2;
3404
3405 /*
3406 * Regardless of whether the socket's peer dropped the connection
3407 * with this socket by aborting or disconnecting, POSIX requires
3408 * that ECONNRESET is returned on next connected send(2) in case of
3409 * a SOCK_DGRAM socket and EPIPE for SOCK_STREAM.
3410 */
3411 UNP_PCB_LOCK(unp);
3412 if ((so = unp->unp_socket) != NULL)
3413 so->so_error =
3414 so->so_proto->pr_type == SOCK_DGRAM ? ECONNRESET : EPIPE;
3415 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
3416 /* Last reference dropped in unp_disconnect(). */
3417 unp_pcb_rele_notlast(unp);
3418 unp_disconnect(unp, unp2);
3419 } else if (!unp_pcb_rele(unp)) {
3420 UNP_PCB_UNLOCK(unp);
3421 }
3422 }
3423
3424 static void
unp_freerights(struct filedescent **fdep, int fdcount)
3426 {
3427 struct file *fp;
3428 int i;
3429
3430 KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
3431
3432 for (i = 0; i < fdcount; i++) {
3433 fp = fdep[i]->fde_file;
3434 filecaps_free(&fdep[i]->fde_caps);
3435 unp_discard(fp);
3436 }
3437 free(fdep[0], M_FILECAPS);
3438 }
3439
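/*
 * Decide whether a descriptor being passed should be installed with
 * O_RESOLVE_BENEATH (see unp_externalize()): restrict it when the sending
 * and receiving credentials belong to different jails that do not share a
 * root, and the receiver is not the host prison.
 */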
3440 static bool
restrict_rights(struct file *fp, struct thread *td)
3442 {
3443 struct prison *prison1, *prison2;
3444
3445 prison1 = fp->f_cred->cr_prison;
3446 prison2 = td->td_ucred->cr_prison;
3447 return (prison1 != prison2 && prison1->pr_root != prison2->pr_root &&
3448 prison2 != &prison0);
3449 }
3450
3451 static int
unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
3453 {
3454 struct thread *td = curthread; /* XXX */
3455 struct cmsghdr *cm = mtod(control, struct cmsghdr *);
3456 int *fdp;
3457 struct filedesc *fdesc = td->td_proc->p_fd;
3458 struct filedescent **fdep;
3459 void *data;
3460 socklen_t clen = control->m_len, datalen;
3461 int error, fdflags, newfds;
3462 u_int newlen;
3463
3464 UNP_LINK_UNLOCK_ASSERT();
3465
3466 fdflags = (flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
3467
3468 error = 0;
3469 if (controlp != NULL) /* controlp == NULL => free control messages */
3470 *controlp = NULL;
3471 while (cm != NULL) {
3472 MPASS(clen >= sizeof(*cm) && clen >= cm->cmsg_len);
3473
3474 data = CMSG_DATA(cm);
3475 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
3476 if (cm->cmsg_level == SOL_SOCKET
3477 && cm->cmsg_type == SCM_RIGHTS) {
3478 newfds = datalen / sizeof(*fdep);
3479 if (newfds == 0)
3480 goto next;
3481 fdep = data;
3482
			/* If we're not outputting the descriptors, free them. */
3484 if (error || controlp == NULL) {
3485 unp_freerights(fdep, newfds);
3486 goto next;
3487 }
3488 FILEDESC_XLOCK(fdesc);
3489
3490 /*
3491 * Now change each pointer to an fd in the global
3492 * table to an integer that is the index to the local
3493 * fd table entry that we set up to point to the
3494 * global one we are transferring.
3495 */
3496 newlen = newfds * sizeof(int);
3497 *controlp = sbcreatecontrol(NULL, newlen,
3498 SCM_RIGHTS, SOL_SOCKET, M_WAITOK);
3499
3500 fdp = (int *)
3501 CMSG_DATA(mtod(*controlp, struct cmsghdr *));
3502 if ((error = fdallocn(td, 0, fdp, newfds))) {
3503 FILEDESC_XUNLOCK(fdesc);
3504 unp_freerights(fdep, newfds);
3505 m_freem(*controlp);
3506 *controlp = NULL;
3507 goto next;
3508 }
3509 for (int i = 0; i < newfds; i++, fdp++) {
3510 struct file *fp;
3511
3512 fp = fdep[i]->fde_file;
3513 _finstall(fdesc, fp, *fdp, fdflags |
3514 (restrict_rights(fp, td) ?
3515 O_RESOLVE_BENEATH : 0), &fdep[i]->fde_caps);
3516 unp_externalize_fp(fp);
3517 }
3518
3519 /*
3520 * The new type indicates that the mbuf data refers to
3521 * kernel resources that may need to be released before
3522 * the mbuf is freed.
3523 */
3524 m_chtype(*controlp, MT_EXTCONTROL);
3525 FILEDESC_XUNLOCK(fdesc);
3526 free(fdep[0], M_FILECAPS);
3527 } else {
3528 /* We can just copy anything else across. */
3529 if (error || controlp == NULL)
3530 goto next;
3531 *controlp = sbcreatecontrol(NULL, datalen,
3532 cm->cmsg_type, cm->cmsg_level, M_WAITOK);
3533 bcopy(data,
3534 CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
3535 datalen);
3536 }
3537 controlp = &(*controlp)->m_next;
3538
3539 next:
3540 if (CMSG_SPACE(datalen) < clen) {
3541 clen -= CMSG_SPACE(datalen);
3542 cm = (struct cmsghdr *)
3543 ((caddr_t)cm + CMSG_SPACE(datalen));
3544 } else {
3545 clen = 0;
3546 cm = NULL;
3547 }
3548 }
3549
3550 return (error);
3551 }
3552
3553 static void
unp_zone_change(void *tag)
3555 {
3556
3557 uma_zone_set_max(unp_zone, maxsockets);
3558 }
3559
3560 #ifdef INVARIANTS
3561 static void
unp_zdtor(void *mem, int size __unused, void *arg __unused)
3563 {
3564 struct unpcb *unp;
3565
3566 unp = mem;
3567
3568 KASSERT(LIST_EMPTY(&unp->unp_refs),
3569 ("%s: unpcb %p has lingering refs", __func__, unp));
3570 KASSERT(unp->unp_socket == NULL,
3571 ("%s: unpcb %p has socket backpointer", __func__, unp));
3572 KASSERT(unp->unp_vnode == NULL,
3573 ("%s: unpcb %p has vnode references", __func__, unp));
3574 KASSERT(unp->unp_conn == NULL,
3575 ("%s: unpcb %p is still connected", __func__, unp));
3576 KASSERT(unp->unp_addr == NULL,
3577 ("%s: unpcb %p has leaked addr", __func__, unp));
3578 }
3579 #endif
3580
3581 static void
unp_init(void *arg __unused)
3583 {
3584 uma_dtor dtor;
3585
3586 #ifdef INVARIANTS
3587 dtor = unp_zdtor;
3588 #else
3589 dtor = NULL;
3590 #endif
3591 unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, dtor,
3592 NULL, NULL, UMA_ALIGN_CACHE, 0);
3593 uma_zone_set_max(unp_zone, maxsockets);
3594 uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
3595 EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
3596 NULL, EVENTHANDLER_PRI_ANY);
3597 LIST_INIT(&unp_dhead);
3598 LIST_INIT(&unp_shead);
3599 LIST_INIT(&unp_sphead);
3600 SLIST_INIT(&unp_defers);
3601 TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
3602 TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
3603 UNP_LINK_LOCK_INIT();
3604 UNP_DEFERRED_LOCK_INIT();
3605 unp_vp_mtxpool = mtx_pool_create("unp vp mtxpool", 32, MTX_DEF);
3606 }
3607 SYSINIT(unp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, unp_init, NULL);
3608
3609 static void
unp_internalize_cleanup_rights(struct mbuf *control)
3611 {
3612 struct cmsghdr *cp;
3613 struct mbuf *m;
3614 void *data;
3615 socklen_t datalen;
3616
3617 for (m = control; m != NULL; m = m->m_next) {
3618 cp = mtod(m, struct cmsghdr *);
3619 if (cp->cmsg_level != SOL_SOCKET ||
3620 cp->cmsg_type != SCM_RIGHTS)
3621 continue;
3622 data = CMSG_DATA(cp);
3623 datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data;
3624 unp_freerights(data, datalen / sizeof(struct filedesc *));
3625 }
3626 }
3627
3628 static int
unp_internalize(struct mbuf *control, struct mchain *mc, struct thread *td)
3630 {
3631 struct proc *p;
3632 struct filedesc *fdesc;
3633 struct bintime *bt;
3634 struct cmsghdr *cm;
3635 struct cmsgcred *cmcred;
3636 struct mbuf *m;
3637 struct filedescent *fde, **fdep, *fdev;
3638 struct file *fp;
3639 struct timeval *tv;
3640 struct timespec *ts;
3641 void *data;
3642 socklen_t clen, datalen;
3643 int i, j, error, *fdp, oldfds;
3644 u_int newlen;
3645
3646 MPASS(control->m_next == NULL); /* COMPAT_OLDSOCK may violate */
3647 UNP_LINK_UNLOCK_ASSERT();
3648
3649 p = td->td_proc;
3650 fdesc = p->p_fd;
3651 error = 0;
3652 *mc = MCHAIN_INITIALIZER(mc);
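	/*
	 * Walk every cmsghdr in the control mbuf.  'datalen' is computed at
	 * the top of the loop body; the third expression of the for(;;)
	 * advances past CMSG_SPACE(datalen) of the message just processed.
	 */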
3653 for (clen = control->m_len, cm = mtod(control, struct cmsghdr *),
3654 data = CMSG_DATA(cm);
3655
3656 clen >= sizeof(*cm) && cm->cmsg_level == SOL_SOCKET &&
3657 clen >= cm->cmsg_len && cm->cmsg_len >= sizeof(*cm) &&
3658 (char *)cm + cm->cmsg_len >= (char *)data;
3659
3660 clen -= min(CMSG_SPACE(datalen), clen),
3661 cm = (struct cmsghdr *) ((char *)cm + CMSG_SPACE(datalen)),
3662 data = CMSG_DATA(cm)) {
3663 datalen = (char *)cm + cm->cmsg_len - (char *)data;
3664 switch (cm->cmsg_type) {
3665 case SCM_CREDS:
3666 m = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS,
3667 SOL_SOCKET, M_WAITOK);
3668 cmcred = (struct cmsgcred *)
3669 CMSG_DATA(mtod(m, struct cmsghdr *));
3670 cmcred->cmcred_pid = p->p_pid;
3671 cmcred->cmcred_uid = td->td_ucred->cr_ruid;
3672 cmcred->cmcred_gid = td->td_ucred->cr_rgid;
3673 cmcred->cmcred_euid = td->td_ucred->cr_uid;
3674 cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
3675 CMGROUP_MAX);
3676 for (i = 0; i < cmcred->cmcred_ngroups; i++)
3677 cmcred->cmcred_groups[i] =
3678 td->td_ucred->cr_groups[i];
3679 break;
3680
3681 case SCM_RIGHTS:
3682 oldfds = datalen / sizeof (int);
3683 if (oldfds == 0)
3684 continue;
			/*
			 * On some machines the size of a pointer is bigger
			 * than the size of an int, so we need to check that
			 * the data fits into a single mbuf.  We could allocate
			 * several mbufs, and unp_externalize() should even
			 * properly handle that.  But it is not worth
			 * complicating the code for the insane scenario of
			 * passing over 200 file descriptors at once.
			 */
3693 newlen = oldfds * sizeof(fdep[0]);
3694 if (CMSG_SPACE(newlen) > MCLBYTES) {
3695 error = EMSGSIZE;
3696 goto out;
3697 }
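/*
 * For example, on LP64 with 2k mbuf clusters the check above limits a
 * single message to roughly 250 descriptors.
 */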
3698 /*
3699 * Check that all the FDs passed in refer to legal
3700 * files. If not, reject the entire operation.
3701 */
3702 fdp = data;
3703 FILEDESC_SLOCK(fdesc);
3704 for (i = 0; i < oldfds; i++, fdp++) {
3705 fp = fget_noref(fdesc, *fdp);
3706 if (fp == NULL) {
3707 FILEDESC_SUNLOCK(fdesc);
3708 error = EBADF;
3709 goto out;
3710 }
3711 if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
3712 FILEDESC_SUNLOCK(fdesc);
3713 error = EOPNOTSUPP;
3714 goto out;
3715 }
3716 }
3717
3718 /*
3719 * Now replace the integer FDs with pointers to the
3720 * file structure and capability rights.
3721 */
3722 m = sbcreatecontrol(NULL, newlen, SCM_RIGHTS,
3723 SOL_SOCKET, M_WAITOK);
3724 fdp = data;
3725 for (i = 0; i < oldfds; i++, fdp++) {
3726 if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) {
3727 fdp = data;
3728 for (j = 0; j < i; j++, fdp++) {
3729 fdrop(fdesc->fd_ofiles[*fdp].
3730 fde_file, td);
3731 }
3732 FILEDESC_SUNLOCK(fdesc);
3733 error = EBADF;
3734 goto out;
3735 }
3736 }
3737 fdp = data;
3738 fdep = (struct filedescent **)
3739 CMSG_DATA(mtod(m, struct cmsghdr *));
3740 fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
3741 M_WAITOK);
3742 for (i = 0; i < oldfds; i++, fdev++, fdp++) {
3743 fde = &fdesc->fd_ofiles[*fdp];
3744 fdep[i] = fdev;
3745 fdep[i]->fde_file = fde->fde_file;
3746 filecaps_copy(&fde->fde_caps,
3747 &fdep[i]->fde_caps, true);
3748 unp_internalize_fp(fdep[i]->fde_file);
3749 }
3750 FILEDESC_SUNLOCK(fdesc);
3751 break;
3752
3753 case SCM_TIMESTAMP:
3754 m = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP,
3755 SOL_SOCKET, M_WAITOK);
3756 tv = (struct timeval *)
3757 CMSG_DATA(mtod(m, struct cmsghdr *));
3758 microtime(tv);
3759 break;
3760
3761 case SCM_BINTIME:
3762 m = sbcreatecontrol(NULL, sizeof(*bt), SCM_BINTIME,
3763 SOL_SOCKET, M_WAITOK);
3764 bt = (struct bintime *)
3765 CMSG_DATA(mtod(m, struct cmsghdr *));
3766 bintime(bt);
3767 break;
3768
3769 case SCM_REALTIME:
3770 m = sbcreatecontrol(NULL, sizeof(*ts), SCM_REALTIME,
3771 SOL_SOCKET, M_WAITOK);
3772 ts = (struct timespec *)
3773 CMSG_DATA(mtod(m, struct cmsghdr *));
3774 nanotime(ts);
3775 break;
3776
3777 case SCM_MONOTONIC:
3778 m = sbcreatecontrol(NULL, sizeof(*ts), SCM_MONOTONIC,
3779 SOL_SOCKET, M_WAITOK);
3780 ts = (struct timespec *)
3781 CMSG_DATA(mtod(m, struct cmsghdr *));
3782 nanouptime(ts);
3783 break;
3784
3785 default:
3786 error = EINVAL;
3787 goto out;
3788 }
3789
3790 mc_append(mc, m);
3791 }
3792 if (clen > 0)
3793 error = EINVAL;
3794
3795 out:
3796 if (error != 0)
3797 unp_internalize_cleanup_rights(mc_first(mc));
3798 m_freem(control);
3799 return (error);
3800 }
3801
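/*
 * unp_addsockcred() prepends a control message with the sender's credentials
 * to the chain delivered to the receiver.  The 'mode' flags reflect the
 * receiving socket's LOCAL_CREDS (UNP_WANTCRED_ONESHOT, struct sockcred,
 * SCM_CREDS) or LOCAL_CREDS_PERSISTENT (UNP_WANTCRED_ALWAYS, struct
 * sockcred2, SCM_CREDS2) option.  A receiver would typically arm this with
 * something like the following userland sketch (illustrative only):
 *
 *	int on = 1;
 *	(void)setsockopt(s, SOL_LOCAL, LOCAL_CREDS_PERSISTENT, &on, sizeof(on));
 */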
3802 static void
3803 unp_addsockcred(struct thread *td, struct mchain *mc, int mode)
3804 {
3805 struct mbuf *m, *n, *n_prev;
3806 const struct cmsghdr *cm;
3807 int ngroups, i, cmsgtype;
3808 size_t ctrlsz;
3809
3810 ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
3811 if (mode & UNP_WANTCRED_ALWAYS) {
3812 ctrlsz = SOCKCRED2SIZE(ngroups);
3813 cmsgtype = SCM_CREDS2;
3814 } else {
3815 ctrlsz = SOCKCREDSIZE(ngroups);
3816 cmsgtype = SCM_CREDS;
3817 }
3818
3819 /* XXXGL: uipc_sosend_*() needs to be improved so that we can use M_WAITOK */
3820 m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET, M_NOWAIT);
3821 if (m == NULL)
3822 return;
3823 MPASS((m->m_flags & M_EXT) == 0 && m->m_next == NULL);
3824
3825 if (mode & UNP_WANTCRED_ALWAYS) {
3826 struct sockcred2 *sc;
3827
3828 sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
3829 sc->sc_version = 0;
3830 sc->sc_pid = td->td_proc->p_pid;
3831 sc->sc_uid = td->td_ucred->cr_ruid;
3832 sc->sc_euid = td->td_ucred->cr_uid;
3833 sc->sc_gid = td->td_ucred->cr_rgid;
3834 sc->sc_egid = td->td_ucred->cr_gid;
3835 sc->sc_ngroups = ngroups;
3836 for (i = 0; i < sc->sc_ngroups; i++)
3837 sc->sc_groups[i] = td->td_ucred->cr_groups[i];
3838 } else {
3839 struct sockcred *sc;
3840
3841 sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
3842 sc->sc_uid = td->td_ucred->cr_ruid;
3843 sc->sc_euid = td->td_ucred->cr_uid;
3844 sc->sc_gid = td->td_ucred->cr_rgid;
3845 sc->sc_egid = td->td_ucred->cr_gid;
3846 sc->sc_ngroups = ngroups;
3847 for (i = 0; i < sc->sc_ngroups; i++)
3848 sc->sc_groups[i] = td->td_ucred->cr_groups[i];
3849 }
3850
3851 /*
3852 * Unlink any existing SCM_CREDS control messages (struct cmsgcred), since
3853 * the SCM_CREDS control message just created (struct sockcred) has a
3854 * different format.
3855 */
3856 if (!STAILQ_EMPTY(&mc->mc_q) && cmsgtype == SCM_CREDS)
3857 STAILQ_FOREACH_SAFE(n, &mc->mc_q, m_stailq, n_prev) {
3858 cm = mtod(n, struct cmsghdr *);
3859 if (cm->cmsg_level == SOL_SOCKET &&
3860 cm->cmsg_type == SCM_CREDS) {
3861 mc_remove(mc, n);
3862 m_free(n);
3863 }
3864 }
3865
3866 /* Prepend it to the head. */
3867 mc_prepend(mc, m);
3868 }
3869
3870 static struct unpcb *
3871 fptounp(struct file *fp)
3872 {
3873 struct socket *so;
3874
3875 if (fp->f_type != DTYPE_SOCKET)
3876 return (NULL);
3877 if ((so = fp->f_data) == NULL)
3878 return (NULL);
3879 if (so->so_proto->pr_domain != &localdomain)
3880 return (NULL);
3881 return sotounpcb(so);
3882 }
3883
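/*
 * unp_discard() drops a reference on a file that was internalized as a
 * SCM_RIGHTS datum but will never be delivered.  If the file is itself a
 * UNIX domain socket (unp_externalize_fp() returns non-zero), closing it
 * directly from here could re-enter this layer, so the close is deferred to
 * the taskqueue thread instead.
 */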
3884 static void
3885 unp_discard(struct file *fp)
3886 {
3887 struct unp_defer *dr;
3888
3889 if (unp_externalize_fp(fp)) {
3890 dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK);
3891 dr->ud_fp = fp;
3892 UNP_DEFERRED_LOCK();
3893 SLIST_INSERT_HEAD(&unp_defers, dr, ud_link);
3894 UNP_DEFERRED_UNLOCK();
3895 atomic_add_int(&unp_defers_count, 1);
3896 taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
3897 } else
3898 closef_nothread(fp);
3899 }
3900
3901 static void
3902 unp_process_defers(void *arg __unused, int pending)
3903 {
3904 struct unp_defer *dr;
3905 SLIST_HEAD(, unp_defer) drl;
3906 int count;
3907
3908 SLIST_INIT(&drl);
3909 for (;;) {
3910 UNP_DEFERRED_LOCK();
3911 if (SLIST_FIRST(&unp_defers) == NULL) {
3912 UNP_DEFERRED_UNLOCK();
3913 break;
3914 }
3915 SLIST_SWAP(&unp_defers, &drl, unp_defer);
3916 UNP_DEFERRED_UNLOCK();
3917 count = 0;
3918 while ((dr = SLIST_FIRST(&drl)) != NULL) {
3919 SLIST_REMOVE_HEAD(&drl, ud_link);
3920 closef_nothread(dr->ud_fp);
3921 free(dr, M_TEMP);
3922 count++;
3923 }
3924 atomic_add_int(&unp_defers_count, -count);
3925 }
3926 }
3927
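/*
 * unp_internalize_fp() and unp_externalize_fp() keep the accounting used by
 * the garbage collector: unp_rights counts all file references currently in
 * flight in SCM_RIGHTS messages, and unp_msgcount counts, per UNIX domain
 * socket, how many times the socket's own file sits in such messages.
 */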
3928 static void
3929 unp_internalize_fp(struct file *fp)
3930 {
3931 struct unpcb *unp;
3932
3933 UNP_LINK_WLOCK();
3934 if ((unp = fptounp(fp)) != NULL) {
3935 unp->unp_file = fp;
3936 unp->unp_msgcount++;
3937 }
3938 unp_rights++;
3939 UNP_LINK_WUNLOCK();
3940 }
3941
3942 static int
3943 unp_externalize_fp(struct file *fp)
3944 {
3945 struct unpcb *unp;
3946 int ret;
3947
3948 UNP_LINK_WLOCK();
3949 if ((unp = fptounp(fp)) != NULL) {
3950 unp->unp_msgcount--;
3951 ret = 1;
3952 } else
3953 ret = 0;
3954 unp_rights--;
3955 UNP_LINK_WUNLOCK();
3956 return (ret);
3957 }
3958
3959 /*
3960 * unp_marked counts sockets found reachable again during the current pass
3961 * through unp_gc(); a non-zero value triggers another pass.  It is only
3962 * accessed from the gc task and does not require explicit synchronization.
3963 */
3964 static int unp_marked;
3965
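/*
 * unp_remove_dead_ref() is used as the 'op' callback of unp_scan(): for each
 * file in an in-flight SCM_RIGHTS array that belongs to a socket marked
 * UNPGC_DEAD, subtract one from that socket's unp_gcrefs, so that only
 * references from outside the dead set remain counted.
 */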
3966 static void
3967 unp_remove_dead_ref(struct filedescent **fdep, int fdcount)
3968 {
3969 struct unpcb *unp;
3970 struct file *fp;
3971 int i;
3972
3973 /*
3974 * This function can only be called from the gc task.
3975 */
3976 KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
3977 ("%s: not on gc callout", __func__));
3978 UNP_LINK_LOCK_ASSERT();
3979
3980 for (i = 0; i < fdcount; i++) {
3981 fp = fdep[i]->fde_file;
3982 if ((unp = fptounp(fp)) == NULL)
3983 continue;
3984 if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
3985 continue;
3986 unp->unp_gcrefs--;
3987 }
3988 }
3989
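/*
 * unp_restore_undead_ref() is the inverse callback: once a socket is found
 * to be reachable after all, the references it holds on UNPGC_DEAD sockets
 * are added back and unp_marked is bumped so that unp_gc() makes another
 * pass.
 */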
3990 static void
3991 unp_restore_undead_ref(struct filedescent **fdep, int fdcount)
3992 {
3993 struct unpcb *unp;
3994 struct file *fp;
3995 int i;
3996
3997 /*
3998 * This function can only be called from the gc task.
3999 */
4000 KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
4001 ("%s: not on gc callout", __func__));
4002 UNP_LINK_LOCK_ASSERT();
4003
4004 for (i = 0; i < fdcount; i++) {
4005 fp = fdep[i]->fde_file;
4006 if ((unp = fptounp(fp)) == NULL)
4007 continue;
4008 if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
4009 continue;
4010 unp->unp_gcrefs++;
4011 unp_marked++;
4012 }
4013 }
4014
4015 static void
4016 unp_scan_socket(struct socket *so, void (*op)(struct filedescent **, int))
4017 {
4018 struct sockbuf *sb;
4019
4020 SOCK_LOCK_ASSERT(so);
4021
4022 if (sotounpcb(so)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
4023 return;
4024
4025 SOCK_RECVBUF_LOCK(so);
4026 switch (so->so_type) {
4027 case SOCK_DGRAM:
4028 unp_scan(STAILQ_FIRST(&so->so_rcv.uxdg_mb), op);
4029 unp_scan(so->so_rcv.uxdg_peeked, op);
4030 TAILQ_FOREACH(sb, &so->so_rcv.uxdg_conns, uxdg_clist)
4031 unp_scan(STAILQ_FIRST(&sb->uxdg_mb), op);
4032 break;
4033 case SOCK_STREAM:
4034 case SOCK_SEQPACKET:
4035 unp_scan(STAILQ_FIRST(&so->so_rcv.uxst_mbq), op);
4036 break;
4037 }
4038 SOCK_RECVBUF_UNLOCK(so);
4039 }
4040
4041 static void
4042 unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int))
4043 {
4044 struct socket *so, *soa;
4045
4046 so = unp->unp_socket;
4047 SOCK_LOCK(so);
4048 if (SOLISTENING(so)) {
4049 /*
4050 * Mark all sockets in our accept queue.
4051 */
4052 TAILQ_FOREACH(soa, &so->sol_comp, so_list)
4053 unp_scan_socket(soa, op);
4054 } else {
4055 /*
4056 * Mark all sockets we reference with RIGHTS.
4057 */
4058 unp_scan_socket(so, op);
4059 }
4060 SOCK_UNLOCK(so);
4061 }
4062
4063 static int unp_recycled;
4064 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
4065 "Number of unreachable sockets claimed by the garbage collector.");
4066
4067 static int unp_taskcount;
4068 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
4069 "Number of times the garbage collector has run.");
4070
4071 SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0,
4072 "Number of active local sockets.");
4073
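/*
 * unp_gc() is a mark-and-sweep collector for SCM_RIGHTS cycles.  In outline:
 * (1) every socket whose file is referenced only from socket buffers
 * (f_count == unp_msgcount) is marked UNPGC_DEAD; (2) references that dead
 * sockets hold on other dead sockets are subtracted; (3) any dead socket
 * left with a positive count must be reachable from outside the dead set,
 * so it and everything it references are iteratively revived; (4) whatever
 * remains dead is flushed and released.
 *
 * The classic cycle is a socketpair(2) where each end is sent over the other
 * with SCM_RIGHTS and both descriptors are then closed: the two files keep
 * each other alive through the in-flight messages, and only this collector
 * can reclaim them.
 */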
4074 static void
4075 unp_gc(__unused void *arg, int pending)
4076 {
4077 struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
4078 NULL };
4079 struct unp_head **head;
4080 struct unp_head unp_deadhead; /* List of potentially-dead sockets. */
4081 struct file *f, **unref;
4082 struct unpcb *unp, *unptmp;
4083 int i, total, unp_unreachable;
4084
4085 LIST_INIT(&unp_deadhead);
4086 unp_taskcount++;
4087 UNP_LINK_RLOCK();
4088 /*
4089 * First determine which sockets may be in cycles.
4090 */
4091 unp_unreachable = 0;
4092
4093 for (head = heads; *head != NULL; head++)
4094 LIST_FOREACH(unp, *head, unp_link) {
4095 KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0,
4096 ("%s: unp %p has unexpected gc flags 0x%x",
4097 __func__, unp, (unsigned int)unp->unp_gcflag));
4098
4099 f = unp->unp_file;
4100
4101 /*
4102 * Check for an unreachable socket potentially in a
4103 * cycle. It must be in a queue as indicated by
4104 * msgcount, and this must equal the file reference
4105 * count. Note that when msgcount is 0 the file is
4106 * NULL.
4107 */
4108 if (f != NULL && unp->unp_msgcount != 0 &&
4109 refcount_load(&f->f_count) == unp->unp_msgcount) {
4110 LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead);
4111 unp->unp_gcflag |= UNPGC_DEAD;
4112 unp->unp_gcrefs = unp->unp_msgcount;
4113 unp_unreachable++;
4114 }
4115 }
4116
4117 /*
4118 * Scan all sockets previously marked as potentially being in a cycle
4119 * and remove the references each socket holds on any UNPGC_DEAD
4120 * sockets in its queue. After this step, all remaining references on
4121 * sockets marked UNPGC_DEAD should not be part of any cycle.
4122 */
4123 LIST_FOREACH(unp, &unp_deadhead, unp_dead)
4124 unp_gc_scan(unp, unp_remove_dead_ref);
4125
4126 /*
4127 * If a socket still has a positive gc refcount, it cannot be in a
4128 * cycle.  In this case increment the gc refcounts of all its children
4129 * iteratively.  Stop the scan once we do a complete loop without
4130 * discovering a new reachable socket.
4131 */
4132 do {
4133 unp_marked = 0;
4134 LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp)
4135 if (unp->unp_gcrefs > 0) {
4136 unp->unp_gcflag &= ~UNPGC_DEAD;
4137 LIST_REMOVE(unp, unp_dead);
4138 KASSERT(unp_unreachable > 0,
4139 ("%s: unp_unreachable underflow.",
4140 __func__));
4141 unp_unreachable--;
4142 unp_gc_scan(unp, unp_restore_undead_ref);
4143 }
4144 } while (unp_marked);
4145
4146 UNP_LINK_RUNLOCK();
4147
4148 if (unp_unreachable == 0)
4149 return;
4150
4151 /*
4152 * Allocate space for a local array of dead unpcbs.
4153 * TODO: can this path be simplified by instead using the local
4154 * dead list at unp_deadhead, after taking out references
4155 * on the file object and/or unpcb and dropping the link lock?
4156 */
4157 unref = malloc(unp_unreachable * sizeof(struct file *),
4158 M_TEMP, M_WAITOK);
4159
4160 /*
4161 * Iterate looking for sockets which have been specifically marked
4162 * as unreachable and store them locally.
4163 */
4164 UNP_LINK_RLOCK();
4165 total = 0;
4166 LIST_FOREACH(unp, &unp_deadhead, unp_dead) {
4167 KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0,
4168 ("%s: unp %p not marked UNPGC_DEAD", __func__, unp));
4169 unp->unp_gcflag &= ~UNPGC_DEAD;
4170 f = unp->unp_file;
4171 if (unp->unp_msgcount == 0 || f == NULL ||
4172 refcount_load(&f->f_count) != unp->unp_msgcount ||
4173 !fhold(f))
4174 continue;
4175 unref[total++] = f;
4176 KASSERT(total <= unp_unreachable,
4177 ("%s: incorrect unreachable count.", __func__));
4178 }
4179 UNP_LINK_RUNLOCK();
4180
4181 /*
4182 * Now flush all sockets, freeing rights.  This will free the
4183 * struct files associated with these sockets but leave each socket
4184 * with one remaining ref.
4185 */
4186 for (i = 0; i < total; i++) {
4187 struct socket *so;
4188
4189 so = unref[i]->f_data;
4190 CURVNET_SET(so->so_vnet);
4191 socantrcvmore(so);
4192 unp_dispose(so);
4193 CURVNET_RESTORE();
4194 }
4195
4196 /*
4197 * And finally release the sockets so they can be reclaimed.
4198 */
4199 for (i = 0; i < total; i++)
4200 fdrop(unref[i], NULL);
4201 unp_recycled += total;
4202 free(unref, M_TEMP);
4203 }
4204
4205 /*
4206 * Synchronize against unp_gc, which can trip over data as we are freeing it.
4207 */
4208 static void
4209 unp_dispose(struct socket *so)
4210 {
4211 struct sockbuf *sb;
4212 struct unpcb *unp;
4213 struct mbuf *m;
4214 int error __diagused;
4215
4216 MPASS(!SOLISTENING(so));
4217
4218 unp = sotounpcb(so);
4219 UNP_LINK_WLOCK();
4220 unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
4221 UNP_LINK_WUNLOCK();
4222
4223 /*
4224 * Grab our special mbufs before calling sbrelease().
4225 */
4226 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
4227 MPASS(!error);
4228 SOCK_RECVBUF_LOCK(so);
4229 switch (so->so_type) {
4230 case SOCK_DGRAM:
4231 while ((sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) != NULL) {
4232 STAILQ_CONCAT(&so->so_rcv.uxdg_mb, &sb->uxdg_mb);
4233 TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist);
4234 /* Note: socket of sb may reconnect. */
4235 sb->uxdg_cc = sb->uxdg_ctl = sb->uxdg_mbcnt = 0;
4236 }
4237 sb = &so->so_rcv;
4238 if (sb->uxdg_peeked != NULL) {
4239 STAILQ_INSERT_HEAD(&sb->uxdg_mb, sb->uxdg_peeked,
4240 m_stailqpkt);
4241 sb->uxdg_peeked = NULL;
4242 }
4243 m = STAILQ_FIRST(&sb->uxdg_mb);
4244 STAILQ_INIT(&sb->uxdg_mb);
4245 break;
4246 case SOCK_STREAM:
4247 case SOCK_SEQPACKET:
4248 sb = &so->so_rcv;
4249 m = STAILQ_FIRST(&sb->uxst_mbq);
4250 STAILQ_INIT(&sb->uxst_mbq);
4251 sb->sb_acc = sb->sb_ccc = sb->sb_ctl = sb->sb_mbcnt = 0;
4252 /*
4253 * Trim M_NOTREADY buffers from the chain being freed; they are
4254 * still referenced by the I/O thread.
4255 */
4256 if (sb->uxst_fnrdy != NULL) {
4257 struct mbuf *n, *prev;
4258
4259 while (m != NULL && m->m_flags & M_NOTREADY)
4260 m = m->m_next;
4261 for (prev = n = m; n != NULL; n = n->m_next) {
4262 if (n->m_flags & M_NOTREADY)
4263 prev->m_next = n->m_next;
4264 else
4265 prev = n;
4266 }
4267 sb->uxst_fnrdy = NULL;
4268 }
4269 break;
4270 }
4271 /*
4272 * Mark sb with SBS_CANTRCVMORE.  This is needed to prevent
4273 * uipc_sosend_*() or unp_disconnect() from adding more data to the
4274 * socket.  We came here either through shutdown(2) or from the final
4275 * sofree().  The sofree() case is simple as it guarantees that no more
4276 * sends will happen, however we can race with unp_disconnect() from
4277 * our peer.  The shutdown(2) case is more exotic.  It would call into
4278 * unp_dispose() only if the socket is SS_ISCONNECTED.  This is
4279 * possible if we did connect(2) on this socket and also had it bound
4280 * with bind(2), receiving connections from other sockets.  Because
4281 * uipc_shutdown() violates POSIX (see comment there) this applies to
4282 * SOCK_DGRAM as well.  For SOCK_DGRAM this SBS_CANTRCVMORE will
4283 * affect not only the peer we connect(2)ed to, but also all of the
4284 * peers that have connect(2)ed to us.  Their sends would end up with
4285 * ENOBUFS.
4286 */
4287 sb->sb_state |= SBS_CANTRCVMORE;
4288 (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
4289 RLIM_INFINITY);
4290 SOCK_RECVBUF_UNLOCK(so);
4291 SOCK_IO_RECV_UNLOCK(so);
4292
4293 if (m != NULL) {
4294 unp_scan(m, unp_freerights);
4295 m_freemp(m);
4296 }
4297 }
4298
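/*
 * unp_scan() walks a chain of packets and their mbufs, locates SCM_RIGHTS
 * control messages and applies 'op' to each array of internalized struct
 * filedescent pointers found in them.
 */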
4299 static void
4300 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
4301 {
4302 struct mbuf *m;
4303 struct cmsghdr *cm;
4304 void *data;
4305 socklen_t clen, datalen;
4306
4307 while (m0 != NULL) {
4308 for (m = m0; m; m = m->m_next) {
4309 if (m->m_type != MT_CONTROL)
4310 continue;
4311
4312 cm = mtod(m, struct cmsghdr *);
4313 clen = m->m_len;
4314
4315 while (cm != NULL) {
4316 if (sizeof(*cm) > clen || cm->cmsg_len > clen)
4317 break;
4318
4319 data = CMSG_DATA(cm);
4320 datalen = (caddr_t)cm + cm->cmsg_len
4321 - (caddr_t)data;
4322
4323 if (cm->cmsg_level == SOL_SOCKET &&
4324 cm->cmsg_type == SCM_RIGHTS) {
4325 (*op)(data, datalen /
4326 sizeof(struct filedescent *));
4327 }
4328
4329 if (CMSG_SPACE(datalen) < clen) {
4330 clen -= CMSG_SPACE(datalen);
4331 cm = (struct cmsghdr *)
4332 ((caddr_t)cm + CMSG_SPACE(datalen));
4333 } else {
4334 clen = 0;
4335 cm = NULL;
4336 }
4337 }
4338 }
4339 m0 = m0->m_nextpkt;
4340 }
4341 }
4342
4343 /*
4344 * Definitions of protocols supported in the LOCAL domain.
4345 */
4346 static struct protosw streamproto = {
4347 .pr_type = SOCK_STREAM,
4348 .pr_flags = PR_CONNREQUIRED | PR_CAPATTACH | PR_SOCKBUF,
4349 .pr_ctloutput = &uipc_ctloutput,
4350 .pr_abort = uipc_abort,
4351 .pr_accept = uipc_peeraddr,
4352 .pr_attach = uipc_attach,
4353 .pr_bind = uipc_bind,
4354 .pr_bindat = uipc_bindat,
4355 .pr_connect = uipc_connect,
4356 .pr_connectat = uipc_connectat,
4357 .pr_connect2 = uipc_connect2,
4358 .pr_detach = uipc_detach,
4359 .pr_disconnect = uipc_disconnect,
4360 .pr_listen = uipc_listen,
4361 .pr_peeraddr = uipc_peeraddr,
4362 .pr_send = uipc_sendfile,
4363 .pr_sendfile_wait = uipc_sendfile_wait,
4364 .pr_ready = uipc_ready,
4365 .pr_sense = uipc_sense,
4366 .pr_shutdown = uipc_shutdown,
4367 .pr_sockaddr = uipc_sockaddr,
4368 .pr_sosend = uipc_sosend_stream_or_seqpacket,
4369 .pr_soreceive = uipc_soreceive_stream_or_seqpacket,
4370 .pr_sopoll = uipc_sopoll_stream_or_seqpacket,
4371 .pr_kqfilter = uipc_kqfilter_stream_or_seqpacket,
4372 .pr_close = uipc_close,
4373 .pr_chmod = uipc_chmod,
4374 };
4375
4376 static struct protosw dgramproto = {
4377 .pr_type = SOCK_DGRAM,
4378 .pr_flags = PR_ATOMIC | PR_ADDR | PR_CAPATTACH | PR_SOCKBUF,
4379 .pr_ctloutput = &uipc_ctloutput,
4380 .pr_abort = uipc_abort,
4381 .pr_accept = uipc_peeraddr,
4382 .pr_attach = uipc_attach,
4383 .pr_bind = uipc_bind,
4384 .pr_bindat = uipc_bindat,
4385 .pr_connect = uipc_connect,
4386 .pr_connectat = uipc_connectat,
4387 .pr_connect2 = uipc_connect2,
4388 .pr_detach = uipc_detach,
4389 .pr_disconnect = uipc_disconnect,
4390 .pr_peeraddr = uipc_peeraddr,
4391 .pr_sosend = uipc_sosend_dgram,
4392 .pr_sense = uipc_sense,
4393 .pr_shutdown = uipc_shutdown,
4394 .pr_sockaddr = uipc_sockaddr,
4395 .pr_soreceive = uipc_soreceive_dgram,
4396 .pr_close = uipc_close,
4397 .pr_chmod = uipc_chmod,
4398 };
4399
4400 static struct protosw seqpacketproto = {
4401 .pr_type = SOCK_SEQPACKET,
4402 .pr_flags = PR_CONNREQUIRED | PR_CAPATTACH | PR_SOCKBUF,
4403 .pr_ctloutput = &uipc_ctloutput,
4404 .pr_abort = uipc_abort,
4405 .pr_accept = uipc_peeraddr,
4406 .pr_attach = uipc_attach,
4407 .pr_bind = uipc_bind,
4408 .pr_bindat = uipc_bindat,
4409 .pr_connect = uipc_connect,
4410 .pr_connectat = uipc_connectat,
4411 .pr_connect2 = uipc_connect2,
4412 .pr_detach = uipc_detach,
4413 .pr_disconnect = uipc_disconnect,
4414 .pr_listen = uipc_listen,
4415 .pr_peeraddr = uipc_peeraddr,
4416 .pr_sense = uipc_sense,
4417 .pr_shutdown = uipc_shutdown,
4418 .pr_sockaddr = uipc_sockaddr,
4419 .pr_sosend = uipc_sosend_stream_or_seqpacket,
4420 .pr_soreceive = uipc_soreceive_stream_or_seqpacket,
4421 .pr_sopoll = uipc_sopoll_stream_or_seqpacket,
4422 .pr_kqfilter = uipc_kqfilter_stream_or_seqpacket,
4423 .pr_close = uipc_close,
4424 .pr_chmod = uipc_chmod,
4425 };
4426
4427 static struct domain localdomain = {
4428 .dom_family = AF_LOCAL,
4429 .dom_name = "local",
4430 .dom_nprotosw = 3,
4431 .dom_protosw = {
4432 &streamproto,
4433 &dgramproto,
4434 &seqpacketproto,
4435 }
4436 };
4437 DOMAIN_SET(local);
4438
4439 /*
4440 * A helper function called by VFS before socket-type vnode reclamation.
4441 * For an active vnode it clears the unp_vnode pointer and drops the vnode
4442 * use count.
4443 */
4444 void
4445 vfs_unp_reclaim(struct vnode *vp)
4446 {
4447 struct unpcb *unp;
4448 int active;
4449 struct mtx *vplock;
4450
4451 ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
4452 KASSERT(vp->v_type == VSOCK,
4453 ("vfs_unp_reclaim: vp->v_type != VSOCK"));
4454
4455 active = 0;
4456 vplock = mtx_pool_find(unp_vp_mtxpool, vp);
4457 mtx_lock(vplock);
4458 VOP_UNP_CONNECT(vp, &unp);
4459 if (unp == NULL)
4460 goto done;
4461 UNP_PCB_LOCK(unp);
4462 if (unp->unp_vnode == vp) {
4463 VOP_UNP_DETACH(vp);
4464 unp->unp_vnode = NULL;
4465 active = 1;
4466 }
4467 UNP_PCB_UNLOCK(unp);
4468 done:
4469 mtx_unlock(vplock);
4470 if (active)
4471 vunref(vp);
4472 }
4473
4474 #ifdef DDB
4475 static void
4476 db_print_indent(int indent)
4477 {
4478 int i;
4479
4480 for (i = 0; i < indent; i++)
4481 db_printf(" ");
4482 }
4483
4484 static void
4485 db_print_unpflags(int unp_flags)
4486 {
4487 int comma;
4488
4489 comma = 0;
4490 if (unp_flags & UNP_HAVEPC) {
4491 db_printf("%sUNP_HAVEPC", comma ? ", " : "");
4492 comma = 1;
4493 }
4494 if (unp_flags & UNP_WANTCRED_ALWAYS) {
4495 db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : "");
4496 comma = 1;
4497 }
4498 if (unp_flags & UNP_WANTCRED_ONESHOT) {
4499 db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : "");
4500 comma = 1;
4501 }
4502 if (unp_flags & UNP_CONNECTING) {
4503 db_printf("%sUNP_CONNECTING", comma ? ", " : "");
4504 comma = 1;
4505 }
4506 if (unp_flags & UNP_BINDING) {
4507 db_printf("%sUNP_BINDING", comma ? ", " : "");
4508 comma = 1;
4509 }
4510 }
4511
4512 static void
4513 db_print_xucred(int indent, struct xucred *xu)
4514 {
4515 int comma, i;
4516
4517 db_print_indent(indent);
4518 db_printf("cr_version: %u cr_uid: %u cr_pid: %d cr_ngroups: %d\n",
4519 xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups);
4520 db_print_indent(indent);
4521 db_printf("cr_groups: ");
4522 comma = 0;
4523 for (i = 0; i < xu->cr_ngroups; i++) {
4524 db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
4525 comma = 1;
4526 }
4527 db_printf("\n");
4528 }
4529
4530 static void
4531 db_print_unprefs(int indent, struct unp_head *uh)
4532 {
4533 struct unpcb *unp;
4534 int counter;
4535
4536 counter = 0;
4537 LIST_FOREACH(unp, uh, unp_reflink) {
4538 if (counter % 4 == 0)
4539 db_print_indent(indent);
4540 db_printf("%p ", unp);
4541 if (counter % 4 == 3)
4542 db_printf("\n");
4543 counter++;
4544 }
4545 if (counter != 0 && counter % 4 != 0)
4546 db_printf("\n");
4547 }
4548
4549 DB_SHOW_COMMAND(unpcb, db_show_unpcb)
4550 {
4551 struct unpcb *unp;
4552
4553 if (!have_addr) {
4554 db_printf("usage: show unpcb <addr>\n");
4555 return;
4556 }
4557 unp = (struct unpcb *)addr;
4558
4559 db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket,
4560 unp->unp_vnode);
4561
4562 db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino,
4563 unp->unp_conn);
4564
4565 db_printf("unp_refs:\n");
4566 db_print_unprefs(2, &unp->unp_refs);
4567
4568 /* XXXRW: Would be nice to print the full address, if any. */
4569 db_printf("unp_addr: %p\n", unp->unp_addr);
4570
4571 db_printf("unp_gencnt: %llu\n",
4572 (unsigned long long)unp->unp_gencnt);
4573
4574 db_printf("unp_flags: %x (", unp->unp_flags);
4575 db_print_unpflags(unp->unp_flags);
4576 db_printf(")\n");
4577
4578 db_printf("unp_peercred:\n");
4579 db_print_xucred(2, &unp->unp_peercred);
4580
4581 db_printf("unp_refcount: %u\n", unp->unp_refcount);
4582 }
4583 #endif
4584