1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1989, 1991, 1993
5 * The Regents of the University of California. All Rights Reserved.
6 * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
7 * Copyright (c) 2018 Matthew Macy
8 * Copyright (c) 2022 Gleb Smirnoff <glebius@FreeBSD.org>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 /*
36 * UNIX Domain (Local) Sockets
37 *
38 * This is an implementation of UNIX (local) domain sockets. Each socket has
39 * an associated struct unpcb (UNIX protocol control block). Stream sockets
40 * may be connected to 0 or 1 other socket. Datagram sockets may be
41 * connected to 0, 1, or many other sockets. Sockets may be created and
42 * connected in pairs (socketpair(2)), or bound/connected to using the file
43 * system name space. For most purposes, only the receive socket buffer is
44 * used, as sending on one socket delivers directly to the receive socket
45 * buffer of a second socket.
46 *
47 * The implementation is substantially complicated by the fact that
48 * "ancillary data", such as file descriptors or credentials, may be passed
49 * across UNIX domain sockets. The potential for passing UNIX domain sockets
50 * over other UNIX domain sockets requires the implementation of a simple
51 * garbage collector to find and tear down cycles of disconnected sockets.
52 *
53 * TODO:
54 * RDM
55 * rethink name space problems
56 * need a proper out-of-band
57 */
58
59 #include <sys/cdefs.h>
60 #include "opt_ddb.h"
61
62 #include <sys/param.h>
63 #include <sys/capsicum.h>
64 #include <sys/domain.h>
65 #include <sys/eventhandler.h>
66 #include <sys/fcntl.h>
67 #include <sys/file.h>
68 #include <sys/filedesc.h>
69 #include <sys/kernel.h>
70 #include <sys/lock.h>
71 #include <sys/malloc.h>
72 #include <sys/mbuf.h>
73 #include <sys/mount.h>
74 #include <sys/mutex.h>
75 #include <sys/namei.h>
76 #include <sys/proc.h>
77 #include <sys/protosw.h>
78 #include <sys/queue.h>
79 #include <sys/resourcevar.h>
80 #include <sys/rwlock.h>
81 #include <sys/socket.h>
82 #include <sys/socketvar.h>
83 #include <sys/signalvar.h>
84 #include <sys/stat.h>
85 #include <sys/sx.h>
86 #include <sys/sysctl.h>
87 #include <sys/systm.h>
88 #include <sys/taskqueue.h>
89 #include <sys/un.h>
90 #include <sys/unpcb.h>
91 #include <sys/vnode.h>
92
93 #include <net/vnet.h>
94
95 #ifdef DDB
96 #include <ddb/ddb.h>
97 #endif
98
99 #include <security/mac/mac_framework.h>
100
101 #include <vm/uma.h>
102
103 MALLOC_DECLARE(M_FILECAPS);
104
105 static struct domain localdomain;
106
107 static uma_zone_t unp_zone;
108 static unp_gen_t unp_gencnt; /* (l) */
109 static u_int unp_count; /* (l) Count of local sockets. */
110 static ino_t unp_ino; /* Prototype for fake inode numbers. */
111 static int unp_rights; /* (g) File descriptors in flight. */
112 static struct unp_head unp_shead; /* (l) List of stream sockets. */
113 static struct unp_head unp_dhead; /* (l) List of datagram sockets. */
114 static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. */
115 static struct mtx_pool *unp_vp_mtxpool;
116
117 struct unp_defer {
118 SLIST_ENTRY(unp_defer) ud_link;
119 struct file *ud_fp;
120 };
121 static SLIST_HEAD(, unp_defer) unp_defers;
122 static int unp_defers_count;
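/*
 * Deferred closes are queued on unp_defers (protected by unp_defers_lock,
 * defined below) and drained asynchronously by unp_process_defers() via
 * unp_defer_task.
 */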
123
124 static const struct sockaddr sun_noname = {
125 .sa_len = sizeof(sun_noname),
126 .sa_family = AF_LOCAL,
127 };
128
129 /*
130 * Garbage collection of cyclic file descriptor/socket references occurs
131 * asynchronously in a taskqueue context in order to avoid recursion and
132 * reentrance in the UNIX domain socket, file descriptor, and socket layer
133 * code. See unp_gc() for a full description.
134 */
135 static struct timeout_task unp_gc_task;
136
137 /*
138 * The close of unix domain sockets attached as SCM_RIGHTS is
139 * postponed to the taskqueue, to avoid arbitrary recursion depth.
140  * The attached sockets might themselves have other sockets attached.
141 */
142 static struct task unp_defer_task;
143
144 /*
145 * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
146 * stream sockets, although the total for sender and receiver is actually
147 * only PIPSIZ.
148 *
149 * Datagram sockets really use the sendspace as the maximum datagram size,
150 * and don't really want to reserve the sendspace. Their recvspace should be
151 * large enough for at least one max-size datagram plus address.
152 */
153 #ifndef PIPSIZ
154 #define PIPSIZ 8192
155 #endif
156 static u_long unpst_sendspace = PIPSIZ;
157 static u_long unpst_recvspace = PIPSIZ;
158 static u_long unpdg_maxdgram = 8*1024; /* support 8KB syslog msgs */
159 static u_long unpdg_recvspace = 16*1024;
160 static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */
161 static u_long unpsp_recvspace = PIPSIZ;
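/*
 * Illustrative only: the limits above are exported as sysctls below and
 * may be tuned at runtime, for example:
 *
 *	sysctl net.local.stream.recvspace=65536
 *	sysctl net.local.dgram.maxdgram=16384
 */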
162
163 static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
164 "Local domain");
165 static SYSCTL_NODE(_net_local, SOCK_STREAM, stream,
166 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
167 "SOCK_STREAM");
168 static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram,
169 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
170 "SOCK_DGRAM");
171 static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket,
172 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
173 "SOCK_SEQPACKET");
174
175 SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
176 &unpst_sendspace, 0, "Default stream send space.");
177 SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
178 &unpst_recvspace, 0, "Default stream receive space.");
179 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
180 &unpdg_maxdgram, 0, "Maximum datagram size.");
181 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
182 &unpdg_recvspace, 0, "Default datagram receive space.");
183 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
184 &unpsp_sendspace, 0, "Default seqpacket send space.");
185 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
186 &unpsp_recvspace, 0, "Default seqpacket receive space.");
187 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
188 "File descriptors in flight.");
189 SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
190 &unp_defers_count, 0,
191 "File descriptors deferred to taskqueue for close.");
192
193 /*
194 * Locking and synchronization:
195 *
196 * Several types of locks exist in the local domain socket implementation:
197 * - a global linkage lock
198 * - a global connection list lock
199 * - the mtxpool lock
200 * - per-unpcb mutexes
201 *
202 * The linkage lock protects the global socket lists, the generation number
203 * counter and garbage collector state.
204 *
205 * The connection list lock protects the list of referring sockets in a datagram
206 * socket PCB. This lock is also overloaded to protect a global list of
207 * sockets whose buffers contain socket references in the form of SCM_RIGHTS
208 * messages. To avoid recursion, such references are released by a dedicated
209 * thread.
210 *
211 * The mtxpool lock protects the vnode from being modified while referenced.
212 * Lock ordering rules require that it be acquired before any PCB locks.
213 *
214 * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the
215 * unpcb. This includes the unp_conn field, which either links two connected
216 * PCBs together (for connected socket types) or points at the destination
217 * socket (for connectionless socket types). The operations of creating or
218 * destroying a connection therefore involve locking multiple PCBs. To avoid
219 * lock order reversals, in some cases this involves dropping a PCB lock and
220 * using a reference counter to maintain liveness.
221 *
222 * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
223 * allocated in pr_attach() and freed in pr_detach(). The validity of that
224 * pointer is an invariant, so no lock is required to dereference the so_pcb
225 * pointer if a valid socket reference is held by the caller. In practice,
226 * this is always true during operations performed on a socket. Each unpcb
227 * has a back-pointer to its socket, unp_socket, which will be stable under
228 * the same circumstances.
229 *
230 * This pointer may only be safely dereferenced as long as a valid reference
231 * to the unpcb is held. Typically, this reference will be from the socket,
232 * or from another unpcb when the referring unpcb's lock is held (in order
233 * that the reference not be invalidated during use). For example, to follow
234 * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
235 * that detach is not run clearing unp_socket.
236 *
237 * Blocking with UNIX domain sockets is a tricky issue: unlike most network
238 * protocols, bind() is a non-atomic operation, and connect() requires
239 * potential sleeping in the protocol, due to potentially waiting on local or
240 * distributed file systems. We try to separate "lookup" operations, which
241 * may sleep, and the IPC operations themselves, which typically can occur
242 * with relative atomicity as locks can be held over the entire operation.
243 *
244 * Another tricky issue is simultaneous multi-threaded or multi-process
245 * access to a single UNIX domain socket. These are handled by the flags
246 * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
247 * binding, both of which involve dropping UNIX domain socket locks in order
248 * to perform namei() and other file system operations.
249 */
250 static struct rwlock unp_link_rwlock;
251 static struct mtx unp_defers_lock;
252
253 #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \
254 "unp_link_rwlock")
255
256 #define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \
257 RA_LOCKED)
258 #define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
259 RA_UNLOCKED)
260
261 #define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock)
262 #define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock)
263 #define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock)
264 #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock)
265 #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
266 RA_WLOCKED)
267 #define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock)
268
269 #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \
270 "unp_defer", NULL, MTX_DEF)
271 #define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock)
272 #define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock)
273
274 #define UNP_REF_LIST_LOCK() UNP_DEFERRED_LOCK();
275 #define UNP_REF_LIST_UNLOCK() UNP_DEFERRED_UNLOCK();
276
277 #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \
278 "unp", "unp", \
279 MTX_DUPOK|MTX_DEF)
280 #define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx)
281 #define UNP_PCB_LOCKPTR(unp) (&(unp)->unp_mtx)
282 #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx)
283 #define UNP_PCB_TRYLOCK(unp) mtx_trylock(&(unp)->unp_mtx)
284 #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx)
285 #define UNP_PCB_OWNED(unp) mtx_owned(&(unp)->unp_mtx)
286 #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED)
287 #define UNP_PCB_UNLOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)
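/*
 * Lock ordering sketch, as implied by the code below: the linkage rwlock
 * and the vnode mtxpool lock are acquired before any unpcb lock; when two
 * unpcb locks are needed, unp_pcb_lock_pair() takes them in pointer order,
 * which is why unp_mtx is initialized with MTX_DUPOK.
 */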
288
289 static int uipc_connect2(struct socket *, struct socket *);
290 static int uipc_ctloutput(struct socket *, struct sockopt *);
291 static int unp_connect(struct socket *, struct sockaddr *,
292 struct thread *);
293 static int unp_connectat(int, struct socket *, struct sockaddr *,
294 struct thread *, bool);
295 static void unp_connect2(struct socket *so, struct socket *so2);
296 static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
297 static void unp_dispose(struct socket *so);
298 static void unp_shutdown(struct unpcb *);
299 static void unp_drop(struct unpcb *);
300 static void unp_gc(__unused void *, int);
301 static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
302 static void unp_discard(struct file *);
303 static void unp_freerights(struct filedescent **, int);
304 static int unp_internalize(struct mbuf **, struct thread *,
305 struct mbuf **, u_int *, u_int *);
306 static void unp_internalize_fp(struct file *);
307 static int unp_externalize(struct mbuf *, struct mbuf **, int);
308 static int unp_externalize_fp(struct file *);
309 static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *,
310 int, struct mbuf **, u_int *, u_int *);
311 static void unp_process_defers(void * __unused, int);
312
313 static void
314 unp_pcb_hold(struct unpcb *unp)
315 {
316 u_int old __unused;
317
318 old = refcount_acquire(&unp->unp_refcount);
319 KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp));
320 }
321
322 static __result_use_check bool
323 unp_pcb_rele(struct unpcb *unp)
324 {
325 bool ret;
326
327 UNP_PCB_LOCK_ASSERT(unp);
328
329 if ((ret = refcount_release(&unp->unp_refcount))) {
330 UNP_PCB_UNLOCK(unp);
331 UNP_PCB_LOCK_DESTROY(unp);
332 uma_zfree(unp_zone, unp);
333 }
334 return (ret);
335 }
336
337 static void
338 unp_pcb_rele_notlast(struct unpcb *unp)
339 {
340 bool ret __unused;
341
342 ret = refcount_release(&unp->unp_refcount);
343 KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp));
344 }
345
346 static void
347 unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2)
348 {
349 UNP_PCB_UNLOCK_ASSERT(unp);
350 UNP_PCB_UNLOCK_ASSERT(unp2);
351
352 if (unp == unp2) {
353 UNP_PCB_LOCK(unp);
354 } else if ((uintptr_t)unp2 > (uintptr_t)unp) {
355 UNP_PCB_LOCK(unp);
356 UNP_PCB_LOCK(unp2);
357 } else {
358 UNP_PCB_LOCK(unp2);
359 UNP_PCB_LOCK(unp);
360 }
361 }
362
363 static void
364 unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2)
365 {
366 UNP_PCB_UNLOCK(unp);
367 if (unp != unp2)
368 UNP_PCB_UNLOCK(unp2);
369 }
370
371 /*
372 * Try to lock the connected peer of an already locked socket. In some cases
373 * this requires that we unlock the current socket. The pairbusy counter is
374 * used to block concurrent connection attempts while the lock is dropped. The
375 * caller must be careful to revalidate PCB state.
376 */
377 static struct unpcb *
378 unp_pcb_lock_peer(struct unpcb *unp)
379 {
380 struct unpcb *unp2;
381
382 UNP_PCB_LOCK_ASSERT(unp);
383 unp2 = unp->unp_conn;
384 if (unp2 == NULL)
385 return (NULL);
386 if (__predict_false(unp == unp2))
387 return (unp);
388
389 UNP_PCB_UNLOCK_ASSERT(unp2);
390
391 if (__predict_true(UNP_PCB_TRYLOCK(unp2)))
392 return (unp2);
393 if ((uintptr_t)unp2 > (uintptr_t)unp) {
394 UNP_PCB_LOCK(unp2);
395 return (unp2);
396 }
397 unp->unp_pairbusy++;
398 unp_pcb_hold(unp2);
399 UNP_PCB_UNLOCK(unp);
400
401 UNP_PCB_LOCK(unp2);
402 UNP_PCB_LOCK(unp);
403 KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL,
404 ("%s: socket %p was reconnected", __func__, unp));
405 if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) {
406 unp->unp_flags &= ~UNP_WAITING;
407 wakeup(unp);
408 }
409 if (unp_pcb_rele(unp2)) {
410 /* unp2 is unlocked. */
411 return (NULL);
412 }
413 if (unp->unp_conn == NULL) {
414 UNP_PCB_UNLOCK(unp2);
415 return (NULL);
416 }
417 return (unp2);
418 }
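/*
 * A typical caller pattern, sketched from uipc_close() and
 * uipc_disconnect() below; unp_disconnect() consumes both locks:
 *
 *	UNP_PCB_LOCK(unp);
 *	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
 *		unp_disconnect(unp, unp2);
 *	else
 *		UNP_PCB_UNLOCK(unp);
 */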
419
420 static void
421 uipc_abort(struct socket *so)
422 {
423 struct unpcb *unp, *unp2;
424
425 unp = sotounpcb(so);
426 KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
427 UNP_PCB_UNLOCK_ASSERT(unp);
428
429 UNP_PCB_LOCK(unp);
430 unp2 = unp->unp_conn;
431 if (unp2 != NULL) {
432 unp_pcb_hold(unp2);
433 UNP_PCB_UNLOCK(unp);
434 unp_drop(unp2);
435 } else
436 UNP_PCB_UNLOCK(unp);
437 }
438
439 static int
440 uipc_attach(struct socket *so, int proto, struct thread *td)
441 {
442 u_long sendspace, recvspace;
443 struct unpcb *unp;
444 int error;
445 bool locked;
446
447 KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
448 switch (so->so_type) {
449 case SOCK_STREAM:
450 sendspace = unpst_sendspace;
451 recvspace = unpst_recvspace;
452 break;
453
454 case SOCK_DGRAM:
455 STAILQ_INIT(&so->so_rcv.uxdg_mb);
456 STAILQ_INIT(&so->so_snd.uxdg_mb);
457 TAILQ_INIT(&so->so_rcv.uxdg_conns);
458 /*
459 		 * Since the send buffer is either bypassed or is part of
460 		 * the one-to-many receive buffer, we assign both space
461 		 * limits to unpdg_recvspace.
462 */
463 sendspace = recvspace = unpdg_recvspace;
464 break;
465
466 case SOCK_SEQPACKET:
467 sendspace = unpsp_sendspace;
468 recvspace = unpsp_recvspace;
469 break;
470
471 default:
472 panic("uipc_attach");
473 }
474 error = soreserve(so, sendspace, recvspace);
475 if (error)
476 return (error);
477 unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
478 if (unp == NULL)
479 return (ENOBUFS);
480 LIST_INIT(&unp->unp_refs);
481 UNP_PCB_LOCK_INIT(unp);
482 unp->unp_socket = so;
483 so->so_pcb = unp;
484 refcount_init(&unp->unp_refcount, 1);
485 unp->unp_mode = ACCESSPERMS;
486
487 if ((locked = UNP_LINK_WOWNED()) == false)
488 UNP_LINK_WLOCK();
489
490 unp->unp_gencnt = ++unp_gencnt;
491 unp->unp_ino = ++unp_ino;
492 unp_count++;
493 switch (so->so_type) {
494 case SOCK_STREAM:
495 LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
496 break;
497
498 case SOCK_DGRAM:
499 LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
500 break;
501
502 case SOCK_SEQPACKET:
503 LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
504 break;
505
506 default:
507 panic("uipc_attach");
508 }
509
510 if (locked == false)
511 UNP_LINK_WUNLOCK();
512
513 return (0);
514 }
515
516 static int
517 uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
518 {
519 struct sockaddr_un *soun = (struct sockaddr_un *)nam;
520 struct vattr vattr;
521 int error, namelen;
522 struct nameidata nd;
523 struct unpcb *unp;
524 struct vnode *vp;
525 struct mount *mp;
526 cap_rights_t rights;
527 char *buf;
528 mode_t mode;
529
530 if (nam->sa_family != AF_UNIX)
531 return (EAFNOSUPPORT);
532
533 unp = sotounpcb(so);
534 KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
535
536 if (soun->sun_len > sizeof(struct sockaddr_un))
537 return (EINVAL);
538 namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
539 if (namelen <= 0)
540 return (EINVAL);
541
542 /*
543 * We don't allow simultaneous bind() calls on a single UNIX domain
544 * socket, so flag in-progress operations, and return an error if an
545 * operation is already in progress.
546 *
547 * Historically, we have not allowed a socket to be rebound, so this
548 * also returns an error. Not allowing re-binding simplifies the
549 * implementation and avoids a great many possible failure modes.
550 */
551 UNP_PCB_LOCK(unp);
552 if (unp->unp_vnode != NULL) {
553 UNP_PCB_UNLOCK(unp);
554 return (EINVAL);
555 }
556 if (unp->unp_flags & UNP_BINDING) {
557 UNP_PCB_UNLOCK(unp);
558 return (EALREADY);
559 }
560 unp->unp_flags |= UNP_BINDING;
561 mode = unp->unp_mode & ~td->td_proc->p_pd->pd_cmask;
562 UNP_PCB_UNLOCK(unp);
563
564 buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
565 bcopy(soun->sun_path, buf, namelen);
566 buf[namelen] = 0;
567
568 restart:
569 NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | NOCACHE,
570 UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT));
571 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
572 error = namei(&nd);
573 if (error)
574 goto error;
575 vp = nd.ni_vp;
576 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
577 NDFREE_PNBUF(&nd);
578 if (nd.ni_dvp == vp)
579 vrele(nd.ni_dvp);
580 else
581 vput(nd.ni_dvp);
582 if (vp != NULL) {
583 vrele(vp);
584 error = EADDRINUSE;
585 goto error;
586 }
587 error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH);
588 if (error)
589 goto error;
590 goto restart;
591 }
592 VATTR_NULL(&vattr);
593 vattr.va_type = VSOCK;
594 vattr.va_mode = mode;
595 #ifdef MAC
596 error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
597 &vattr);
598 #endif
599 if (error == 0) {
600 /*
601 * The prior lookup may have left LK_SHARED in cn_lkflags,
602 * and VOP_CREATE technically only requires the new vnode to
603 * be locked shared. Most filesystems will return the new vnode
604 * locked exclusive regardless, but we should explicitly
605 * specify that here since we require it and assert to that
606 * effect below.
607 */
608 nd.ni_cnd.cn_lkflags = (nd.ni_cnd.cn_lkflags & ~LK_SHARED) |
609 LK_EXCLUSIVE;
610 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
611 }
612 NDFREE_PNBUF(&nd);
613 if (error) {
614 VOP_VPUT_PAIR(nd.ni_dvp, NULL, true);
615 vn_finished_write(mp);
616 if (error == ERELOOKUP)
617 goto restart;
618 goto error;
619 }
620 vp = nd.ni_vp;
621 ASSERT_VOP_ELOCKED(vp, "uipc_bind");
622 soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
623
624 UNP_PCB_LOCK(unp);
625 VOP_UNP_BIND(vp, unp);
626 unp->unp_vnode = vp;
627 unp->unp_addr = soun;
628 unp->unp_flags &= ~UNP_BINDING;
629 UNP_PCB_UNLOCK(unp);
630 vref(vp);
631 VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
632 vn_finished_write(mp);
633 free(buf, M_TEMP);
634 return (0);
635
636 error:
637 UNP_PCB_LOCK(unp);
638 unp->unp_flags &= ~UNP_BINDING;
639 UNP_PCB_UNLOCK(unp);
640 free(buf, M_TEMP);
641 return (error);
642 }
643
644 static int
645 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
646 {
647
648 return (uipc_bindat(AT_FDCWD, so, nam, td));
649 }
650
651 static int
652 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
653 {
654 int error;
655
656 KASSERT(td == curthread, ("uipc_connect: td != curthread"));
657 error = unp_connect(so, nam, td);
658 return (error);
659 }
660
661 static int
662 uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
663 struct thread *td)
664 {
665 int error;
666
667 KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
668 error = unp_connectat(fd, so, nam, td, false);
669 return (error);
670 }
671
672 static void
673 uipc_close(struct socket *so)
674 {
675 struct unpcb *unp, *unp2;
676 struct vnode *vp = NULL;
677 struct mtx *vplock;
678
679 unp = sotounpcb(so);
680 KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
681
682 vplock = NULL;
683 if ((vp = unp->unp_vnode) != NULL) {
684 vplock = mtx_pool_find(unp_vp_mtxpool, vp);
685 mtx_lock(vplock);
686 }
687 UNP_PCB_LOCK(unp);
688 if (vp && unp->unp_vnode == NULL) {
689 mtx_unlock(vplock);
690 vp = NULL;
691 }
692 if (vp != NULL) {
693 VOP_UNP_DETACH(vp);
694 unp->unp_vnode = NULL;
695 }
696 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
697 unp_disconnect(unp, unp2);
698 else
699 UNP_PCB_UNLOCK(unp);
700 if (vp) {
701 mtx_unlock(vplock);
702 vrele(vp);
703 }
704 }
705
706 static int
707 uipc_chmod(struct socket *so, mode_t mode, struct ucred *cred __unused,
708 struct thread *td __unused)
709 {
710 struct unpcb *unp;
711 int error;
712
713 if ((mode & ~ACCESSPERMS) != 0)
714 return (EINVAL);
715
716 error = 0;
717 unp = sotounpcb(so);
718 UNP_PCB_LOCK(unp);
719 if (unp->unp_vnode != NULL || (unp->unp_flags & UNP_BINDING) != 0)
720 error = EINVAL;
721 else
722 unp->unp_mode = mode;
723 UNP_PCB_UNLOCK(unp);
724 return (error);
725 }
726
727 static int
728 uipc_connect2(struct socket *so1, struct socket *so2)
729 {
730 struct unpcb *unp, *unp2;
731
732 if (so1->so_type != so2->so_type)
733 return (EPROTOTYPE);
734
735 unp = so1->so_pcb;
736 KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
737 unp2 = so2->so_pcb;
738 KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
739 unp_pcb_lock_pair(unp, unp2);
740 unp_connect2(so1, so2);
741 unp_pcb_unlock_pair(unp, unp2);
742
743 return (0);
744 }
745
746 static void
747 uipc_detach(struct socket *so)
748 {
749 struct unpcb *unp, *unp2;
750 struct mtx *vplock;
751 struct vnode *vp;
752 int local_unp_rights;
753
754 unp = sotounpcb(so);
755 KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
756
757 vp = NULL;
758 vplock = NULL;
759
760 if (!SOLISTENING(so))
761 unp_dispose(so);
762
763 UNP_LINK_WLOCK();
764 LIST_REMOVE(unp, unp_link);
765 if (unp->unp_gcflag & UNPGC_DEAD)
766 LIST_REMOVE(unp, unp_dead);
767 unp->unp_gencnt = ++unp_gencnt;
768 --unp_count;
769 UNP_LINK_WUNLOCK();
770
771 UNP_PCB_UNLOCK_ASSERT(unp);
772 restart:
773 if ((vp = unp->unp_vnode) != NULL) {
774 vplock = mtx_pool_find(unp_vp_mtxpool, vp);
775 mtx_lock(vplock);
776 }
777 UNP_PCB_LOCK(unp);
778 if (unp->unp_vnode != vp && unp->unp_vnode != NULL) {
779 if (vplock)
780 mtx_unlock(vplock);
781 UNP_PCB_UNLOCK(unp);
782 goto restart;
783 }
784 if ((vp = unp->unp_vnode) != NULL) {
785 VOP_UNP_DETACH(vp);
786 unp->unp_vnode = NULL;
787 }
788 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
789 unp_disconnect(unp, unp2);
790 else
791 UNP_PCB_UNLOCK(unp);
792
793 UNP_REF_LIST_LOCK();
794 while (!LIST_EMPTY(&unp->unp_refs)) {
795 struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
796
797 unp_pcb_hold(ref);
798 UNP_REF_LIST_UNLOCK();
799
800 MPASS(ref != unp);
801 UNP_PCB_UNLOCK_ASSERT(ref);
802 unp_drop(ref);
803 UNP_REF_LIST_LOCK();
804 }
805 UNP_REF_LIST_UNLOCK();
806
807 UNP_PCB_LOCK(unp);
808 local_unp_rights = unp_rights;
809 unp->unp_socket->so_pcb = NULL;
810 unp->unp_socket = NULL;
811 free(unp->unp_addr, M_SONAME);
812 unp->unp_addr = NULL;
813 if (!unp_pcb_rele(unp))
814 UNP_PCB_UNLOCK(unp);
815 if (vp) {
816 mtx_unlock(vplock);
817 vrele(vp);
818 }
819 if (local_unp_rights)
820 taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
821
822 switch (so->so_type) {
823 case SOCK_DGRAM:
824 /*
825 * Everything should have been unlinked/freed by unp_dispose()
826 * and/or unp_disconnect().
827 */
828 MPASS(so->so_rcv.uxdg_peeked == NULL);
829 MPASS(STAILQ_EMPTY(&so->so_rcv.uxdg_mb));
830 MPASS(TAILQ_EMPTY(&so->so_rcv.uxdg_conns));
831 MPASS(STAILQ_EMPTY(&so->so_snd.uxdg_mb));
832 }
833 }
834
835 static int
836 uipc_disconnect(struct socket *so)
837 {
838 struct unpcb *unp, *unp2;
839
840 unp = sotounpcb(so);
841 KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
842
843 UNP_PCB_LOCK(unp);
844 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
845 unp_disconnect(unp, unp2);
846 else
847 UNP_PCB_UNLOCK(unp);
848 return (0);
849 }
850
851 static int
852 uipc_listen(struct socket *so, int backlog, struct thread *td)
853 {
854 struct unpcb *unp;
855 int error;
856
857 MPASS(so->so_type != SOCK_DGRAM);
858
859 /*
860 * Synchronize with concurrent connection attempts.
861 */
862 error = 0;
863 unp = sotounpcb(so);
864 UNP_PCB_LOCK(unp);
865 if (unp->unp_conn != NULL || (unp->unp_flags & UNP_CONNECTING) != 0)
866 error = EINVAL;
867 else if (unp->unp_vnode == NULL)
868 error = EDESTADDRREQ;
869 if (error != 0) {
870 UNP_PCB_UNLOCK(unp);
871 return (error);
872 }
873
874 SOCK_LOCK(so);
875 error = solisten_proto_check(so);
876 if (error == 0) {
877 cru2xt(td, &unp->unp_peercred);
878 solisten_proto(so, backlog);
879 }
880 SOCK_UNLOCK(so);
881 UNP_PCB_UNLOCK(unp);
882 return (error);
883 }
884
885 static int
886 uipc_peeraddr(struct socket *so, struct sockaddr *ret)
887 {
888 struct unpcb *unp, *unp2;
889 const struct sockaddr *sa;
890
891 unp = sotounpcb(so);
892 KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
893
894 UNP_PCB_LOCK(unp);
895 unp2 = unp_pcb_lock_peer(unp);
896 if (unp2 != NULL) {
897 if (unp2->unp_addr != NULL)
898 sa = (struct sockaddr *)unp2->unp_addr;
899 else
900 sa = &sun_noname;
901 bcopy(sa, ret, sa->sa_len);
902 unp_pcb_unlock_pair(unp, unp2);
903 } else {
904 UNP_PCB_UNLOCK(unp);
905 sa = &sun_noname;
906 bcopy(sa, ret, sa->sa_len);
907 }
908 return (0);
909 }
910
911 static int
912 uipc_rcvd(struct socket *so, int flags)
913 {
914 struct unpcb *unp, *unp2;
915 struct socket *so2;
916 u_int mbcnt, sbcc;
917
918 unp = sotounpcb(so);
919 KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
920 KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
921 ("%s: socktype %d", __func__, so->so_type));
922
923 /*
924 	 * Adjust backpressure on sender and wake up anyone waiting to write.
925 *
926 * The unp lock is acquired to maintain the validity of the unp_conn
927 * pointer; no lock on unp2 is required as unp2->unp_socket will be
928 * static as long as we don't permit unp2 to disconnect from unp,
929 * which is prevented by the lock on unp. We cache values from
930 * so_rcv to avoid holding the so_rcv lock over the entire
931 * transaction on the remote so_snd.
932 */
933 SOCKBUF_LOCK(&so->so_rcv);
934 mbcnt = so->so_rcv.sb_mbcnt;
935 sbcc = sbavail(&so->so_rcv);
936 SOCKBUF_UNLOCK(&so->so_rcv);
937 /*
938 * There is a benign race condition at this point. If we're planning to
939 * clear SB_STOP, but uipc_send is called on the connected socket at
940 * this instant, it might add data to the sockbuf and set SB_STOP. Then
941 * we would erroneously clear SB_STOP below, even though the sockbuf is
942 * full. The race is benign because the only ill effect is to allow the
943 * sockbuf to exceed its size limit, and the size limits are not
944 * strictly guaranteed anyway.
945 */
946 UNP_PCB_LOCK(unp);
947 unp2 = unp->unp_conn;
948 if (unp2 == NULL) {
949 UNP_PCB_UNLOCK(unp);
950 return (0);
951 }
952 so2 = unp2->unp_socket;
953 SOCKBUF_LOCK(&so2->so_snd);
954 if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
955 so2->so_snd.sb_flags &= ~SB_STOP;
956 sowwakeup_locked(so2);
957 UNP_PCB_UNLOCK(unp);
958 return (0);
959 }
960
961 static int
962 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
963 struct mbuf *control, struct thread *td)
964 {
965 struct unpcb *unp, *unp2;
966 struct socket *so2;
967 u_int mbcnt, sbcc;
968 int error;
969
970 unp = sotounpcb(so);
971 KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
972 KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
973 ("%s: socktype %d", __func__, so->so_type));
974
975 error = 0;
976 if (flags & PRUS_OOB) {
977 error = EOPNOTSUPP;
978 goto release;
979 }
980 if (control != NULL &&
981 (error = unp_internalize(&control, td, NULL, NULL, NULL)))
982 goto release;
983
984 unp2 = NULL;
985 if ((so->so_state & SS_ISCONNECTED) == 0) {
986 if (nam != NULL) {
987 if ((error = unp_connect(so, nam, td)) != 0)
988 goto out;
989 } else {
990 error = ENOTCONN;
991 goto out;
992 }
993 }
994
995 UNP_PCB_LOCK(unp);
996 if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) {
997 UNP_PCB_UNLOCK(unp);
998 error = ENOTCONN;
999 goto out;
1000 } else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1001 unp_pcb_unlock_pair(unp, unp2);
1002 error = EPIPE;
1003 goto out;
1004 }
1005 UNP_PCB_UNLOCK(unp);
1006 if ((so2 = unp2->unp_socket) == NULL) {
1007 UNP_PCB_UNLOCK(unp2);
1008 error = ENOTCONN;
1009 goto out;
1010 }
1011 SOCKBUF_LOCK(&so2->so_rcv);
1012 if (unp2->unp_flags & UNP_WANTCRED_MASK) {
1013 /*
1014 * Credentials are passed only once on SOCK_STREAM and
1015 * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
1016 * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
1017 */
1018 control = unp_addsockcred(td, control, unp2->unp_flags, NULL,
1019 NULL, NULL);
1020 unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
1021 }
1022
1023 /*
1024 * Send to paired receive port and wake up readers. Don't
1025 * check for space available in the receive buffer if we're
1026 * attaching ancillary data; Unix domain sockets only check
1027 * for space in the sending sockbuf, and that check is
1028 * performed one level up the stack. At that level we cannot
1029 * precisely account for the amount of buffer space used
1030 * (e.g., because control messages are not yet internalized).
1031 */
1032 switch (so->so_type) {
1033 case SOCK_STREAM:
1034 if (control != NULL) {
1035 sbappendcontrol_locked(&so2->so_rcv,
1036 m->m_len > 0 ? m : NULL, control, flags);
1037 control = NULL;
1038 } else
1039 sbappend_locked(&so2->so_rcv, m, flags);
1040 break;
1041
1042 case SOCK_SEQPACKET:
1043 if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
1044 &sun_noname, m, control))
1045 control = NULL;
1046 break;
1047 }
1048
1049 mbcnt = so2->so_rcv.sb_mbcnt;
1050 sbcc = sbavail(&so2->so_rcv);
1051 if (sbcc)
1052 sorwakeup_locked(so2);
1053 else
1054 SOCKBUF_UNLOCK(&so2->so_rcv);
1055
1056 /*
1057 * The PCB lock on unp2 protects the SB_STOP flag. Without it,
1058 * it would be possible for uipc_rcvd to be called at this
1059 * point, drain the receiving sockbuf, clear SB_STOP, and then
1060 * we would set SB_STOP below. That could lead to an empty
1061 	 * sockbuf having SB_STOP set.
1062 */
1063 SOCKBUF_LOCK(&so->so_snd);
1064 if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
1065 so->so_snd.sb_flags |= SB_STOP;
1066 SOCKBUF_UNLOCK(&so->so_snd);
1067 UNP_PCB_UNLOCK(unp2);
1068 m = NULL;
1069 out:
1070 /*
1071 * PRUS_EOF is equivalent to pr_send followed by pr_shutdown.
1072 */
1073 if (flags & PRUS_EOF) {
1074 UNP_PCB_LOCK(unp);
1075 socantsendmore(so);
1076 unp_shutdown(unp);
1077 UNP_PCB_UNLOCK(unp);
1078 }
1079 if (control != NULL && error != 0)
1080 unp_scan(control, unp_freerights);
1081
1082 release:
1083 if (control != NULL)
1084 m_freem(control);
1085 /*
1086 * In case of PRUS_NOTREADY, uipc_ready() is responsible
1087 * for freeing memory.
1088 */
1089 if (m != NULL && (flags & PRUS_NOTREADY) == 0)
1090 m_freem(m);
1091 return (error);
1092 }
1093
1094 /* PF_UNIX/SOCK_DGRAM version of sbspace() */
1095 static inline bool
1096 uipc_dgram_sbspace(struct sockbuf *sb, u_int cc, u_int mbcnt)
1097 {
1098 u_int bleft, mleft;
1099
1100 /*
1101 * Negative space may happen if send(2) is followed by
1102 * setsockopt(SO_SNDBUF/SO_RCVBUF) that shrinks maximum.
1103 */
1104 if (__predict_false(sb->sb_hiwat < sb->uxdg_cc ||
1105 sb->sb_mbmax < sb->uxdg_mbcnt))
1106 return (false);
1107
1108 if (__predict_false(sb->sb_state & SBS_CANTRCVMORE))
1109 return (false);
1110
1111 bleft = sb->sb_hiwat - sb->uxdg_cc;
1112 mleft = sb->sb_mbmax - sb->uxdg_mbcnt;
1113
1114 return (bleft >= cc && mleft >= mbcnt);
1115 }
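/*
 * Worked example: with sb_hiwat = 16384 and uxdg_cc = 12000, bleft is
 * 4384, so a record with cc + ctl = 4096 (the value callers pass as cc)
 * still fits, provided the mbuf accounting check (mleft >= mbcnt) also
 * holds.
 */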
1116
1117 /*
1118 * PF_UNIX/SOCK_DGRAM send
1119 *
1120 * Allocate a record consisting of 3 mbufs in the sequence of
1121 * from -> control -> data and append it to the socket buffer.
1122 *
1123 * The first mbuf carries sender's name and is a pkthdr that stores
1124 * overall length of datagram, its memory consumption and control length.
1125 */
1126 #define ctllen PH_loc.thirtytwo[1]
1127 _Static_assert(offsetof(struct pkthdr, memlen) + sizeof(u_int) <=
1128 offsetof(struct pkthdr, ctllen), "unix/dgram can not store ctllen");
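/*
 * Resulting record layout, as assembled by uipc_sosend_dgram() below:
 *
 *	MT_SONAME (pkthdr: len = cc + ctl, memlen = mbcnt, ctllen = ctl)
 *	  -> MT_CONTROL (zero or more, optional)
 *	    -> MT_DATA (zero or more)
 */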
1129 static int
1130 uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
1131 struct mbuf *m, struct mbuf *c, int flags, struct thread *td)
1132 {
1133 struct unpcb *unp, *unp2;
1134 const struct sockaddr *from;
1135 struct socket *so2;
1136 struct sockbuf *sb;
1137 struct mbuf *f, *clast;
1138 u_int cc, ctl, mbcnt;
1139 u_int dcc __diagused, dctl __diagused, dmbcnt __diagused;
1140 int error;
1141
1142 MPASS((uio != NULL && m == NULL) || (m != NULL && uio == NULL));
1143
1144 error = 0;
1145 f = NULL;
1146 ctl = 0;
1147
1148 if (__predict_false(flags & MSG_OOB)) {
1149 error = EOPNOTSUPP;
1150 goto out;
1151 }
1152 if (m == NULL) {
1153 if (__predict_false(uio->uio_resid > unpdg_maxdgram)) {
1154 error = EMSGSIZE;
1155 goto out;
1156 }
1157 m = m_uiotombuf(uio, M_WAITOK, 0, max_hdr, M_PKTHDR);
1158 if (__predict_false(m == NULL)) {
1159 error = EFAULT;
1160 goto out;
1161 }
1162 f = m_gethdr(M_WAITOK, MT_SONAME);
1163 cc = m->m_pkthdr.len;
1164 mbcnt = MSIZE + m->m_pkthdr.memlen;
1165 if (c != NULL &&
1166 (error = unp_internalize(&c, td, &clast, &ctl, &mbcnt)))
1167 goto out;
1168 } else {
1169 /* pr_sosend() with mbuf usually is a kernel thread. */
1170
1171 M_ASSERTPKTHDR(m);
1172 if (__predict_false(c != NULL))
1173 panic("%s: control from a kernel thread", __func__);
1174
1175 if (__predict_false(m->m_pkthdr.len > unpdg_maxdgram)) {
1176 error = EMSGSIZE;
1177 goto out;
1178 }
1179 if ((f = m_gethdr(M_NOWAIT, MT_SONAME)) == NULL) {
1180 error = ENOBUFS;
1181 goto out;
1182 }
1183 /* Condition the foreign mbuf to our standards. */
1184 m_clrprotoflags(m);
1185 m_tag_delete_chain(m, NULL);
1186 m->m_pkthdr.rcvif = NULL;
1187 m->m_pkthdr.flowid = 0;
1188 m->m_pkthdr.csum_flags = 0;
1189 m->m_pkthdr.fibnum = 0;
1190 m->m_pkthdr.rsstype = 0;
1191
1192 cc = m->m_pkthdr.len;
1193 mbcnt = MSIZE;
1194 for (struct mbuf *mb = m; mb != NULL; mb = mb->m_next) {
1195 mbcnt += MSIZE;
1196 if (mb->m_flags & M_EXT)
1197 mbcnt += mb->m_ext.ext_size;
1198 }
1199 }
1200
1201 unp = sotounpcb(so);
1202 MPASS(unp);
1203
1204 /*
1205 * XXXGL: would be cool to fully remove so_snd out of the equation
1206 * and avoid this lock, which is not only extraneous, but also being
1207 * released, thus still leaving possibility for a race. We can easily
1208 * handle SBS_CANTSENDMORE/SS_ISCONNECTED complement in unpcb, but it
1209 * is more difficult to invent something to handle so_error.
1210 */
1211 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
1212 if (error)
1213 goto out2;
1214 SOCK_SENDBUF_LOCK(so);
1215 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1216 SOCK_SENDBUF_UNLOCK(so);
1217 error = EPIPE;
1218 goto out3;
1219 }
1220 if (so->so_error != 0) {
1221 error = so->so_error;
1222 so->so_error = 0;
1223 SOCK_SENDBUF_UNLOCK(so);
1224 goto out3;
1225 }
1226 if (((so->so_state & SS_ISCONNECTED) == 0) && addr == NULL) {
1227 SOCK_SENDBUF_UNLOCK(so);
1228 error = EDESTADDRREQ;
1229 goto out3;
1230 }
1231 SOCK_SENDBUF_UNLOCK(so);
1232
1233 if (addr != NULL) {
1234 if ((error = unp_connectat(AT_FDCWD, so, addr, td, true)))
1235 goto out3;
1236 UNP_PCB_LOCK_ASSERT(unp);
1237 unp2 = unp->unp_conn;
1238 UNP_PCB_LOCK_ASSERT(unp2);
1239 } else {
1240 UNP_PCB_LOCK(unp);
1241 unp2 = unp_pcb_lock_peer(unp);
1242 if (unp2 == NULL) {
1243 UNP_PCB_UNLOCK(unp);
1244 error = ENOTCONN;
1245 goto out3;
1246 }
1247 }
1248
1249 if (unp2->unp_flags & UNP_WANTCRED_MASK)
1250 c = unp_addsockcred(td, c, unp2->unp_flags, &clast, &ctl,
1251 &mbcnt);
1252 if (unp->unp_addr != NULL)
1253 from = (struct sockaddr *)unp->unp_addr;
1254 else
1255 from = &sun_noname;
1256 f->m_len = from->sa_len;
1257 MPASS(from->sa_len <= MLEN);
1258 bcopy(from, mtod(f, void *), from->sa_len);
1259 ctl += f->m_len;
1260
1261 /*
1262 * Concatenate mbufs: from -> control -> data.
1263 * Save overall cc and mbcnt in "from" mbuf.
1264 */
1265 if (c != NULL) {
1266 #ifdef INVARIANTS
1267 struct mbuf *mc;
1268
1269 for (mc = c; mc->m_next != NULL; mc = mc->m_next);
1270 MPASS(mc == clast);
1271 #endif
1272 f->m_next = c;
1273 clast->m_next = m;
1274 c = NULL;
1275 } else
1276 f->m_next = m;
1277 m = NULL;
1278 #ifdef INVARIANTS
1279 dcc = dctl = dmbcnt = 0;
1280 for (struct mbuf *mb = f; mb != NULL; mb = mb->m_next) {
1281 if (mb->m_type == MT_DATA)
1282 dcc += mb->m_len;
1283 else
1284 dctl += mb->m_len;
1285 dmbcnt += MSIZE;
1286 if (mb->m_flags & M_EXT)
1287 dmbcnt += mb->m_ext.ext_size;
1288 }
1289 MPASS(dcc == cc);
1290 MPASS(dctl == ctl);
1291 MPASS(dmbcnt == mbcnt);
1292 #endif
1293 f->m_pkthdr.len = cc + ctl;
1294 f->m_pkthdr.memlen = mbcnt;
1295 f->m_pkthdr.ctllen = ctl;
1296
1297 /*
1298 * Destination socket buffer selection.
1299 *
1300 * Unconnected sends, when !(so->so_state & SS_ISCONNECTED) and the
1301 * destination address is supplied, create a temporary connection for
1302 * the run time of the function (see call to unp_connectat() above and
1303 * to unp_disconnect() below). We distinguish them by condition of
1304 * (addr != NULL). We intentionally avoid adding 'bool connected' for
1305 * that condition, since, again, through the run time of this code we
1306 * are always connected. For such "unconnected" sends, the destination
1307 * buffer would be the receive buffer of destination socket so2.
1308 *
1309 * For connected sends, data lands on the send buffer of the sender's
1310 * socket "so". Then, if we just added the very first datagram
1311 * on this send buffer, we need to add the send buffer on to the
1312 * receiving socket's buffer list. We put ourselves on top of the
1313 * list. Such logic gives infrequent senders priority over frequent
1314 * senders.
1315 *
1316 * Note on byte count management. As long as event methods kevent(2),
1317 * select(2) are not protocol specific (yet), we need to maintain
1318 * meaningful values on the receive buffer. So, the receive buffer
1319 * would accumulate counters from all connected buffers potentially
1320 * having sb_ccc > sb_hiwat or sb_mbcnt > sb_mbmax.
1321 */
1322 so2 = unp2->unp_socket;
1323 sb = (addr == NULL) ? &so->so_snd : &so2->so_rcv;
1324 SOCK_RECVBUF_LOCK(so2);
1325 if (uipc_dgram_sbspace(sb, cc + ctl, mbcnt)) {
1326 if (addr == NULL && STAILQ_EMPTY(&sb->uxdg_mb))
1327 TAILQ_INSERT_HEAD(&so2->so_rcv.uxdg_conns, &so->so_snd,
1328 uxdg_clist);
1329 STAILQ_INSERT_TAIL(&sb->uxdg_mb, f, m_stailqpkt);
1330 sb->uxdg_cc += cc + ctl;
1331 sb->uxdg_ctl += ctl;
1332 sb->uxdg_mbcnt += mbcnt;
1333 so2->so_rcv.sb_acc += cc + ctl;
1334 so2->so_rcv.sb_ccc += cc + ctl;
1335 so2->so_rcv.sb_ctl += ctl;
1336 so2->so_rcv.sb_mbcnt += mbcnt;
1337 sorwakeup_locked(so2);
1338 f = NULL;
1339 } else {
1340 soroverflow_locked(so2);
1341 error = ENOBUFS;
1342 if (f->m_next->m_type == MT_CONTROL) {
1343 c = f->m_next;
1344 f->m_next = NULL;
1345 }
1346 }
1347
1348 if (addr != NULL)
1349 unp_disconnect(unp, unp2);
1350 else
1351 unp_pcb_unlock_pair(unp, unp2);
1352
1353 td->td_ru.ru_msgsnd++;
1354
1355 out3:
1356 SOCK_IO_SEND_UNLOCK(so);
1357 out2:
1358 if (c)
1359 unp_scan(c, unp_freerights);
1360 out:
1361 if (f)
1362 m_freem(f);
1363 if (c)
1364 m_freem(c);
1365 if (m)
1366 m_freem(m);
1367
1368 return (error);
1369 }
1370
1371 /*
1372 * PF_UNIX/SOCK_DGRAM receive with MSG_PEEK.
1373 * The mbuf has already been unlinked from the uxdg_mb of socket buffer
1374 * and needs to be linked onto uxdg_peeked of receive socket buffer.
1375 */
1376 static int
1377 uipc_peek_dgram(struct socket *so, struct mbuf *m, struct sockaddr **psa,
1378 struct uio *uio, struct mbuf **controlp, int *flagsp)
1379 {
1380 ssize_t len = 0;
1381 int error;
1382
1383 so->so_rcv.uxdg_peeked = m;
1384 so->so_rcv.uxdg_cc += m->m_pkthdr.len;
1385 so->so_rcv.uxdg_ctl += m->m_pkthdr.ctllen;
1386 so->so_rcv.uxdg_mbcnt += m->m_pkthdr.memlen;
1387 SOCK_RECVBUF_UNLOCK(so);
1388
1389 KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
1390 if (psa != NULL)
1391 *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
1392
1393 m = m->m_next;
1394 KASSERT(m, ("%s: no data or control after soname", __func__));
1395
1396 /*
1397 * With MSG_PEEK the control isn't executed, just copied.
1398 */
1399 while (m != NULL && m->m_type == MT_CONTROL) {
1400 if (controlp != NULL) {
1401 *controlp = m_copym(m, 0, m->m_len, M_WAITOK);
1402 controlp = &(*controlp)->m_next;
1403 }
1404 m = m->m_next;
1405 }
1406 KASSERT(m == NULL || m->m_type == MT_DATA,
1407 ("%s: not MT_DATA mbuf %p", __func__, m));
1408 while (m != NULL && uio->uio_resid > 0) {
1409 len = uio->uio_resid;
1410 if (len > m->m_len)
1411 len = m->m_len;
1412 error = uiomove(mtod(m, char *), (int)len, uio);
1413 if (error) {
1414 SOCK_IO_RECV_UNLOCK(so);
1415 return (error);
1416 }
1417 if (len == m->m_len)
1418 m = m->m_next;
1419 }
1420 SOCK_IO_RECV_UNLOCK(so);
1421
1422 if (flagsp != NULL) {
1423 if (m != NULL) {
1424 if (*flagsp & MSG_TRUNC) {
1425 /* Report real length of the packet */
1426 uio->uio_resid -= m_length(m, NULL) - len;
1427 }
1428 *flagsp |= MSG_TRUNC;
1429 } else
1430 *flagsp &= ~MSG_TRUNC;
1431 }
1432
1433 return (0);
1434 }
1435
1436 /*
1437 * PF_UNIX/SOCK_DGRAM receive
1438 */
1439 static int
1440 uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
1441 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1442 {
1443 struct sockbuf *sb = NULL;
1444 struct mbuf *m;
1445 int flags, error;
1446 ssize_t len = 0;
1447 bool nonblock;
1448
1449 MPASS(mp0 == NULL);
1450
1451 if (psa != NULL)
1452 *psa = NULL;
1453 if (controlp != NULL)
1454 *controlp = NULL;
1455
1456 flags = flagsp != NULL ? *flagsp : 0;
1457 nonblock = (so->so_state & SS_NBIO) ||
1458 (flags & (MSG_DONTWAIT | MSG_NBIO));
1459
1460 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
1461 if (__predict_false(error))
1462 return (error);
1463
1464 /*
1465 * Loop blocking while waiting for a datagram. Prioritize connected
1466 * peers over unconnected sends. Set sb to selected socket buffer
1467 * containing an mbuf on exit from the wait loop. A datagram that
1468 * had already been peeked at has top priority.
1469 */
1470 SOCK_RECVBUF_LOCK(so);
1471 while ((m = so->so_rcv.uxdg_peeked) == NULL &&
1472 (sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) == NULL &&
1473 (m = STAILQ_FIRST(&so->so_rcv.uxdg_mb)) == NULL) {
1474 if (so->so_error) {
1475 error = so->so_error;
1476 if (!(flags & MSG_PEEK))
1477 so->so_error = 0;
1478 SOCK_RECVBUF_UNLOCK(so);
1479 SOCK_IO_RECV_UNLOCK(so);
1480 return (error);
1481 }
1482 if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
1483 uio->uio_resid == 0) {
1484 SOCK_RECVBUF_UNLOCK(so);
1485 SOCK_IO_RECV_UNLOCK(so);
1486 return (0);
1487 }
1488 if (nonblock) {
1489 SOCK_RECVBUF_UNLOCK(so);
1490 SOCK_IO_RECV_UNLOCK(so);
1491 return (EWOULDBLOCK);
1492 }
1493 error = sbwait(so, SO_RCV);
1494 if (error) {
1495 SOCK_RECVBUF_UNLOCK(so);
1496 SOCK_IO_RECV_UNLOCK(so);
1497 return (error);
1498 }
1499 }
1500
1501 if (sb == NULL)
1502 sb = &so->so_rcv;
1503 else if (m == NULL)
1504 m = STAILQ_FIRST(&sb->uxdg_mb);
1505 else
1506 MPASS(m == so->so_rcv.uxdg_peeked);
1507
1508 MPASS(sb->uxdg_cc > 0);
1509 M_ASSERTPKTHDR(m);
1510 KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
1511
1512 if (uio->uio_td)
1513 uio->uio_td->td_ru.ru_msgrcv++;
1514
1515 if (__predict_true(m != so->so_rcv.uxdg_peeked)) {
1516 STAILQ_REMOVE_HEAD(&sb->uxdg_mb, m_stailqpkt);
1517 if (STAILQ_EMPTY(&sb->uxdg_mb) && sb != &so->so_rcv)
1518 TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist);
1519 } else
1520 so->so_rcv.uxdg_peeked = NULL;
1521
1522 sb->uxdg_cc -= m->m_pkthdr.len;
1523 sb->uxdg_ctl -= m->m_pkthdr.ctllen;
1524 sb->uxdg_mbcnt -= m->m_pkthdr.memlen;
1525
1526 if (__predict_false(flags & MSG_PEEK))
1527 return (uipc_peek_dgram(so, m, psa, uio, controlp, flagsp));
1528
1529 so->so_rcv.sb_acc -= m->m_pkthdr.len;
1530 so->so_rcv.sb_ccc -= m->m_pkthdr.len;
1531 so->so_rcv.sb_ctl -= m->m_pkthdr.ctllen;
1532 so->so_rcv.sb_mbcnt -= m->m_pkthdr.memlen;
1533 SOCK_RECVBUF_UNLOCK(so);
1534
1535 if (psa != NULL)
1536 *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
1537 m = m_free(m);
1538 KASSERT(m, ("%s: no data or control after soname", __func__));
1539
1540 /*
1541 * Packet to copyout() is now in 'm' and it is disconnected from the
1542 * queue.
1543 *
1544 * Process one or more MT_CONTROL mbufs present before any data mbufs
1545 * in the first mbuf chain on the socket buffer. We call into the
1546 * unp_externalize() to perform externalization (or freeing if
1547 * controlp == NULL). In some cases there can be only MT_CONTROL mbufs
1548 * without MT_DATA mbufs.
1549 */
1550 while (m != NULL && m->m_type == MT_CONTROL) {
1551 struct mbuf *cm;
1552
1553 /* XXXGL: unp_externalize() is also dom_externalize() KBI and
1554 		 * it frees the whole chain, so we must disconnect the mbuf.
1555 */
1556 cm = m; m = m->m_next; cm->m_next = NULL;
1557 error = unp_externalize(cm, controlp, flags);
1558 if (error != 0) {
1559 SOCK_IO_RECV_UNLOCK(so);
1560 unp_scan(m, unp_freerights);
1561 m_freem(m);
1562 return (error);
1563 }
1564 if (controlp != NULL) {
1565 while (*controlp != NULL)
1566 controlp = &(*controlp)->m_next;
1567 }
1568 }
1569 KASSERT(m == NULL || m->m_type == MT_DATA,
1570 ("%s: not MT_DATA mbuf %p", __func__, m));
1571 while (m != NULL && uio->uio_resid > 0) {
1572 len = uio->uio_resid;
1573 if (len > m->m_len)
1574 len = m->m_len;
1575 error = uiomove(mtod(m, char *), (int)len, uio);
1576 if (error) {
1577 SOCK_IO_RECV_UNLOCK(so);
1578 m_freem(m);
1579 return (error);
1580 }
1581 if (len == m->m_len)
1582 m = m_free(m);
1583 else {
1584 m->m_data += len;
1585 m->m_len -= len;
1586 }
1587 }
1588 SOCK_IO_RECV_UNLOCK(so);
1589
1590 if (m != NULL) {
1591 if (flagsp != NULL) {
1592 if (flags & MSG_TRUNC) {
1593 /* Report real length of the packet */
1594 uio->uio_resid -= m_length(m, NULL);
1595 }
1596 *flagsp |= MSG_TRUNC;
1597 }
1598 m_freem(m);
1599 } else if (flagsp != NULL)
1600 *flagsp &= ~MSG_TRUNC;
1601
1602 return (0);
1603 }
1604
1605 static bool
1606 uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
1607 {
1608 struct mbuf *mb, *n;
1609 struct sockbuf *sb;
1610
1611 SOCK_LOCK(so);
1612 if (SOLISTENING(so)) {
1613 SOCK_UNLOCK(so);
1614 return (false);
1615 }
1616 mb = NULL;
1617 sb = &so->so_rcv;
1618 SOCKBUF_LOCK(sb);
1619 if (sb->sb_fnrdy != NULL) {
1620 for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) {
1621 if (mb == m) {
1622 *errorp = sbready(sb, m, count);
1623 break;
1624 }
1625 mb = mb->m_next;
1626 if (mb == NULL) {
1627 mb = n;
1628 if (mb != NULL)
1629 n = mb->m_nextpkt;
1630 }
1631 }
1632 }
1633 SOCKBUF_UNLOCK(sb);
1634 SOCK_UNLOCK(so);
1635 return (mb != NULL);
1636 }
1637
1638 static int
1639 uipc_ready(struct socket *so, struct mbuf *m, int count)
1640 {
1641 struct unpcb *unp, *unp2;
1642 struct socket *so2;
1643 int error, i;
1644
1645 unp = sotounpcb(so);
1646
1647 KASSERT(so->so_type == SOCK_STREAM,
1648 ("%s: unexpected socket type for %p", __func__, so));
1649
1650 UNP_PCB_LOCK(unp);
1651 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
1652 UNP_PCB_UNLOCK(unp);
1653 so2 = unp2->unp_socket;
1654 SOCKBUF_LOCK(&so2->so_rcv);
1655 if ((error = sbready(&so2->so_rcv, m, count)) == 0)
1656 sorwakeup_locked(so2);
1657 else
1658 SOCKBUF_UNLOCK(&so2->so_rcv);
1659 UNP_PCB_UNLOCK(unp2);
1660 return (error);
1661 }
1662 UNP_PCB_UNLOCK(unp);
1663
1664 /*
1665 * The receiving socket has been disconnected, but may still be valid.
1666 * In this case, the now-ready mbufs are still present in its socket
1667 * buffer, so perform an exhaustive search before giving up and freeing
1668 * the mbufs.
1669 */
1670 UNP_LINK_RLOCK();
1671 LIST_FOREACH(unp, &unp_shead, unp_link) {
1672 if (uipc_ready_scan(unp->unp_socket, m, count, &error))
1673 break;
1674 }
1675 UNP_LINK_RUNLOCK();
1676
1677 if (unp == NULL) {
1678 for (i = 0; i < count; i++)
1679 m = m_free(m);
1680 error = ECONNRESET;
1681 }
1682 return (error);
1683 }
1684
1685 static int
1686 uipc_sense(struct socket *so, struct stat *sb)
1687 {
1688 struct unpcb *unp;
1689
1690 unp = sotounpcb(so);
1691 KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
1692
1693 sb->st_blksize = so->so_snd.sb_hiwat;
1694 sb->st_dev = NODEV;
1695 sb->st_ino = unp->unp_ino;
1696 return (0);
1697 }
1698
1699 static int
1700 uipc_shutdown(struct socket *so, enum shutdown_how how)
1701 {
1702 struct unpcb *unp = sotounpcb(so);
1703 int error;
1704
1705 SOCK_LOCK(so);
1706 if (SOLISTENING(so)) {
1707 if (how != SHUT_WR) {
1708 so->so_error = ECONNABORTED;
1709 solisten_wakeup(so); /* unlocks so */
1710 } else
1711 SOCK_UNLOCK(so);
1712 return (ENOTCONN);
1713 } else if ((so->so_state &
1714 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
1715 /*
1716 * POSIX mandates us to just return ENOTCONN when shutdown(2) is
1717 		 * invoked on a datagram socket; however, historically we would
1718 		 * actually tear the socket down. This is known to be leveraged by
1719 		 * some applications to unblock a process waiting in recv(2) by
1720 		 * another process that it shares the socket with. Try to meet
1721 * both backward-compatibility and POSIX requirements by forcing
1722 * ENOTCONN but still flushing buffers and performing wakeup(9).
1723 *
1724 		 * XXXGL: it remains unknown which applications expect this
1725 		 * behavior and whether it is isolated to unix/dgram, inet/dgram,
1726 		 * or both. See: D10351, D3039.
1727 */
1728 error = ENOTCONN;
1729 if (so->so_type != SOCK_DGRAM) {
1730 SOCK_UNLOCK(so);
1731 return (error);
1732 }
1733 } else
1734 error = 0;
1735 SOCK_UNLOCK(so);
1736
1737 switch (how) {
1738 case SHUT_RD:
1739 socantrcvmore(so);
1740 unp_dispose(so);
1741 break;
1742 case SHUT_RDWR:
1743 socantrcvmore(so);
1744 unp_dispose(so);
1745 /* FALLTHROUGH */
1746 case SHUT_WR:
1747 UNP_PCB_LOCK(unp);
1748 socantsendmore(so);
1749 unp_shutdown(unp);
1750 UNP_PCB_UNLOCK(unp);
1751 }
1752 wakeup(&so->so_timeo);
1753
1754 return (error);
1755 }
1756
1757 static int
1758 uipc_sockaddr(struct socket *so, struct sockaddr *ret)
1759 {
1760 struct unpcb *unp;
1761 const struct sockaddr *sa;
1762
1763 unp = sotounpcb(so);
1764 KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
1765
1766 UNP_PCB_LOCK(unp);
1767 if (unp->unp_addr != NULL)
1768 sa = (struct sockaddr *) unp->unp_addr;
1769 else
1770 sa = &sun_noname;
1771 bcopy(sa, ret, sa->sa_len);
1772 UNP_PCB_UNLOCK(unp);
1773 return (0);
1774 }
1775
1776 static int
1777 uipc_ctloutput(struct socket *so, struct sockopt *sopt)
1778 {
1779 struct unpcb *unp;
1780 struct xucred xu;
1781 int error, optval;
1782
1783 if (sopt->sopt_level != SOL_LOCAL)
1784 return (EINVAL);
1785
1786 unp = sotounpcb(so);
1787 KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
1788 error = 0;
1789 switch (sopt->sopt_dir) {
1790 case SOPT_GET:
1791 switch (sopt->sopt_name) {
1792 case LOCAL_PEERCRED:
1793 UNP_PCB_LOCK(unp);
1794 if (unp->unp_flags & UNP_HAVEPC)
1795 xu = unp->unp_peercred;
1796 else {
1797 if (so->so_proto->pr_flags & PR_CONNREQUIRED)
1798 error = ENOTCONN;
1799 else
1800 error = EINVAL;
1801 }
1802 UNP_PCB_UNLOCK(unp);
1803 if (error == 0)
1804 error = sooptcopyout(sopt, &xu, sizeof(xu));
1805 break;
1806
1807 case LOCAL_CREDS:
1808 /* Unlocked read. */
1809 optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0;
1810 error = sooptcopyout(sopt, &optval, sizeof(optval));
1811 break;
1812
1813 case LOCAL_CREDS_PERSISTENT:
1814 /* Unlocked read. */
1815 optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0;
1816 error = sooptcopyout(sopt, &optval, sizeof(optval));
1817 break;
1818
1819 default:
1820 error = EOPNOTSUPP;
1821 break;
1822 }
1823 break;
1824
1825 case SOPT_SET:
1826 switch (sopt->sopt_name) {
1827 case LOCAL_CREDS:
1828 case LOCAL_CREDS_PERSISTENT:
1829 error = sooptcopyin(sopt, &optval, sizeof(optval),
1830 sizeof(optval));
1831 if (error)
1832 break;
1833
1834 #define OPTSET(bit, exclusive) do { \
1835 UNP_PCB_LOCK(unp); \
1836 if (optval) { \
1837 if ((unp->unp_flags & (exclusive)) != 0) { \
1838 UNP_PCB_UNLOCK(unp); \
1839 error = EINVAL; \
1840 break; \
1841 } \
1842 unp->unp_flags |= (bit); \
1843 } else \
1844 unp->unp_flags &= ~(bit); \
1845 UNP_PCB_UNLOCK(unp); \
1846 } while (0)
1847
1848 switch (sopt->sopt_name) {
1849 case LOCAL_CREDS:
1850 OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS);
1851 break;
1852
1853 case LOCAL_CREDS_PERSISTENT:
1854 OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT);
1855 break;
1856
1857 default:
1858 break;
1859 }
1860 break;
1861 #undef OPTSET
1862 default:
1863 error = ENOPROTOOPT;
1864 break;
1865 }
1866 break;
1867
1868 default:
1869 error = EOPNOTSUPP;
1870 break;
1871 }
1872 return (error);
1873 }
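/*
 * An illustrative sketch of the LOCAL_PEERCRED path above: a connected
 * stream peer queries the credentials that were cached at connect(2) or
 * listen(2) time.
 *
 *	#include <sys/ucred.h>
 *
 *	struct xucred xu;
 *	socklen_t len = sizeof(xu);
 *
 *	if (getsockopt(s, SOL_LOCAL, LOCAL_PEERCRED, &xu, &len) == 0 &&
 *	    xu.cr_version == XUCRED_VERSION)
 *		printf("peer euid %u pid %d\n", xu.cr_uid, (int)xu.cr_pid);
 */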
1874
1875 static int
1876 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
1877 {
1878
1879 return (unp_connectat(AT_FDCWD, so, nam, td, false));
1880 }
1881
1882 static int
1883 unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
1884 struct thread *td, bool return_locked)
1885 {
1886 struct mtx *vplock;
1887 struct sockaddr_un *soun;
1888 struct vnode *vp;
1889 struct socket *so2;
1890 struct unpcb *unp, *unp2, *unp3;
1891 struct nameidata nd;
1892 char buf[SOCK_MAXADDRLEN];
1893 struct sockaddr *sa;
1894 cap_rights_t rights;
1895 int error, len;
1896 bool connreq;
1897
1898 CURVNET_ASSERT_SET();
1899
1900 if (nam->sa_family != AF_UNIX)
1901 return (EAFNOSUPPORT);
1902 if (nam->sa_len > sizeof(struct sockaddr_un))
1903 return (EINVAL);
1904 len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
1905 if (len <= 0)
1906 return (EINVAL);
1907 soun = (struct sockaddr_un *)nam;
1908 bcopy(soun->sun_path, buf, len);
1909 buf[len] = 0;
1910
1911 error = 0;
1912 unp = sotounpcb(so);
1913 UNP_PCB_LOCK(unp);
1914 for (;;) {
1915 /*
1916 * Wait for connection state to stabilize. If a connection
1917 * already exists, give up. For datagram sockets, which permit
1918 * multiple consecutive connect(2) calls, upper layers are
1919 * responsible for disconnecting in advance of a subsequent
1920 * connect(2), but this is not synchronized with PCB connection
1921 * state.
1922 *
1923 * Also make sure that no threads are currently attempting to
1924 * lock the peer socket, to ensure that unp_conn cannot
1925 * transition between two valid sockets while locks are dropped.
1926 */
1927 if (SOLISTENING(so))
1928 error = EOPNOTSUPP;
1929 else if (unp->unp_conn != NULL)
1930 error = EISCONN;
1931 else if ((unp->unp_flags & UNP_CONNECTING) != 0) {
1932 error = EALREADY;
1933 }
1934 if (error != 0) {
1935 UNP_PCB_UNLOCK(unp);
1936 return (error);
1937 }
1938 if (unp->unp_pairbusy > 0) {
1939 unp->unp_flags |= UNP_WAITING;
1940 mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0);
1941 continue;
1942 }
1943 break;
1944 }
1945 unp->unp_flags |= UNP_CONNECTING;
1946 UNP_PCB_UNLOCK(unp);
1947
1948 connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0;
1949 if (connreq)
1950 sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
1951 else
1952 sa = NULL;
1953 NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
1954 UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT));
1955 error = namei(&nd);
1956 if (error)
1957 vp = NULL;
1958 else
1959 vp = nd.ni_vp;
1960 ASSERT_VOP_LOCKED(vp, "unp_connect");
1961 if (error)
1962 goto bad;
1963 NDFREE_PNBUF(&nd);
1964
1965 if (vp->v_type != VSOCK) {
1966 error = ENOTSOCK;
1967 goto bad;
1968 }
1969 #ifdef MAC
1970 error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
1971 if (error)
1972 goto bad;
1973 #endif
1974 error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
1975 if (error)
1976 goto bad;
1977
1978 unp = sotounpcb(so);
1979 KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
1980
1981 vplock = mtx_pool_find(unp_vp_mtxpool, vp);
1982 mtx_lock(vplock);
1983 VOP_UNP_CONNECT(vp, &unp2);
1984 if (unp2 == NULL) {
1985 error = ECONNREFUSED;
1986 goto bad2;
1987 }
1988 so2 = unp2->unp_socket;
1989 if (so->so_type != so2->so_type) {
1990 error = EPROTOTYPE;
1991 goto bad2;
1992 }
1993 if (connreq) {
1994 if (SOLISTENING(so2))
1995 so2 = sonewconn(so2, 0);
1996 else
1997 so2 = NULL;
1998 if (so2 == NULL) {
1999 error = ECONNREFUSED;
2000 goto bad2;
2001 }
2002 unp3 = sotounpcb(so2);
2003 unp_pcb_lock_pair(unp2, unp3);
2004 if (unp2->unp_addr != NULL) {
2005 bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
2006 unp3->unp_addr = (struct sockaddr_un *) sa;
2007 sa = NULL;
2008 }
2009
2010 unp_copy_peercred(td, unp3, unp, unp2);
2011
2012 UNP_PCB_UNLOCK(unp2);
2013 unp2 = unp3;
2014
2015 /*
2016 * It is safe to block on the PCB lock here since unp2 is
2017 * nascent and cannot be connected to any other sockets.
2018 */
2019 UNP_PCB_LOCK(unp);
2020 #ifdef MAC
2021 mac_socketpeer_set_from_socket(so, so2);
2022 mac_socketpeer_set_from_socket(so2, so);
2023 #endif
2024 } else {
2025 unp_pcb_lock_pair(unp, unp2);
2026 }
2027 KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
2028 sotounpcb(so2) == unp2,
2029 ("%s: unp2 %p so2 %p", __func__, unp2, so2));
2030 unp_connect2(so, so2);
2031 KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
2032 ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
2033 unp->unp_flags &= ~UNP_CONNECTING;
2034 if (!return_locked)
2035 unp_pcb_unlock_pair(unp, unp2);
2036 bad2:
2037 mtx_unlock(vplock);
2038 bad:
2039 if (vp != NULL) {
2040 /*
2041 * If we are returning locked (called via uipc_sosend_dgram()),
2042 * we need to be sure that vput() won't sleep. This is
2043 		 * guaranteed by the VOP_UNP_CONNECT() call above and the unp2 lock.
2044 * SOCK_STREAM/SEQPACKET can't request return_locked (yet).
2045 */
2046 MPASS(!(return_locked && connreq));
2047 vput(vp);
2048 }
2049 free(sa, M_SONAME);
2050 if (__predict_false(error)) {
2051 UNP_PCB_LOCK(unp);
2052 KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
2053 ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
2054 unp->unp_flags &= ~UNP_CONNECTING;
2055 UNP_PCB_UNLOCK(unp);
2056 }
2057 return (error);
2058 }
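/*
 * An illustrative client-side sketch of the address checks at the top of
 * unp_connectat() (the path name is hypothetical).  Note that the path is
 * not NUL-terminated by the kernel; its length is derived from sa_len.
 *
 *	struct sockaddr_un sun;
 *	int s;
 *
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_UNIX;
 *	strlcpy(sun.sun_path, "/var/run/svc.sock", sizeof(sun.sun_path));
 *	sun.sun_len = SUN_LEN(&sun);
 *	s = socket(AF_UNIX, SOCK_STREAM, 0);
 *	if (connect(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) == -1)
 *		err(1, "connect");
 */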
2059
2060 /*
2061 * Set socket peer credentials at connection time.
2062 *
2063 * The client's PCB credentials are copied from its process structure. The
2064 * server's PCB credentials are copied from the socket on which it called
2065 * listen(2). uipc_listen cached that process's credentials at the time.
2066 */
2067 void
2068 unp_copy_peercred(struct thread *td, struct unpcb *client_unp,
2069 struct unpcb *server_unp, struct unpcb *listen_unp)
2070 {
2071 cru2xt(td, &client_unp->unp_peercred);
2072 client_unp->unp_flags |= UNP_HAVEPC;
2073
2074 memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred,
2075 sizeof(server_unp->unp_peercred));
2076 server_unp->unp_flags |= UNP_HAVEPC;
2077 client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK);
2078 }
2079
2080 static void
2081 unp_connect2(struct socket *so, struct socket *so2)
2082 {
2083 struct unpcb *unp;
2084 struct unpcb *unp2;
2085
2086 MPASS(so2->so_type == so->so_type);
2087 unp = sotounpcb(so);
2088 KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
2089 unp2 = sotounpcb(so2);
2090 KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
2091
2092 UNP_PCB_LOCK_ASSERT(unp);
2093 UNP_PCB_LOCK_ASSERT(unp2);
2094 KASSERT(unp->unp_conn == NULL,
2095 ("%s: socket %p is already connected", __func__, unp));
2096
2097 unp->unp_conn = unp2;
2098 unp_pcb_hold(unp2);
2099 unp_pcb_hold(unp);
2100 switch (so->so_type) {
2101 case SOCK_DGRAM:
2102 UNP_REF_LIST_LOCK();
2103 LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
2104 UNP_REF_LIST_UNLOCK();
2105 soisconnected(so);
2106 break;
2107
2108 case SOCK_STREAM:
2109 case SOCK_SEQPACKET:
2110 KASSERT(unp2->unp_conn == NULL,
2111 ("%s: socket %p is already connected", __func__, unp2));
2112 unp2->unp_conn = unp;
2113 soisconnected(so);
2114 soisconnected(so2);
2115 break;
2116
2117 default:
2118 panic("unp_connect2");
2119 }
2120 }
2121
2122 static void
2123 unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
2124 {
2125 struct socket *so, *so2;
2126 struct mbuf *m = NULL;
2127 #ifdef INVARIANTS
2128 struct unpcb *unptmp;
2129 #endif
2130
2131 UNP_PCB_LOCK_ASSERT(unp);
2132 UNP_PCB_LOCK_ASSERT(unp2);
2133 KASSERT(unp->unp_conn == unp2,
2134 ("%s: unpcb %p is not connected to %p", __func__, unp, unp2));
2135
2136 unp->unp_conn = NULL;
2137 so = unp->unp_socket;
2138 so2 = unp2->unp_socket;
2139 switch (unp->unp_socket->so_type) {
2140 case SOCK_DGRAM:
2141 /*
2142 * Remove our send socket buffer from the peer's receive buffer.
2143 * Move the data to the receive buffer only if it is empty.
2144 * This is a protection against a scenario where a peer
2145 * connects, floods and disconnects, effectively blocking
2146 * sendto() from unconnected sockets.
2147 */
2148 SOCK_RECVBUF_LOCK(so2);
2149 if (!STAILQ_EMPTY(&so->so_snd.uxdg_mb)) {
2150 TAILQ_REMOVE(&so2->so_rcv.uxdg_conns, &so->so_snd,
2151 uxdg_clist);
2152 if (__predict_true((so2->so_rcv.sb_state &
2153 SBS_CANTRCVMORE) == 0) &&
2154 STAILQ_EMPTY(&so2->so_rcv.uxdg_mb)) {
2155 STAILQ_CONCAT(&so2->so_rcv.uxdg_mb,
2156 &so->so_snd.uxdg_mb);
2157 so2->so_rcv.uxdg_cc += so->so_snd.uxdg_cc;
2158 so2->so_rcv.uxdg_ctl += so->so_snd.uxdg_ctl;
2159 so2->so_rcv.uxdg_mbcnt += so->so_snd.uxdg_mbcnt;
2160 } else {
2161 m = STAILQ_FIRST(&so->so_snd.uxdg_mb);
2162 STAILQ_INIT(&so->so_snd.uxdg_mb);
2163 so2->so_rcv.sb_acc -= so->so_snd.uxdg_cc;
2164 so2->so_rcv.sb_ccc -= so->so_snd.uxdg_cc;
2165 so2->so_rcv.sb_ctl -= so->so_snd.uxdg_ctl;
2166 so2->so_rcv.sb_mbcnt -= so->so_snd.uxdg_mbcnt;
2167 }
2168 /* Note: so may reconnect. */
2169 so->so_snd.uxdg_cc = 0;
2170 so->so_snd.uxdg_ctl = 0;
2171 so->so_snd.uxdg_mbcnt = 0;
2172 }
2173 SOCK_RECVBUF_UNLOCK(so2);
2174 UNP_REF_LIST_LOCK();
2175 #ifdef INVARIANTS
2176 LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) {
2177 if (unptmp == unp)
2178 break;
2179 }
2180 KASSERT(unptmp != NULL,
2181 ("%s: %p not found in reflist of %p", __func__, unp, unp2));
2182 #endif
2183 LIST_REMOVE(unp, unp_reflink);
2184 UNP_REF_LIST_UNLOCK();
2185 if (so) {
2186 SOCK_LOCK(so);
2187 so->so_state &= ~SS_ISCONNECTED;
2188 SOCK_UNLOCK(so);
2189 }
2190 break;
2191
2192 case SOCK_STREAM:
2193 case SOCK_SEQPACKET:
2194 if (so)
2195 soisdisconnected(so);
2196 MPASS(unp2->unp_conn == unp);
2197 unp2->unp_conn = NULL;
2198 if (so2)
2199 soisdisconnected(so2);
2200 break;
2201 }
2202
2203 if (unp == unp2) {
2204 unp_pcb_rele_notlast(unp);
2205 if (!unp_pcb_rele(unp))
2206 UNP_PCB_UNLOCK(unp);
2207 } else {
2208 if (!unp_pcb_rele(unp))
2209 UNP_PCB_UNLOCK(unp);
2210 if (!unp_pcb_rele(unp2))
2211 UNP_PCB_UNLOCK(unp2);
2212 }
2213
2214 if (m != NULL) {
2215 unp_scan(m, unp_freerights);
2216 m_freemp(m);
2217 }
2218 }
2219
2220 /*
2221 * unp_pcblist() walks the global list of struct unpcb's to generate a
2222 * pointer list, bumping the refcount on each unpcb. It then copies them out
2223 * sequentially, validating the generation number on each to see if it has
2224 * been detached. All of this is necessary because copyout() may sleep on
2225 * disk I/O.
2226 */
2227 static int
2228 unp_pcblist(SYSCTL_HANDLER_ARGS)
2229 {
2230 struct unpcb *unp, **unp_list;
2231 unp_gen_t gencnt;
2232 struct xunpgen *xug;
2233 struct unp_head *head;
2234 struct xunpcb *xu;
2235 u_int i;
2236 int error, n;
2237
2238 switch ((intptr_t)arg1) {
2239 case SOCK_STREAM:
2240 head = &unp_shead;
2241 break;
2242
2243 case SOCK_DGRAM:
2244 head = &unp_dhead;
2245 break;
2246
2247 case SOCK_SEQPACKET:
2248 head = &unp_sphead;
2249 break;
2250
2251 default:
2252 panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
2253 }
2254
2255 /*
2256 * The process of preparing the PCB list is too time-consuming and
2257 	 * resource-intensive to perform twice on every request.
2258 */
2259 if (req->oldptr == NULL) {
2260 n = unp_count;
2261 req->oldidx = 2 * (sizeof *xug)
2262 + (n + n/8) * sizeof(struct xunpcb);
2263 return (0);
2264 }
2265
2266 if (req->newptr != NULL)
2267 return (EPERM);
2268
2269 /*
2270 * OK, now we're committed to doing something.
2271 */
2272 xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO);
2273 UNP_LINK_RLOCK();
2274 gencnt = unp_gencnt;
2275 n = unp_count;
2276 UNP_LINK_RUNLOCK();
2277
2278 xug->xug_len = sizeof *xug;
2279 xug->xug_count = n;
2280 xug->xug_gen = gencnt;
2281 xug->xug_sogen = so_gencnt;
2282 error = SYSCTL_OUT(req, xug, sizeof *xug);
2283 if (error) {
2284 free(xug, M_TEMP);
2285 return (error);
2286 }
2287
2288 unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
2289
2290 UNP_LINK_RLOCK();
2291 for (unp = LIST_FIRST(head), i = 0; unp && i < n;
2292 unp = LIST_NEXT(unp, unp_link)) {
2293 UNP_PCB_LOCK(unp);
2294 if (unp->unp_gencnt <= gencnt) {
2295 if (cr_cansee(req->td->td_ucred,
2296 unp->unp_socket->so_cred)) {
2297 UNP_PCB_UNLOCK(unp);
2298 continue;
2299 }
2300 unp_list[i++] = unp;
2301 unp_pcb_hold(unp);
2302 }
2303 UNP_PCB_UNLOCK(unp);
2304 }
2305 UNP_LINK_RUNLOCK();
2306 n = i; /* In case we lost some during malloc. */
2307
2308 error = 0;
2309 xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
2310 for (i = 0; i < n; i++) {
2311 unp = unp_list[i];
2312 UNP_PCB_LOCK(unp);
2313 if (unp_pcb_rele(unp))
2314 continue;
2315
2316 if (unp->unp_gencnt <= gencnt) {
2317 xu->xu_len = sizeof *xu;
2318 xu->xu_unpp = (uintptr_t)unp;
2319 /*
2320 * XXX - need more locking here to protect against
2321 * connect/disconnect races for SMP.
2322 */
2323 if (unp->unp_addr != NULL)
2324 bcopy(unp->unp_addr, &xu->xu_addr,
2325 unp->unp_addr->sun_len);
2326 else
2327 bzero(&xu->xu_addr, sizeof(xu->xu_addr));
2328 if (unp->unp_conn != NULL &&
2329 unp->unp_conn->unp_addr != NULL)
2330 bcopy(unp->unp_conn->unp_addr,
2331 &xu->xu_caddr,
2332 unp->unp_conn->unp_addr->sun_len);
2333 else
2334 bzero(&xu->xu_caddr, sizeof(xu->xu_caddr));
2335 xu->unp_vnode = (uintptr_t)unp->unp_vnode;
2336 xu->unp_conn = (uintptr_t)unp->unp_conn;
2337 xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs);
2338 xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink);
2339 xu->unp_gencnt = unp->unp_gencnt;
2340 sotoxsocket(unp->unp_socket, &xu->xu_socket);
2341 UNP_PCB_UNLOCK(unp);
2342 error = SYSCTL_OUT(req, xu, sizeof *xu);
2343 } else {
2344 UNP_PCB_UNLOCK(unp);
2345 }
2346 }
2347 free(xu, M_TEMP);
2348 if (!error) {
2349 /*
2350 * Give the user an updated idea of our state. If the
2351 * generation differs from what we told her before, she knows
2352 * that something happened while we were processing this
2353 * request, and it might be necessary to retry.
2354 */
2355 xug->xug_gen = unp_gencnt;
2356 xug->xug_sogen = so_gencnt;
2357 xug->xug_count = unp_count;
2358 error = SYSCTL_OUT(req, xug, sizeof *xug);
2359 }
2360 free(unp_list, M_TEMP);
2361 free(xug, M_TEMP);
2362 return (error);
2363 }
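/*
 * An illustrative userland consumer of this sysctl, in the spirit of
 * netstat(1): the export is a leading struct xunpgen, then struct xunpcb
 * entries, then a trailing struct xunpgen whose smaller length terminates
 * the walk.  The sysctl names are declared just below.
 *
 *	char *buf, *p;
 *	size_t len;
 *	struct xunpcb *xu;
 *
 *	sysctlbyname("net.local.stream.pcblist", NULL, &len, NULL, 0);
 *	buf = malloc(len);
 *	sysctlbyname("net.local.stream.pcblist", buf, &len, NULL, 0);
 *	p = buf + ((struct xunpgen *)buf)->xug_len;
 *	for (; p + sizeof(*xu) <= buf + len; p += xu->xu_len) {
 *		xu = (struct xunpcb *)p;
 *		if (xu->xu_len != sizeof(*xu))
 *			break;
 *		... inspect xu->xu_addr, xu->xu_socket ...
 *	}
 *	free(buf);
 */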
2364
2365 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist,
2366 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
2367 (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
2368 "List of active local datagram sockets");
2369 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist,
2370 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
2371 (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
2372 "List of active local stream sockets");
2373 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
2374 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
2375 (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
2376 "List of active local seqpacket sockets");
2377
2378 static void
2379 unp_shutdown(struct unpcb *unp)
2380 {
2381 struct unpcb *unp2;
2382 struct socket *so;
2383
2384 UNP_PCB_LOCK_ASSERT(unp);
2385
2386 unp2 = unp->unp_conn;
2387 if ((unp->unp_socket->so_type == SOCK_STREAM ||
2388 (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) {
2389 so = unp2->unp_socket;
2390 if (so != NULL)
2391 socantrcvmore(so);
2392 }
2393 }
2394
2395 static void
2396 unp_drop(struct unpcb *unp)
2397 {
2398 struct socket *so;
2399 struct unpcb *unp2;
2400
2401 /*
2402 * Regardless of whether the socket's peer dropped the connection
2403 * with this socket by aborting or disconnecting, POSIX requires
2404 * that ECONNRESET is returned.
2405 */
2406
2407 UNP_PCB_LOCK(unp);
2408 so = unp->unp_socket;
2409 if (so)
2410 so->so_error = ECONNRESET;
2411 if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
2412 /* Last reference dropped in unp_disconnect(). */
2413 unp_pcb_rele_notlast(unp);
2414 unp_disconnect(unp, unp2);
2415 } else if (!unp_pcb_rele(unp)) {
2416 UNP_PCB_UNLOCK(unp);
2417 }
2418 }
2419
2420 static void
2421 unp_freerights(struct filedescent **fdep, int fdcount)
2422 {
2423 struct file *fp;
2424 int i;
2425
2426 KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
2427
2428 for (i = 0; i < fdcount; i++) {
2429 fp = fdep[i]->fde_file;
2430 filecaps_free(&fdep[i]->fde_caps);
2431 unp_discard(fp);
2432 }
2433 free(fdep[0], M_FILECAPS);
2434 }
2435
2436 static int
2437 unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
2438 {
2439 struct thread *td = curthread; /* XXX */
2440 struct cmsghdr *cm = mtod(control, struct cmsghdr *);
2441 int i;
2442 int *fdp;
2443 struct filedesc *fdesc = td->td_proc->p_fd;
2444 struct filedescent **fdep;
2445 void *data;
2446 socklen_t clen = control->m_len, datalen;
2447 int error, newfds;
2448 u_int newlen;
2449
2450 UNP_LINK_UNLOCK_ASSERT();
2451
2452 error = 0;
2453 if (controlp != NULL) /* controlp == NULL => free control messages */
2454 *controlp = NULL;
2455 while (cm != NULL) {
2456 MPASS(clen >= sizeof(*cm) && clen >= cm->cmsg_len);
2457
2458 data = CMSG_DATA(cm);
2459 datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
2460 if (cm->cmsg_level == SOL_SOCKET
2461 && cm->cmsg_type == SCM_RIGHTS) {
2462 newfds = datalen / sizeof(*fdep);
2463 if (newfds == 0)
2464 goto next;
2465 fdep = data;
2466
2467 			/* If we're not outputting the descriptors, free them. */
2468 if (error || controlp == NULL) {
2469 unp_freerights(fdep, newfds);
2470 goto next;
2471 }
2472 FILEDESC_XLOCK(fdesc);
2473
2474 /*
2475 * Now change each pointer to an fd in the global
2476 * table to an integer that is the index to the local
2477 * fd table entry that we set up to point to the
2478 * global one we are transferring.
2479 */
2480 newlen = newfds * sizeof(int);
2481 *controlp = sbcreatecontrol(NULL, newlen,
2482 SCM_RIGHTS, SOL_SOCKET, M_WAITOK);
2483
2484 fdp = (int *)
2485 CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2486 if ((error = fdallocn(td, 0, fdp, newfds))) {
2487 FILEDESC_XUNLOCK(fdesc);
2488 unp_freerights(fdep, newfds);
2489 m_freem(*controlp);
2490 *controlp = NULL;
2491 goto next;
2492 }
2493 for (i = 0; i < newfds; i++, fdp++) {
2494 _finstall(fdesc, fdep[i]->fde_file, *fdp,
2495 (flags & MSG_CMSG_CLOEXEC) != 0 ? O_CLOEXEC : 0,
2496 &fdep[i]->fde_caps);
2497 unp_externalize_fp(fdep[i]->fde_file);
2498 }
2499
2500 /*
2501 * The new type indicates that the mbuf data refers to
2502 * kernel resources that may need to be released before
2503 * the mbuf is freed.
2504 */
2505 m_chtype(*controlp, MT_EXTCONTROL);
2506 FILEDESC_XUNLOCK(fdesc);
2507 free(fdep[0], M_FILECAPS);
2508 } else {
2509 /* We can just copy anything else across. */
2510 if (error || controlp == NULL)
2511 goto next;
2512 *controlp = sbcreatecontrol(NULL, datalen,
2513 cm->cmsg_type, cm->cmsg_level, M_WAITOK);
2514 bcopy(data,
2515 CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
2516 datalen);
2517 }
2518 controlp = &(*controlp)->m_next;
2519
2520 next:
2521 if (CMSG_SPACE(datalen) < clen) {
2522 clen -= CMSG_SPACE(datalen);
2523 cm = (struct cmsghdr *)
2524 ((caddr_t)cm + CMSG_SPACE(datalen));
2525 } else {
2526 clen = 0;
2527 cm = NULL;
2528 }
2529 }
2530
2531 m_freem(control);
2532 return (error);
2533 }
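/*
 * An illustrative sketch of the receiving side of the externalize step
 * above.  MSG_CMSG_CLOEXEC maps to the O_CLOEXEC flag passed to
 * _finstall() for every installed descriptor.
 *
 *	char b, cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = &b, .iov_len = 1 };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cm;
 *	int fd = -1;
 *
 *	if (recvmsg(s, &msg, MSG_CMSG_CLOEXEC) >= 0)
 *		for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
 *		    cm = CMSG_NXTHDR(&msg, cm))
 *			if (cm->cmsg_level == SOL_SOCKET &&
 *			    cm->cmsg_type == SCM_RIGHTS)
 *				memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
 */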
2534
2535 static void
2536 unp_zone_change(void *tag)
2537 {
2538
2539 uma_zone_set_max(unp_zone, maxsockets);
2540 }
2541
2542 #ifdef INVARIANTS
2543 static void
2544 unp_zdtor(void *mem, int size __unused, void *arg __unused)
2545 {
2546 struct unpcb *unp;
2547
2548 unp = mem;
2549
2550 KASSERT(LIST_EMPTY(&unp->unp_refs),
2551 ("%s: unpcb %p has lingering refs", __func__, unp));
2552 KASSERT(unp->unp_socket == NULL,
2553 ("%s: unpcb %p has socket backpointer", __func__, unp));
2554 KASSERT(unp->unp_vnode == NULL,
2555 ("%s: unpcb %p has vnode references", __func__, unp));
2556 KASSERT(unp->unp_conn == NULL,
2557 ("%s: unpcb %p is still connected", __func__, unp));
2558 KASSERT(unp->unp_addr == NULL,
2559 ("%s: unpcb %p has leaked addr", __func__, unp));
2560 }
2561 #endif
2562
2563 static void
2564 unp_init(void *arg __unused)
2565 {
2566 uma_dtor dtor;
2567
2568 #ifdef INVARIANTS
2569 dtor = unp_zdtor;
2570 #else
2571 dtor = NULL;
2572 #endif
2573 unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, dtor,
2574 NULL, NULL, UMA_ALIGN_CACHE, 0);
2575 uma_zone_set_max(unp_zone, maxsockets);
2576 uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
2577 EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
2578 NULL, EVENTHANDLER_PRI_ANY);
2579 LIST_INIT(&unp_dhead);
2580 LIST_INIT(&unp_shead);
2581 LIST_INIT(&unp_sphead);
2582 SLIST_INIT(&unp_defers);
2583 TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
2584 TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
2585 UNP_LINK_LOCK_INIT();
2586 UNP_DEFERRED_LOCK_INIT();
2587 unp_vp_mtxpool = mtx_pool_create("unp vp mtxpool", 32, MTX_DEF);
2588 }
2589 SYSINIT(unp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, unp_init, NULL);
2590
2591 static void
2592 unp_internalize_cleanup_rights(struct mbuf *control)
2593 {
2594 struct cmsghdr *cp;
2595 struct mbuf *m;
2596 void *data;
2597 socklen_t datalen;
2598
2599 for (m = control; m != NULL; m = m->m_next) {
2600 cp = mtod(m, struct cmsghdr *);
2601 if (cp->cmsg_level != SOL_SOCKET ||
2602 cp->cmsg_type != SCM_RIGHTS)
2603 continue;
2604 data = CMSG_DATA(cp);
2605 datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data;
2606 unp_freerights(data, datalen / sizeof(struct filedesc *));
2607 }
2608 }
2609
2610 static int
2611 unp_internalize(struct mbuf **controlp, struct thread *td,
2612 struct mbuf **clast, u_int *space, u_int *mbcnt)
2613 {
2614 struct mbuf *control, **initial_controlp;
2615 struct proc *p;
2616 struct filedesc *fdesc;
2617 struct bintime *bt;
2618 struct cmsghdr *cm;
2619 struct cmsgcred *cmcred;
2620 struct filedescent *fde, **fdep, *fdev;
2621 struct file *fp;
2622 struct timeval *tv;
2623 struct timespec *ts;
2624 void *data;
2625 socklen_t clen, datalen;
2626 int i, j, error, *fdp, oldfds;
2627 u_int newlen;
2628
2629 MPASS((*controlp)->m_next == NULL); /* COMPAT_OLDSOCK may violate */
2630 UNP_LINK_UNLOCK_ASSERT();
2631
2632 p = td->td_proc;
2633 fdesc = p->p_fd;
2634 error = 0;
2635 control = *controlp;
2636 *controlp = NULL;
2637 initial_controlp = controlp;
2638 for (clen = control->m_len, cm = mtod(control, struct cmsghdr *),
2639 data = CMSG_DATA(cm);
2640
2641 clen >= sizeof(*cm) && cm->cmsg_level == SOL_SOCKET &&
2642 clen >= cm->cmsg_len && cm->cmsg_len >= sizeof(*cm) &&
2643 (char *)cm + cm->cmsg_len >= (char *)data;
2644
2645 clen -= min(CMSG_SPACE(datalen), clen),
2646 cm = (struct cmsghdr *) ((char *)cm + CMSG_SPACE(datalen)),
2647 data = CMSG_DATA(cm)) {
2648 datalen = (char *)cm + cm->cmsg_len - (char *)data;
2649 switch (cm->cmsg_type) {
2650 case SCM_CREDS:
2651 *controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
2652 SCM_CREDS, SOL_SOCKET, M_WAITOK);
2653 cmcred = (struct cmsgcred *)
2654 CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2655 cmcred->cmcred_pid = p->p_pid;
2656 cmcred->cmcred_uid = td->td_ucred->cr_ruid;
2657 cmcred->cmcred_gid = td->td_ucred->cr_rgid;
2658 cmcred->cmcred_euid = td->td_ucred->cr_uid;
2659 cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
2660 CMGROUP_MAX);
2661 for (i = 0; i < cmcred->cmcred_ngroups; i++)
2662 cmcred->cmcred_groups[i] =
2663 td->td_ucred->cr_groups[i];
2664 break;
2665
2666 case SCM_RIGHTS:
2667 oldfds = datalen / sizeof (int);
2668 if (oldfds == 0)
2669 continue;
2670 			/* On some machines sizeof(pointer) is bigger than
2671 			 * sizeof(int), so we need to check whether the data
2672 			 * fits into a single mbuf.  We could allocate several
2673 			 * mbufs, and unp_externalize() should even handle that
2674 			 * properly.  But it is not worth complicating the code
2675 			 * for the insane scenario of passing over 200 file
2676 			 * descriptors at once.
2677 			 */
2678 newlen = oldfds * sizeof(fdep[0]);
2679 if (CMSG_SPACE(newlen) > MCLBYTES) {
2680 error = EMSGSIZE;
2681 goto out;
2682 }
2683 /*
2684 * Check that all the FDs passed in refer to legal
2685 * files. If not, reject the entire operation.
2686 */
2687 fdp = data;
2688 FILEDESC_SLOCK(fdesc);
2689 for (i = 0; i < oldfds; i++, fdp++) {
2690 fp = fget_noref(fdesc, *fdp);
2691 if (fp == NULL) {
2692 FILEDESC_SUNLOCK(fdesc);
2693 error = EBADF;
2694 goto out;
2695 }
2696 if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
2697 FILEDESC_SUNLOCK(fdesc);
2698 error = EOPNOTSUPP;
2699 goto out;
2700 }
2701 }
2702
2703 /*
2704 * Now replace the integer FDs with pointers to the
2705 * file structure and capability rights.
2706 */
2707 *controlp = sbcreatecontrol(NULL, newlen,
2708 SCM_RIGHTS, SOL_SOCKET, M_WAITOK);
2709 fdp = data;
2710 for (i = 0; i < oldfds; i++, fdp++) {
2711 if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) {
2712 fdp = data;
2713 for (j = 0; j < i; j++, fdp++) {
2714 fdrop(fdesc->fd_ofiles[*fdp].
2715 fde_file, td);
2716 }
2717 FILEDESC_SUNLOCK(fdesc);
2718 error = EBADF;
2719 goto out;
2720 }
2721 }
2722 fdp = data;
2723 fdep = (struct filedescent **)
2724 CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2725 fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
2726 M_WAITOK);
2727 for (i = 0; i < oldfds; i++, fdev++, fdp++) {
2728 fde = &fdesc->fd_ofiles[*fdp];
2729 fdep[i] = fdev;
2730 fdep[i]->fde_file = fde->fde_file;
2731 filecaps_copy(&fde->fde_caps,
2732 &fdep[i]->fde_caps, true);
2733 unp_internalize_fp(fdep[i]->fde_file);
2734 }
2735 FILEDESC_SUNLOCK(fdesc);
2736 break;
2737
2738 case SCM_TIMESTAMP:
2739 *controlp = sbcreatecontrol(NULL, sizeof(*tv),
2740 SCM_TIMESTAMP, SOL_SOCKET, M_WAITOK);
2741 tv = (struct timeval *)
2742 CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2743 microtime(tv);
2744 break;
2745
2746 case SCM_BINTIME:
2747 *controlp = sbcreatecontrol(NULL, sizeof(*bt),
2748 SCM_BINTIME, SOL_SOCKET, M_WAITOK);
2749 bt = (struct bintime *)
2750 CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2751 bintime(bt);
2752 break;
2753
2754 case SCM_REALTIME:
2755 *controlp = sbcreatecontrol(NULL, sizeof(*ts),
2756 SCM_REALTIME, SOL_SOCKET, M_WAITOK);
2757 ts = (struct timespec *)
2758 CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2759 nanotime(ts);
2760 break;
2761
2762 case SCM_MONOTONIC:
2763 *controlp = sbcreatecontrol(NULL, sizeof(*ts),
2764 SCM_MONOTONIC, SOL_SOCKET, M_WAITOK);
2765 ts = (struct timespec *)
2766 CMSG_DATA(mtod(*controlp, struct cmsghdr *));
2767 nanouptime(ts);
2768 break;
2769
2770 default:
2771 error = EINVAL;
2772 goto out;
2773 }
2774
2775 if (space != NULL) {
2776 *space += (*controlp)->m_len;
2777 *mbcnt += MSIZE;
2778 if ((*controlp)->m_flags & M_EXT)
2779 *mbcnt += (*controlp)->m_ext.ext_size;
2780 *clast = *controlp;
2781 }
2782 controlp = &(*controlp)->m_next;
2783 }
2784 if (clen > 0)
2785 error = EINVAL;
2786
2787 out:
2788 if (error != 0 && initial_controlp != NULL)
2789 unp_internalize_cleanup_rights(*initial_controlp);
2790 m_freem(control);
2791 return (error);
2792 }
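/*
 * An illustrative sketch of the sending side that feeds the internalize
 * step above, passing one descriptor fd along with one byte of data.
 *
 *	char b = 0, cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = &b, .iov_len = 1 };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd, sizeof(int));
 *	if (sendmsg(s, &msg, 0) == -1)
 *		err(1, "sendmsg");
 */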
2793
2794 static struct mbuf *
2795 unp_addsockcred(struct thread *td, struct mbuf *control, int mode,
2796 struct mbuf **clast, u_int *space, u_int *mbcnt)
2797 {
2798 struct mbuf *m, *n, *n_prev;
2799 const struct cmsghdr *cm;
2800 int ngroups, i, cmsgtype;
2801 size_t ctrlsz;
2802
2803 ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
2804 if (mode & UNP_WANTCRED_ALWAYS) {
2805 ctrlsz = SOCKCRED2SIZE(ngroups);
2806 cmsgtype = SCM_CREDS2;
2807 } else {
2808 ctrlsz = SOCKCREDSIZE(ngroups);
2809 cmsgtype = SCM_CREDS;
2810 }
2811
2812 m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET, M_NOWAIT);
2813 if (m == NULL)
2814 return (control);
2815 MPASS((m->m_flags & M_EXT) == 0 && m->m_next == NULL);
2816
2817 if (mode & UNP_WANTCRED_ALWAYS) {
2818 struct sockcred2 *sc;
2819
2820 sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
2821 sc->sc_version = 0;
2822 sc->sc_pid = td->td_proc->p_pid;
2823 sc->sc_uid = td->td_ucred->cr_ruid;
2824 sc->sc_euid = td->td_ucred->cr_uid;
2825 sc->sc_gid = td->td_ucred->cr_rgid;
2826 sc->sc_egid = td->td_ucred->cr_gid;
2827 sc->sc_ngroups = ngroups;
2828 for (i = 0; i < sc->sc_ngroups; i++)
2829 sc->sc_groups[i] = td->td_ucred->cr_groups[i];
2830 } else {
2831 struct sockcred *sc;
2832
2833 sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
2834 sc->sc_uid = td->td_ucred->cr_ruid;
2835 sc->sc_euid = td->td_ucred->cr_uid;
2836 sc->sc_gid = td->td_ucred->cr_rgid;
2837 sc->sc_egid = td->td_ucred->cr_gid;
2838 sc->sc_ngroups = ngroups;
2839 for (i = 0; i < sc->sc_ngroups; i++)
2840 sc->sc_groups[i] = td->td_ucred->cr_groups[i];
2841 }
2842
2843 /*
2844 	 * Unlink SCM_CREDS control messages (struct cmsgcred), since the
2845 	 * just-created SCM_CREDS control message (struct sockcred) has a
2846 	 * different format.
2847 */
2848 if (control != NULL && cmsgtype == SCM_CREDS)
2849 for (n = control, n_prev = NULL; n != NULL;) {
2850 cm = mtod(n, struct cmsghdr *);
2851 if (cm->cmsg_level == SOL_SOCKET &&
2852 cm->cmsg_type == SCM_CREDS) {
2853 if (n_prev == NULL)
2854 control = n->m_next;
2855 else
2856 n_prev->m_next = n->m_next;
2857 if (space != NULL) {
2858 MPASS(*space >= n->m_len);
2859 *space -= n->m_len;
2860 MPASS(*mbcnt >= MSIZE);
2861 *mbcnt -= MSIZE;
2862 if (n->m_flags & M_EXT) {
2863 MPASS(*mbcnt >=
2864 n->m_ext.ext_size);
2865 *mbcnt -= n->m_ext.ext_size;
2866 }
2867 MPASS(clast);
2868 if (*clast == n) {
2869 MPASS(n->m_next == NULL);
2870 if (n_prev == NULL)
2871 *clast = m;
2872 else
2873 *clast = n_prev;
2874 }
2875 }
2876 n = m_free(n);
2877 } else {
2878 n_prev = n;
2879 n = n->m_next;
2880 }
2881 }
2882
2883 /* Prepend it to the head. */
2884 m->m_next = control;
2885 if (space != NULL) {
2886 *space += m->m_len;
2887 *mbcnt += MSIZE;
2888 if (control == NULL)
2889 *clast = m;
2890 }
2891 return (m);
2892 }
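/*
 * An illustrative sketch of the consumer side: with LOCAL_CREDS_PERSISTENT
 * set, every receive carries an SCM_CREDS2 message holding the struct
 * sockcred2 built by unp_addsockcred() above.
 *
 *	int on = 1;
 *	struct sockcred2 *sc;
 *
 *	setsockopt(s, SOL_LOCAL, LOCAL_CREDS_PERSISTENT, &on, sizeof(on));
 *	... recvmsg() as usual, then for an SCM_CREDS2 cmsghdr cm: ...
 *	sc = (struct sockcred2 *)CMSG_DATA(cm);
 *	printf("sender pid %d euid %u\n", (int)sc->sc_pid, sc->sc_euid);
 */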
2893
2894 static struct unpcb *
2895 fptounp(struct file *fp)
2896 {
2897 struct socket *so;
2898
2899 if (fp->f_type != DTYPE_SOCKET)
2900 return (NULL);
2901 if ((so = fp->f_data) == NULL)
2902 return (NULL);
2903 if (so->so_proto->pr_domain != &localdomain)
2904 return (NULL);
2905 	return (sotounpcb(so));
2906 }
2907
2908 static void
2909 unp_discard(struct file *fp)
2910 {
2911 struct unp_defer *dr;
2912
2913 if (unp_externalize_fp(fp)) {
2914 dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK);
2915 dr->ud_fp = fp;
2916 UNP_DEFERRED_LOCK();
2917 SLIST_INSERT_HEAD(&unp_defers, dr, ud_link);
2918 UNP_DEFERRED_UNLOCK();
2919 atomic_add_int(&unp_defers_count, 1);
2920 taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
2921 } else
2922 closef_nothread(fp);
2923 }
2924
2925 static void
2926 unp_process_defers(void *arg __unused, int pending)
2927 {
2928 struct unp_defer *dr;
2929 SLIST_HEAD(, unp_defer) drl;
2930 int count;
2931
2932 SLIST_INIT(&drl);
2933 for (;;) {
2934 UNP_DEFERRED_LOCK();
2935 if (SLIST_FIRST(&unp_defers) == NULL) {
2936 UNP_DEFERRED_UNLOCK();
2937 break;
2938 }
2939 SLIST_SWAP(&unp_defers, &drl, unp_defer);
2940 UNP_DEFERRED_UNLOCK();
2941 count = 0;
2942 while ((dr = SLIST_FIRST(&drl)) != NULL) {
2943 SLIST_REMOVE_HEAD(&drl, ud_link);
2944 closef_nothread(dr->ud_fp);
2945 free(dr, M_TEMP);
2946 count++;
2947 }
2948 atomic_add_int(&unp_defers_count, -count);
2949 }
2950 }
2951
2952 static void
2953 unp_internalize_fp(struct file *fp)
2954 {
2955 struct unpcb *unp;
2956
2957 UNP_LINK_WLOCK();
2958 if ((unp = fptounp(fp)) != NULL) {
2959 unp->unp_file = fp;
2960 unp->unp_msgcount++;
2961 }
2962 unp_rights++;
2963 UNP_LINK_WUNLOCK();
2964 }
2965
2966 static int
2967 unp_externalize_fp(struct file *fp)
2968 {
2969 struct unpcb *unp;
2970 int ret;
2971
2972 UNP_LINK_WLOCK();
2973 if ((unp = fptounp(fp)) != NULL) {
2974 unp->unp_msgcount--;
2975 ret = 1;
2976 } else
2977 ret = 0;
2978 unp_rights--;
2979 UNP_LINK_WUNLOCK();
2980 return (ret);
2981 }
2982
2983 /*
2984  * unp_marked indicates whether additional work has been deferred for a
2985  * future pass through unp_gc().  It is thread local and does not require
2986  * explicit synchronization.
2987 */
2988 static int unp_marked;
2989
2990 static void
2991 unp_remove_dead_ref(struct filedescent **fdep, int fdcount)
2992 {
2993 struct unpcb *unp;
2994 struct file *fp;
2995 int i;
2996
2997 /*
2998 * This function can only be called from the gc task.
2999 */
3000 KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
3001 ("%s: not on gc callout", __func__));
3002 UNP_LINK_LOCK_ASSERT();
3003
3004 for (i = 0; i < fdcount; i++) {
3005 fp = fdep[i]->fde_file;
3006 if ((unp = fptounp(fp)) == NULL)
3007 continue;
3008 if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
3009 continue;
3010 unp->unp_gcrefs--;
3011 }
3012 }
3013
3014 static void
3015 unp_restore_undead_ref(struct filedescent **fdep, int fdcount)
3016 {
3017 struct unpcb *unp;
3018 struct file *fp;
3019 int i;
3020
3021 /*
3022 * This function can only be called from the gc task.
3023 */
3024 KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
3025 ("%s: not on gc callout", __func__));
3026 UNP_LINK_LOCK_ASSERT();
3027
3028 for (i = 0; i < fdcount; i++) {
3029 fp = fdep[i]->fde_file;
3030 if ((unp = fptounp(fp)) == NULL)
3031 continue;
3032 if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
3033 continue;
3034 unp->unp_gcrefs++;
3035 unp_marked++;
3036 }
3037 }
3038
3039 static void
3040 unp_scan_socket(struct socket *so, void (*op)(struct filedescent **, int))
3041 {
3042 struct sockbuf *sb;
3043
3044 SOCK_LOCK_ASSERT(so);
3045
3046 if (sotounpcb(so)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
3047 return;
3048
3049 SOCK_RECVBUF_LOCK(so);
3050 switch (so->so_type) {
3051 case SOCK_DGRAM:
3052 unp_scan(STAILQ_FIRST(&so->so_rcv.uxdg_mb), op);
3053 unp_scan(so->so_rcv.uxdg_peeked, op);
3054 TAILQ_FOREACH(sb, &so->so_rcv.uxdg_conns, uxdg_clist)
3055 unp_scan(STAILQ_FIRST(&sb->uxdg_mb), op);
3056 break;
3057 case SOCK_STREAM:
3058 case SOCK_SEQPACKET:
3059 unp_scan(so->so_rcv.sb_mb, op);
3060 break;
3061 }
3062 SOCK_RECVBUF_UNLOCK(so);
3063 }
3064
3065 static void
3066 unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int))
3067 {
3068 struct socket *so, *soa;
3069
3070 so = unp->unp_socket;
3071 SOCK_LOCK(so);
3072 if (SOLISTENING(so)) {
3073 /*
3074 * Mark all sockets in our accept queue.
3075 */
3076 TAILQ_FOREACH(soa, &so->sol_comp, so_list)
3077 unp_scan_socket(soa, op);
3078 } else {
3079 /*
3080 * Mark all sockets we reference with RIGHTS.
3081 */
3082 unp_scan_socket(so, op);
3083 }
3084 SOCK_UNLOCK(so);
3085 }
3086
3087 static int unp_recycled;
3088 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
3089 "Number of unreachable sockets claimed by the garbage collector.");
3090
3091 static int unp_taskcount;
3092 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
3093 "Number of times the garbage collector has run.");
3094
3095 SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0,
3096 "Number of active local sockets.");
3097
3098 static void
3099 unp_gc(__unused void *arg, int pending)
3100 {
3101 struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
3102 NULL };
3103 struct unp_head **head;
3104 struct unp_head unp_deadhead; /* List of potentially-dead sockets. */
3105 struct file *f, **unref;
3106 struct unpcb *unp, *unptmp;
3107 int i, total, unp_unreachable;
3108
3109 LIST_INIT(&unp_deadhead);
3110 unp_taskcount++;
3111 UNP_LINK_RLOCK();
3112 /*
3113 * First determine which sockets may be in cycles.
3114 */
3115 unp_unreachable = 0;
3116
3117 for (head = heads; *head != NULL; head++)
3118 LIST_FOREACH(unp, *head, unp_link) {
3119 KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0,
3120 ("%s: unp %p has unexpected gc flags 0x%x",
3121 __func__, unp, (unsigned int)unp->unp_gcflag));
3122
3123 f = unp->unp_file;
3124
3125 /*
3126 * Check for an unreachable socket potentially in a
3127 * cycle. It must be in a queue as indicated by
3128 * msgcount, and this must equal the file reference
3129 * count. Note that when msgcount is 0 the file is
3130 * NULL.
3131 */
3132 if (f != NULL && unp->unp_msgcount != 0 &&
3133 refcount_load(&f->f_count) == unp->unp_msgcount) {
3134 LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead);
3135 unp->unp_gcflag |= UNPGC_DEAD;
3136 unp->unp_gcrefs = unp->unp_msgcount;
3137 unp_unreachable++;
3138 }
3139 }
3140
3141 /*
3142 * Scan all sockets previously marked as potentially being in a cycle
3143 * and remove the references each socket holds on any UNPGC_DEAD
3144 * sockets in its queue. After this step, all remaining references on
3145 * sockets marked UNPGC_DEAD should not be part of any cycle.
3146 */
3147 LIST_FOREACH(unp, &unp_deadhead, unp_dead)
3148 unp_gc_scan(unp, unp_remove_dead_ref);
3149
3150 /*
3151 	 * If a socket still has a positive refcount, it cannot be in a
3152 	 * cycle.  In this case, iteratively increment the refcounts of its children.
3153 * Stop the scan once we do a complete loop without discovering
3154 * a new reachable socket.
3155 */
3156 do {
3157 unp_marked = 0;
3158 LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp)
3159 if (unp->unp_gcrefs > 0) {
3160 unp->unp_gcflag &= ~UNPGC_DEAD;
3161 LIST_REMOVE(unp, unp_dead);
3162 KASSERT(unp_unreachable > 0,
3163 ("%s: unp_unreachable underflow.",
3164 __func__));
3165 unp_unreachable--;
3166 unp_gc_scan(unp, unp_restore_undead_ref);
3167 }
3168 } while (unp_marked);
3169
3170 UNP_LINK_RUNLOCK();
3171
3172 if (unp_unreachable == 0)
3173 return;
3174
3175 /*
3176 * Allocate space for a local array of dead unpcbs.
3177 * TODO: can this path be simplified by instead using the local
3178 * dead list at unp_deadhead, after taking out references
3179 * on the file object and/or unpcb and dropping the link lock?
3180 */
3181 unref = malloc(unp_unreachable * sizeof(struct file *),
3182 M_TEMP, M_WAITOK);
3183
3184 /*
3185 * Iterate looking for sockets which have been specifically marked
3186 * as unreachable and store them locally.
3187 */
3188 UNP_LINK_RLOCK();
3189 total = 0;
3190 LIST_FOREACH(unp, &unp_deadhead, unp_dead) {
3191 KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0,
3192 ("%s: unp %p not marked UNPGC_DEAD", __func__, unp));
3193 unp->unp_gcflag &= ~UNPGC_DEAD;
3194 f = unp->unp_file;
3195 if (unp->unp_msgcount == 0 || f == NULL ||
3196 refcount_load(&f->f_count) != unp->unp_msgcount ||
3197 !fhold(f))
3198 continue;
3199 unref[total++] = f;
3200 KASSERT(total <= unp_unreachable,
3201 ("%s: incorrect unreachable count.", __func__));
3202 }
3203 UNP_LINK_RUNLOCK();
3204
3205 /*
3206 	 * Now flush all sockets, freeing rights.  This will free the
3207 * struct files associated with these sockets but leave each socket
3208 * with one remaining ref.
3209 */
3210 for (i = 0; i < total; i++) {
3211 struct socket *so;
3212
3213 so = unref[i]->f_data;
3214 CURVNET_SET(so->so_vnet);
3215 socantrcvmore(so);
3216 unp_dispose(so);
3217 CURVNET_RESTORE();
3218 }
3219
3220 /*
3221 * And finally release the sockets so they can be reclaimed.
3222 */
3223 for (i = 0; i < total; i++)
3224 fdrop(unref[i], NULL);
3225 unp_recycled += total;
3226 free(unref, M_TEMP);
3227 }
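/*
 * An illustrative sketch of the kind of garbage this task reclaims: a
 * socket whose only remaining reference is in-flight in its own receive
 * buffer is unreachable from userland, yet its struct file keeps a
 * positive reference count.
 *
 *	int fds[2];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
 *	... sendmsg(fds[0], ...) carrying fds[1] via SCM_RIGHTS ...
 *	close(fds[0]);
 *	close(fds[1]);
 *
 * The in-flight copy of fds[1] now sits in fds[1]'s own receive buffer, a
 * disconnected cycle; unp_gc() detects it by comparing f_count with
 * unp_msgcount and frees it.
 */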
3228
3229 /*
3230 * Synchronize against unp_gc, which can trip over data as we are freeing it.
3231 */
3232 static void
3233 unp_dispose(struct socket *so)
3234 {
3235 struct sockbuf *sb;
3236 struct unpcb *unp;
3237 struct mbuf *m;
3238 int error __diagused;
3239
3240 MPASS(!SOLISTENING(so));
3241
3242 unp = sotounpcb(so);
3243 UNP_LINK_WLOCK();
3244 unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
3245 UNP_LINK_WUNLOCK();
3246
3247 /*
3248 * Grab our special mbufs before calling sbrelease().
3249 */
3250 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
3251 MPASS(!error);
3252 SOCK_RECVBUF_LOCK(so);
3253 switch (so->so_type) {
3254 case SOCK_DGRAM:
3255 while ((sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) != NULL) {
3256 STAILQ_CONCAT(&so->so_rcv.uxdg_mb, &sb->uxdg_mb);
3257 TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist);
3258 /* Note: socket of sb may reconnect. */
3259 sb->uxdg_cc = sb->uxdg_ctl = sb->uxdg_mbcnt = 0;
3260 }
3261 sb = &so->so_rcv;
3262 if (sb->uxdg_peeked != NULL) {
3263 STAILQ_INSERT_HEAD(&sb->uxdg_mb, sb->uxdg_peeked,
3264 m_stailqpkt);
3265 sb->uxdg_peeked = NULL;
3266 }
3267 m = STAILQ_FIRST(&sb->uxdg_mb);
3268 STAILQ_INIT(&sb->uxdg_mb);
3269 /* XXX: our shortened sbrelease() */
3270 (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
3271 RLIM_INFINITY);
3272 /*
3273 	 * XXXGL Mark sb with SBS_CANTRCVMORE.  This is needed to
3274 	 * prevent uipc_sosend_dgram() or unp_disconnect() from adding
3275 	 * more data to the socket.
3276 	 * We came here either through shutdown(2) or from the final
3277 	 * sofree().  The sofree() case is simple as it guarantees
3278 	 * that no more sends will happen, however we can race with
3279 	 * unp_disconnect() from our peer.  The shutdown(2) case is
3280 	 * more exotic.  It would call into unp_dispose() only if the
3281 	 * socket is SS_ISCONNECTED.  This is possible if we did
3282 	 * connect(2) on this socket while also having it bound with
3283 	 * bind(2) and receiving connections from other sockets.
3284 	 * Because uipc_shutdown() violates POSIX (see comment
3285 	 * there) we will end up here shutting down our receive side.
3286 	 * Of course this will affect not only the peer we
3287 	 * connect(2)ed to, but also all of the peers who had
3288 	 * connect(2)ed to us.  Their sends would end up with ENOBUFS.
3289 */
3290 sb->sb_state |= SBS_CANTRCVMORE;
3291 break;
3292 case SOCK_STREAM:
3293 case SOCK_SEQPACKET:
3294 sb = &so->so_rcv;
3295 m = sbcut_locked(sb, sb->sb_ccc);
3296 KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
3297 ("%s: ccc %u mb %p mbcnt %u", __func__,
3298 sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
3299 sbrelease_locked(so, SO_RCV);
3300 break;
3301 }
3302 SOCK_RECVBUF_UNLOCK(so);
3303 SOCK_IO_RECV_UNLOCK(so);
3304
3305 if (m != NULL) {
3306 unp_scan(m, unp_freerights);
3307 m_freemp(m);
3308 }
3309 }
3310
3311 static void
3312 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
3313 {
3314 struct mbuf *m;
3315 struct cmsghdr *cm;
3316 void *data;
3317 socklen_t clen, datalen;
3318
3319 while (m0 != NULL) {
3320 for (m = m0; m; m = m->m_next) {
3321 if (m->m_type != MT_CONTROL)
3322 continue;
3323
3324 cm = mtod(m, struct cmsghdr *);
3325 clen = m->m_len;
3326
3327 while (cm != NULL) {
3328 if (sizeof(*cm) > clen || cm->cmsg_len > clen)
3329 break;
3330
3331 data = CMSG_DATA(cm);
3332 datalen = (caddr_t)cm + cm->cmsg_len
3333 - (caddr_t)data;
3334
3335 if (cm->cmsg_level == SOL_SOCKET &&
3336 cm->cmsg_type == SCM_RIGHTS) {
3337 (*op)(data, datalen /
3338 sizeof(struct filedescent *));
3339 }
3340
3341 if (CMSG_SPACE(datalen) < clen) {
3342 clen -= CMSG_SPACE(datalen);
3343 cm = (struct cmsghdr *)
3344 ((caddr_t)cm + CMSG_SPACE(datalen));
3345 } else {
3346 clen = 0;
3347 cm = NULL;
3348 }
3349 }
3350 }
3351 m0 = m0->m_nextpkt;
3352 }
3353 }
3354
3355 /*
3356 * Definitions of protocols supported in the LOCAL domain.
3357 */
3358 static struct protosw streamproto = {
3359 .pr_type = SOCK_STREAM,
3360 .pr_flags = PR_CONNREQUIRED | PR_WANTRCVD | PR_CAPATTACH,
3361 .pr_ctloutput = &uipc_ctloutput,
3362 .pr_abort = uipc_abort,
3363 .pr_accept = uipc_peeraddr,
3364 .pr_attach = uipc_attach,
3365 .pr_bind = uipc_bind,
3366 .pr_bindat = uipc_bindat,
3367 .pr_connect = uipc_connect,
3368 .pr_connectat = uipc_connectat,
3369 .pr_connect2 = uipc_connect2,
3370 .pr_detach = uipc_detach,
3371 .pr_disconnect = uipc_disconnect,
3372 .pr_listen = uipc_listen,
3373 .pr_peeraddr = uipc_peeraddr,
3374 .pr_rcvd = uipc_rcvd,
3375 .pr_send = uipc_send,
3376 .pr_ready = uipc_ready,
3377 .pr_sense = uipc_sense,
3378 .pr_shutdown = uipc_shutdown,
3379 .pr_sockaddr = uipc_sockaddr,
3380 .pr_soreceive = soreceive_generic,
3381 .pr_close = uipc_close,
3382 .pr_chmod = uipc_chmod,
3383 };
3384
3385 static struct protosw dgramproto = {
3386 .pr_type = SOCK_DGRAM,
3387 .pr_flags = PR_ATOMIC | PR_ADDR | PR_CAPATTACH | PR_SOCKBUF,
3388 .pr_ctloutput = &uipc_ctloutput,
3389 .pr_abort = uipc_abort,
3390 .pr_accept = uipc_peeraddr,
3391 .pr_attach = uipc_attach,
3392 .pr_bind = uipc_bind,
3393 .pr_bindat = uipc_bindat,
3394 .pr_connect = uipc_connect,
3395 .pr_connectat = uipc_connectat,
3396 .pr_connect2 = uipc_connect2,
3397 .pr_detach = uipc_detach,
3398 .pr_disconnect = uipc_disconnect,
3399 .pr_peeraddr = uipc_peeraddr,
3400 .pr_sosend = uipc_sosend_dgram,
3401 .pr_sense = uipc_sense,
3402 .pr_shutdown = uipc_shutdown,
3403 .pr_sockaddr = uipc_sockaddr,
3404 .pr_soreceive = uipc_soreceive_dgram,
3405 .pr_close = uipc_close,
3406 .pr_chmod = uipc_chmod,
3407 };
3408
3409 static struct protosw seqpacketproto = {
3410 .pr_type = SOCK_SEQPACKET,
3411 /*
3412 * XXXRW: For now, PR_ADDR because soreceive will bump into them
3413  * due to our use of sbappendaddr.  A new sbappend variant is needed
3414 * that supports both atomic record writes and control data.
3415 */
3416 .pr_flags = PR_ADDR | PR_ATOMIC | PR_CONNREQUIRED |
3417 PR_WANTRCVD | PR_CAPATTACH,
3418 .pr_ctloutput = &uipc_ctloutput,
3419 .pr_abort = uipc_abort,
3420 .pr_accept = uipc_peeraddr,
3421 .pr_attach = uipc_attach,
3422 .pr_bind = uipc_bind,
3423 .pr_bindat = uipc_bindat,
3424 .pr_connect = uipc_connect,
3425 .pr_connectat = uipc_connectat,
3426 .pr_connect2 = uipc_connect2,
3427 .pr_detach = uipc_detach,
3428 .pr_disconnect = uipc_disconnect,
3429 .pr_listen = uipc_listen,
3430 .pr_peeraddr = uipc_peeraddr,
3431 .pr_rcvd = uipc_rcvd,
3432 .pr_send = uipc_send,
3433 .pr_sense = uipc_sense,
3434 .pr_shutdown = uipc_shutdown,
3435 .pr_sockaddr = uipc_sockaddr,
3436 .pr_soreceive = soreceive_generic, /* XXX: or...? */
3437 .pr_close = uipc_close,
3438 .pr_chmod = uipc_chmod,
3439 };
3440
3441 static struct domain localdomain = {
3442 .dom_family = AF_LOCAL,
3443 .dom_name = "local",
3444 .dom_externalize = unp_externalize,
3445 .dom_nprotosw = 3,
3446 .dom_protosw = {
3447 &streamproto,
3448 &dgramproto,
3449 &seqpacketproto,
3450 }
3451 };
3452 DOMAIN_SET(local);
3453
3454 /*
3455 * A helper function called by VFS before socket-type vnode reclamation.
3456  * For an active vnode it clears the unp_vnode pointer and decrements the
3457  * vnode's use count.
3458 */
3459 void
3460 vfs_unp_reclaim(struct vnode *vp)
3461 {
3462 struct unpcb *unp;
3463 int active;
3464 struct mtx *vplock;
3465
3466 ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
3467 KASSERT(vp->v_type == VSOCK,
3468 ("vfs_unp_reclaim: vp->v_type != VSOCK"));
3469
3470 active = 0;
3471 vplock = mtx_pool_find(unp_vp_mtxpool, vp);
3472 mtx_lock(vplock);
3473 VOP_UNP_CONNECT(vp, &unp);
3474 if (unp == NULL)
3475 goto done;
3476 UNP_PCB_LOCK(unp);
3477 if (unp->unp_vnode == vp) {
3478 VOP_UNP_DETACH(vp);
3479 unp->unp_vnode = NULL;
3480 active = 1;
3481 }
3482 UNP_PCB_UNLOCK(unp);
3483 done:
3484 mtx_unlock(vplock);
3485 if (active)
3486 vunref(vp);
3487 }
3488
3489 #ifdef DDB
3490 static void
3491 db_print_indent(int indent)
3492 {
3493 int i;
3494
3495 for (i = 0; i < indent; i++)
3496 db_printf(" ");
3497 }
3498
3499 static void
3500 db_print_unpflags(int unp_flags)
3501 {
3502 int comma;
3503
3504 comma = 0;
3505 if (unp_flags & UNP_HAVEPC) {
3506 db_printf("%sUNP_HAVEPC", comma ? ", " : "");
3507 comma = 1;
3508 }
3509 if (unp_flags & UNP_WANTCRED_ALWAYS) {
3510 db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : "");
3511 comma = 1;
3512 }
3513 if (unp_flags & UNP_WANTCRED_ONESHOT) {
3514 db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : "");
3515 comma = 1;
3516 }
3517 if (unp_flags & UNP_CONNECTING) {
3518 db_printf("%sUNP_CONNECTING", comma ? ", " : "");
3519 comma = 1;
3520 }
3521 if (unp_flags & UNP_BINDING) {
3522 db_printf("%sUNP_BINDING", comma ? ", " : "");
3523 comma = 1;
3524 }
3525 }
3526
3527 static void
3528 db_print_xucred(int indent, struct xucred *xu)
3529 {
3530 int comma, i;
3531
3532 db_print_indent(indent);
3533 db_printf("cr_version: %u cr_uid: %u cr_pid: %d cr_ngroups: %d\n",
3534 xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups);
3535 db_print_indent(indent);
3536 db_printf("cr_groups: ");
3537 comma = 0;
3538 for (i = 0; i < xu->cr_ngroups; i++) {
3539 db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
3540 comma = 1;
3541 }
3542 db_printf("\n");
3543 }
3544
3545 static void
3546 db_print_unprefs(int indent, struct unp_head *uh)
3547 {
3548 struct unpcb *unp;
3549 int counter;
3550
3551 counter = 0;
3552 LIST_FOREACH(unp, uh, unp_reflink) {
3553 if (counter % 4 == 0)
3554 db_print_indent(indent);
3555 db_printf("%p ", unp);
3556 if (counter % 4 == 3)
3557 db_printf("\n");
3558 counter++;
3559 }
3560 if (counter != 0 && counter % 4 != 0)
3561 db_printf("\n");
3562 }
3563
3564 DB_SHOW_COMMAND(unpcb, db_show_unpcb)
3565 {
3566 struct unpcb *unp;
3567
3568 if (!have_addr) {
3569 db_printf("usage: show unpcb <addr>\n");
3570 return;
3571 }
3572 unp = (struct unpcb *)addr;
3573
3574 db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket,
3575 unp->unp_vnode);
3576
3577 db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino,
3578 unp->unp_conn);
3579
3580 db_printf("unp_refs:\n");
3581 db_print_unprefs(2, &unp->unp_refs);
3582
3583 /* XXXRW: Would be nice to print the full address, if any. */
3584 db_printf("unp_addr: %p\n", unp->unp_addr);
3585
3586 db_printf("unp_gencnt: %llu\n",
3587 (unsigned long long)unp->unp_gencnt);
3588
3589 db_printf("unp_flags: %x (", unp->unp_flags);
3590 db_print_unpflags(unp->unp_flags);
3591 db_printf(")\n");
3592
3593 db_printf("unp_peercred:\n");
3594 db_print_xucred(2, &unp->unp_peercred);
3595
3596 db_printf("unp_refcount: %u\n", unp->unp_refcount);
3597 }
3598 #endif
3599