xref: /linux/net/unix/af_unix.c (revision 1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
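/* Editor's note (illustrative sketch, not part of af_unix.c): a minimal
 * userspace example of the "abstract" binding described above.  The name
 * starts with a NUL byte and is a plain byte sequence; the address length
 * passed to bind() defines where the name ends, no terminator is needed,
 * and nothing is created on the filesystem.
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	memcpy(sun.sun_path, "\0example", 8);     <- leading NUL marks it abstract
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */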
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/bpf-cgroup.h>
81 #include <linux/btf_ids.h>
82 #include <linux/dcache.h>
83 #include <linux/errno.h>
84 #include <linux/fcntl.h>
85 #include <linux/file.h>
86 #include <linux/filter.h>
87 #include <linux/fs.h>
88 #include <linux/init.h>
89 #include <linux/kernel.h>
90 #include <linux/mount.h>
91 #include <linux/namei.h>
92 #include <linux/poll.h>
93 #include <linux/proc_fs.h>
94 #include <linux/sched/signal.h>
95 #include <linux/security.h>
96 #include <linux/seq_file.h>
97 #include <linux/skbuff.h>
98 #include <linux/slab.h>
99 #include <linux/socket.h>
100 #include <linux/splice.h>
101 #include <linux/string.h>
102 #include <linux/uaccess.h>
103 #include <net/af_unix.h>
104 #include <net/net_namespace.h>
105 #include <net/scm.h>
106 #include <net/tcp_states.h>
107 #include <uapi/linux/sockios.h>
108 #include <uapi/linux/termios.h>
109 
110 #include "af_unix.h"
111 
112 static atomic_long_t unix_nr_socks;
113 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
114 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
115 
116 /* SMP locking strategy:
117  *    hash table is protected with spinlock.
118  *    each socket state is protected by separate spinlock.
119  */
120 #ifdef CONFIG_PROVE_LOCKING
121 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
122 
123 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
124 				  const struct lockdep_map *b)
125 {
126 	return cmp_ptr(a, b);
127 }
128 
129 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
130 				  const struct lockdep_map *_b)
131 {
132 	const struct unix_sock *a, *b;
133 
134 	a = container_of(_a, struct unix_sock, lock.dep_map);
135 	b = container_of(_b, struct unix_sock, lock.dep_map);
136 
137 	if (a->sk.sk_state == TCP_LISTEN) {
138 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
139 		 *
140 		 *   1. a is TCP_LISTEN.
141 		 *   2. b is not a.
142 		 *   3. concurrent connect(b -> a) must fail.
143 		 *
144 		 * Except for 2. & 3., the b's state can be any possible
145 		 * value due to concurrent connect() or listen().
146 		 *
147 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
148 		 * be expressed as lock_cmp_fn.
149 		 */
150 		switch (b->sk.sk_state) {
151 		case TCP_CLOSE:
152 		case TCP_ESTABLISHED:
153 		case TCP_LISTEN:
154 			return -1;
155 		default:
156 			/* Invalid case. */
157 			return 0;
158 		}
159 	}
160 
161 	/* Should never happen.  Just to be symmetric. */
162 	if (b->sk.sk_state == TCP_LISTEN) {
163 		switch (b->sk.sk_state) {
164 		case TCP_CLOSE:
165 		case TCP_ESTABLISHED:
166 			return 1;
167 		default:
168 			return 0;
169 		}
170 	}
171 
172 	/* unix_state_double_lock(): ascending address order. */
173 	return cmp_ptr(a, b);
174 }
175 
176 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
177 				  const struct lockdep_map *_b)
178 {
179 	const struct sock *a, *b;
180 
181 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
182 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
183 
184 	/* unix_collect_skb(): listener -> embryo order. */
185 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
186 		return -1;
187 
188 	/* Should never happen.  Just to be symmetric. */
189 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
190 		return 1;
191 
192 	return 0;
193 }
194 #endif
195 
196 static unsigned int unix_unbound_hash(struct sock *sk)
197 {
198 	unsigned long hash = (unsigned long)sk;
199 
200 	hash ^= hash >> 16;
201 	hash ^= hash >> 8;
202 	hash ^= sk->sk_type;
203 
204 	return hash & UNIX_HASH_MOD;
205 }
206 
207 static unsigned int unix_bsd_hash(struct inode *i)
208 {
209 	return i->i_ino & UNIX_HASH_MOD;
210 }
211 
212 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
213 				       int addr_len, int type)
214 {
215 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
216 	unsigned int hash;
217 
218 	hash = (__force unsigned int)csum_fold(csum);
219 	hash ^= hash >> 8;
220 	hash ^= type;
221 
222 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
223 }
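/* Editor's note: a worked picture of the bucket layout implied by the three
 * hash functions above, assuming UNIX_HASH_MOD == 255 and UNIX_HASH_SIZE ==
 * 512 as defined in the af_unix headers (those constants are an assumption
 * of this sketch; check af_unix.h for the revision at hand):
 *
 *	unix_unbound_hash()	-> [0 .. 255]	from the sock pointer + type
 *	unix_bsd_hash()		-> [0 .. 255]	from the bound inode number
 *	unix_abstract_hash()	-> [256 .. 511]	from a checksum of the name
 *
 * Abstract names therefore never share a bucket with unbound or pathname
 * sockets, and bsd_socket_buckets[]/bsd_socket_locks[] only need
 * UNIX_HASH_SIZE / 2 entries because they are indexed by the
 * unix_bsd_hash() range alone.
 */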
224 
225 static void unix_table_double_lock(struct net *net,
226 				   unsigned int hash1, unsigned int hash2)
227 {
228 	if (hash1 == hash2) {
229 		spin_lock(&net->unx.table.locks[hash1]);
230 		return;
231 	}
232 
233 	if (hash1 > hash2)
234 		swap(hash1, hash2);
235 
236 	spin_lock(&net->unx.table.locks[hash1]);
237 	spin_lock(&net->unx.table.locks[hash2]);
238 }
239 
240 static void unix_table_double_unlock(struct net *net,
241 				     unsigned int hash1, unsigned int hash2)
242 {
243 	if (hash1 == hash2) {
244 		spin_unlock(&net->unx.table.locks[hash1]);
245 		return;
246 	}
247 
248 	spin_unlock(&net->unx.table.locks[hash1]);
249 	spin_unlock(&net->unx.table.locks[hash2]);
250 }
251 
252 #ifdef CONFIG_SECURITY_NETWORK
253 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
254 {
255 	UNIXCB(skb).secid = scm->secid;
256 }
257 
258 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
259 {
260 	scm->secid = UNIXCB(skb).secid;
261 }
262 
263 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
264 {
265 	return (scm->secid == UNIXCB(skb).secid);
266 }
267 #else
268 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
269 { }
270 
271 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
272 { }
273 
274 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
275 {
276 	return true;
277 }
278 #endif /* CONFIG_SECURITY_NETWORK */
279 
280 static inline int unix_may_send(struct sock *sk, struct sock *osk)
281 {
282 	return !unix_peer(osk) || unix_peer(osk) == sk;
283 }
284 
285 static inline int unix_recvq_full_lockless(const struct sock *sk)
286 {
287 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
288 }
289 
290 struct sock *unix_peer_get(struct sock *s)
291 {
292 	struct sock *peer;
293 
294 	unix_state_lock(s);
295 	peer = unix_peer(s);
296 	if (peer)
297 		sock_hold(peer);
298 	unix_state_unlock(s);
299 	return peer;
300 }
301 EXPORT_SYMBOL_GPL(unix_peer_get);
302 
303 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
304 					     int addr_len)
305 {
306 	struct unix_address *addr;
307 
308 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
309 	if (!addr)
310 		return NULL;
311 
312 	refcount_set(&addr->refcnt, 1);
313 	addr->len = addr_len;
314 	memcpy(addr->name, sunaddr, addr_len);
315 
316 	return addr;
317 }
318 
319 static inline void unix_release_addr(struct unix_address *addr)
320 {
321 	if (refcount_dec_and_test(&addr->refcnt))
322 		kfree(addr);
323 }
324 
325 /*
326  *	Check unix socket name:
327  *		- should be not zero length.
328  *	        - if started by not zero, should be NULL terminated (FS object)
329  *		- if started by zero, it is abstract name.
330  */
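/* Editor's note (illustrative sketch, made-up path): an address that passes
 * unix_validate_addr() below.  addr_len must be larger than
 * offsetof(struct sockaddr_un, sun_path), no larger than
 * sizeof(struct sockaddr_un), and sun_family must be AF_UNIX.
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	socklen_t len;
 *
 *	strcpy(sun.sun_path, "/run/demo.sock");
 *	len = offsetof(struct sockaddr_un, sun_path) +
 *	      strlen(sun.sun_path) + 1;     filesystem name, NUL included
 */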
331 
332 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
333 {
334 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
335 	    addr_len > sizeof(*sunaddr))
336 		return -EINVAL;
337 
338 	if (sunaddr->sun_family != AF_UNIX)
339 		return -EINVAL;
340 
341 	return 0;
342 }
343 
344 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
345 {
346 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
347 	short offset = offsetof(struct sockaddr_storage, __data);
348 
349 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
350 
351 	/* This may look like an off by one error but it is a bit more
352 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
353 	 * sun_path[108] doesn't as such exist.  However in kernel space
354 	 * we are guaranteed that it is a valid memory location in our
355 	 * kernel address buffer because syscall functions always pass
356 	 * a pointer of struct sockaddr_storage which has a bigger buffer
357 	 * than 108.  Also, we must terminate sun_path for strlen() in
358 	 * getname_kernel().
359 	 */
360 	addr->__data[addr_len - offset] = 0;
361 
362 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
363 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
364 	 * know the actual buffer.
365 	 */
366 	return strlen(addr->__data) + offset + 1;
367 }
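/* Editor's note: a worked example of unix_mkname_bsd() above, assuming the
 * usual 2-byte sa_family_t so that sun_path starts at offset 2:
 *
 *	bind() with "/tmp/x" and no trailing NUL: addr_len = 2 + 6 = 8
 *	  -> __data[8 - 2] = 0 terminates the string in the kernel copy
 *	  -> returns strlen("/tmp/x") + 2 + 1 = 9, i.e. the length
 *	     normalized to include the terminator
 *
 *	bind() with a full 108-byte, non-terminated path: addr_len = 110
 *	  -> __data[108] = 0 still lands inside the sockaddr_storage
 *	     buffer the syscall layer copied in, which is why this write
 *	     is not out of bounds
 */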
368 
369 static void __unix_remove_socket(struct sock *sk)
370 {
371 	sk_del_node_init(sk);
372 }
373 
374 static void __unix_insert_socket(struct net *net, struct sock *sk)
375 {
376 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
377 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
378 }
379 
380 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
381 				 struct unix_address *addr, unsigned int hash)
382 {
383 	__unix_remove_socket(sk);
384 	smp_store_release(&unix_sk(sk)->addr, addr);
385 
386 	sk->sk_hash = hash;
387 	__unix_insert_socket(net, sk);
388 }
389 
390 static void unix_remove_socket(struct net *net, struct sock *sk)
391 {
392 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
393 	__unix_remove_socket(sk);
394 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
395 }
396 
397 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
398 {
399 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
400 	__unix_insert_socket(net, sk);
401 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
402 }
403 
404 static void unix_insert_bsd_socket(struct sock *sk)
405 {
406 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
407 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
408 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
409 }
410 
411 static void unix_remove_bsd_socket(struct sock *sk)
412 {
413 	if (!hlist_unhashed(&sk->sk_bind_node)) {
414 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
415 		__sk_del_bind_node(sk);
416 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
417 
418 		sk_node_init(&sk->sk_bind_node);
419 	}
420 }
421 
422 static struct sock *__unix_find_socket_byname(struct net *net,
423 					      struct sockaddr_un *sunname,
424 					      int len, unsigned int hash)
425 {
426 	struct sock *s;
427 
428 	sk_for_each(s, &net->unx.table.buckets[hash]) {
429 		struct unix_sock *u = unix_sk(s);
430 
431 		if (u->addr->len == len &&
432 		    !memcmp(u->addr->name, sunname, len))
433 			return s;
434 	}
435 	return NULL;
436 }
437 
438 static inline struct sock *unix_find_socket_byname(struct net *net,
439 						   struct sockaddr_un *sunname,
440 						   int len, unsigned int hash)
441 {
442 	struct sock *s;
443 
444 	spin_lock(&net->unx.table.locks[hash]);
445 	s = __unix_find_socket_byname(net, sunname, len, hash);
446 	if (s)
447 		sock_hold(s);
448 	spin_unlock(&net->unx.table.locks[hash]);
449 	return s;
450 }
451 
452 static struct sock *unix_find_socket_byinode(struct inode *i)
453 {
454 	unsigned int hash = unix_bsd_hash(i);
455 	struct sock *s;
456 
457 	spin_lock(&bsd_socket_locks[hash]);
458 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
459 		struct dentry *dentry = unix_sk(s)->path.dentry;
460 
461 		if (dentry && d_backing_inode(dentry) == i) {
462 			sock_hold(s);
463 			spin_unlock(&bsd_socket_locks[hash]);
464 			return s;
465 		}
466 	}
467 	spin_unlock(&bsd_socket_locks[hash]);
468 	return NULL;
469 }
470 
471 /* Support code for asymmetrically connected dgram sockets
472  *
473  * If a datagram socket is connected to a socket not itself connected
474  * to the first socket (eg, /dev/log), clients may only enqueue more
475  * messages if the present receive queue of the server socket is not
476  * "too large". This means there's a second writeability condition
477  * poll and sendmsg need to test. The dgram recv code will do a wake
478  * up on the peer_wait wait queue of a socket upon reception of a
479  * datagram which needs to be propagated to sleeping would-be writers
480  * since these might not have sent anything so far. This can't be
481  * accomplished via poll_wait because the lifetime of the server
482  * socket might be less than that of its clients if these break their
483  * association with it or if the server socket is closed while clients
484  * are still connected to it and there's no way to inform "a polling
485  * implementation" that it should let go of a certain wait queue
486  *
487  * In order to propagate a wake up, a wait_queue_entry_t of the client
488  * socket is enqueued on the peer_wait queue of the server socket
489  * whose wake function does a wake_up on the ordinary client socket
490  * wait queue. This connection is established whenever a write (or
491  * poll for write) hit the flow control condition and broken when the
492  * association to the server socket is dissolved or after a wake up
493  * was relayed.
494  */
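/* Editor's note (illustrative userspace scenario; log_addr, msg and len are
 * placeholders): a datagram client connect()ed to a busy logging socket
 * blocks in poll() for POLLOUT once the server's receive queue is full.
 * The wakeup it waits for is generated when the *server* socket drains its
 * queue, and the peer_wait relay below is what forwards that wakeup to the
 * client's own wait queue.
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	connect(fd, (struct sockaddr *)&log_addr, log_addr_len);
 *	while (send(fd, msg, len, MSG_DONTWAIT) < 0 && errno == EAGAIN)
 *		poll(&pfd, 1, -1);     sleeps until the peer's queue drains
 */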
495 
496 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
497 				      void *key)
498 {
499 	struct unix_sock *u;
500 	wait_queue_head_t *u_sleep;
501 
502 	u = container_of(q, struct unix_sock, peer_wake);
503 
504 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
505 			    q);
506 	u->peer_wake.private = NULL;
507 
508 	/* relaying can only happen while the wq still exists */
509 	u_sleep = sk_sleep(&u->sk);
510 	if (u_sleep)
511 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
512 
513 	return 0;
514 }
515 
516 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
517 {
518 	struct unix_sock *u, *u_other;
519 	int rc;
520 
521 	u = unix_sk(sk);
522 	u_other = unix_sk(other);
523 	rc = 0;
524 	spin_lock(&u_other->peer_wait.lock);
525 
526 	if (!u->peer_wake.private) {
527 		u->peer_wake.private = other;
528 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
529 
530 		rc = 1;
531 	}
532 
533 	spin_unlock(&u_other->peer_wait.lock);
534 	return rc;
535 }
536 
537 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
538 					    struct sock *other)
539 {
540 	struct unix_sock *u, *u_other;
541 
542 	u = unix_sk(sk);
543 	u_other = unix_sk(other);
544 	spin_lock(&u_other->peer_wait.lock);
545 
546 	if (u->peer_wake.private == other) {
547 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
548 		u->peer_wake.private = NULL;
549 	}
550 
551 	spin_unlock(&u_other->peer_wait.lock);
552 }
553 
554 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
555 						   struct sock *other)
556 {
557 	unix_dgram_peer_wake_disconnect(sk, other);
558 	wake_up_interruptible_poll(sk_sleep(sk),
559 				   EPOLLOUT |
560 				   EPOLLWRNORM |
561 				   EPOLLWRBAND);
562 }
563 
564 /* preconditions:
565  *	- unix_peer(sk) == other
566  *	- association is stable
567  */
568 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
569 {
570 	int connected;
571 
572 	connected = unix_dgram_peer_wake_connect(sk, other);
573 
574 	/* If other is SOCK_DEAD, we want to make sure we signal
575 	 * POLLOUT, such that a subsequent write() can get a
576 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
577 	 * to other and its full, we will hang waiting for POLLOUT.
578 	 */
579 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
580 		return 1;
581 
582 	if (connected)
583 		unix_dgram_peer_wake_disconnect(sk, other);
584 
585 	return 0;
586 }
587 
588 static int unix_writable(const struct sock *sk, unsigned char state)
589 {
590 	return state != TCP_LISTEN &&
591 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
592 }
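/* Editor's note: the shift in unix_writable() is just "wmem_alloc * 4";
 * the socket counts as writable while at most a quarter of sk_sndbuf is
 * held by not-yet-consumed skbs.  For example, with an (assumed, typical)
 * sk_sndbuf of 212992 bytes, EPOLLOUT is reported while
 * sk_wmem_alloc <= 53248.
 */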
593 
594 static void unix_write_space(struct sock *sk)
595 {
596 	struct socket_wq *wq;
597 
598 	rcu_read_lock();
599 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
600 		wq = rcu_dereference(sk->sk_wq);
601 		if (skwq_has_sleeper(wq))
602 			wake_up_interruptible_sync_poll(&wq->wait,
603 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
604 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
605 	}
606 	rcu_read_unlock();
607 }
608 
609 /* When dgram socket disconnects (or changes its peer), we clear its receive
610  * queue of packets arrived from previous peer. First, it allows to do
611  * flow control based only on wmem_alloc; second, sk connected to peer
612  * may receive messages only from that peer. */
613 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
614 {
615 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
616 		skb_queue_purge_reason(&sk->sk_receive_queue,
617 				       SKB_DROP_REASON_UNIX_DISCONNECT);
618 
619 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
620 
621 		/* If one link of bidirectional dgram pipe is disconnected,
622 		 * we signal error. Messages are lost. Do not do this
623 		 * when the peer was not connected to us.
624 		 */
625 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
626 			WRITE_ONCE(other->sk_err, ECONNRESET);
627 			sk_error_report(other);
628 		}
629 	}
630 }
631 
632 static void unix_sock_destructor(struct sock *sk)
633 {
634 	struct unix_sock *u = unix_sk(sk);
635 
636 	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
637 
638 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
639 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
640 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
641 	if (!sock_flag(sk, SOCK_DEAD)) {
642 		pr_info("Attempt to release alive unix socket: %p\n", sk);
643 		return;
644 	}
645 
646 	if (u->addr)
647 		unix_release_addr(u->addr);
648 
649 	atomic_long_dec(&unix_nr_socks);
650 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
651 #ifdef UNIX_REFCNT_DEBUG
652 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
653 		atomic_long_read(&unix_nr_socks));
654 #endif
655 }
656 
657 static void unix_release_sock(struct sock *sk, int embrion)
658 {
659 	struct unix_sock *u = unix_sk(sk);
660 	struct sock *skpair;
661 	struct sk_buff *skb;
662 	struct path path;
663 	int state;
664 
665 	unix_remove_socket(sock_net(sk), sk);
666 	unix_remove_bsd_socket(sk);
667 
668 	/* Clear state */
669 	unix_state_lock(sk);
670 	sock_orphan(sk);
671 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
672 	path	     = u->path;
673 	u->path.dentry = NULL;
674 	u->path.mnt = NULL;
675 	state = sk->sk_state;
676 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
677 
678 	skpair = unix_peer(sk);
679 	unix_peer(sk) = NULL;
680 
681 	unix_state_unlock(sk);
682 
683 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
684 	u->oob_skb = NULL;
685 #endif
686 
687 	wake_up_interruptible_all(&u->peer_wait);
688 
689 	if (skpair != NULL) {
690 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
691 			unix_state_lock(skpair);
692 			/* No more writes */
693 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
694 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
695 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
696 			unix_state_unlock(skpair);
697 			skpair->sk_state_change(skpair);
698 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
699 		}
700 
701 		unix_dgram_peer_wake_disconnect(sk, skpair);
702 		sock_put(skpair); /* It may now die */
703 	}
704 
705 	/* Try to flush out this socket. Throw out buffers at least */
706 
707 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
708 		if (state == TCP_LISTEN)
709 			unix_release_sock(skb->sk, 1);
710 
711 		/* passed fds are erased in the kfree_skb hook */
712 		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
713 	}
714 
715 	if (path.dentry)
716 		path_put(&path);
717 
718 	sock_put(sk);
719 
720 	/* ---- Socket is dead now and most probably destroyed ---- */
721 
722 	/*
723 	 * Fixme: BSD difference: In BSD all sockets connected to us get
724 	 *	  ECONNRESET and we die on the spot. In Linux we behave
725 	 *	  like files and pipes do and wait for the last
726 	 *	  dereference.
727 	 *
728 	 * Can't we simply set sock->err?
729 	 *
730 	 *	  What the above comment does talk about? --ANK(980817)
731 	 */
732 
733 	if (READ_ONCE(unix_tot_inflight))
734 		unix_gc();		/* Garbage collect fds */
735 }
736 
737 static void init_peercred(struct sock *sk)
738 {
739 	sk->sk_peer_pid = get_pid(task_tgid(current));
740 	sk->sk_peer_cred = get_current_cred();
741 }
742 
743 static void update_peercred(struct sock *sk)
744 {
745 	const struct cred *old_cred;
746 	struct pid *old_pid;
747 
748 	spin_lock(&sk->sk_peer_lock);
749 	old_pid = sk->sk_peer_pid;
750 	old_cred = sk->sk_peer_cred;
751 	init_peercred(sk);
752 	spin_unlock(&sk->sk_peer_lock);
753 
754 	put_pid(old_pid);
755 	put_cred(old_cred);
756 }
757 
758 static void copy_peercred(struct sock *sk, struct sock *peersk)
759 {
760 	lockdep_assert_held(&unix_sk(peersk)->lock);
761 
762 	spin_lock(&sk->sk_peer_lock);
763 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
764 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
765 	spin_unlock(&sk->sk_peer_lock);
766 }
767 
768 static int unix_listen(struct socket *sock, int backlog)
769 {
770 	int err;
771 	struct sock *sk = sock->sk;
772 	struct unix_sock *u = unix_sk(sk);
773 
774 	err = -EOPNOTSUPP;
775 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
776 		goto out;	/* Only stream/seqpacket sockets accept */
777 	err = -EINVAL;
778 	if (!READ_ONCE(u->addr))
779 		goto out;	/* No listens on an unbound socket */
780 	unix_state_lock(sk);
781 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
782 		goto out_unlock;
783 	if (backlog > sk->sk_max_ack_backlog)
784 		wake_up_interruptible_all(&u->peer_wait);
785 	sk->sk_max_ack_backlog	= backlog;
786 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
787 
788 	/* set credentials so connect can copy them */
789 	update_peercred(sk);
790 	err = 0;
791 
792 out_unlock:
793 	unix_state_unlock(sk);
794 out:
795 	return err;
796 }
797 
798 static int unix_release(struct socket *);
799 static int unix_bind(struct socket *, struct sockaddr *, int);
800 static int unix_stream_connect(struct socket *, struct sockaddr *,
801 			       int addr_len, int flags);
802 static int unix_socketpair(struct socket *, struct socket *);
803 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
804 static int unix_getname(struct socket *, struct sockaddr *, int);
805 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
806 static __poll_t unix_dgram_poll(struct file *, struct socket *,
807 				    poll_table *);
808 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
809 #ifdef CONFIG_COMPAT
810 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
811 #endif
812 static int unix_shutdown(struct socket *, int);
813 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
814 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
815 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
816 				       struct pipe_inode_info *, size_t size,
817 				       unsigned int flags);
818 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
819 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
820 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
821 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
822 static int unix_dgram_connect(struct socket *, struct sockaddr *,
823 			      int, int);
824 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
825 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
826 				  int);
827 
828 #ifdef CONFIG_PROC_FS
829 static int unix_count_nr_fds(struct sock *sk)
830 {
831 	struct sk_buff *skb;
832 	struct unix_sock *u;
833 	int nr_fds = 0;
834 
835 	spin_lock(&sk->sk_receive_queue.lock);
836 	skb = skb_peek(&sk->sk_receive_queue);
837 	while (skb) {
838 		u = unix_sk(skb->sk);
839 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
840 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
841 	}
842 	spin_unlock(&sk->sk_receive_queue.lock);
843 
844 	return nr_fds;
845 }
846 
847 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
848 {
849 	struct sock *sk = sock->sk;
850 	unsigned char s_state;
851 	struct unix_sock *u;
852 	int nr_fds = 0;
853 
854 	if (sk) {
855 		s_state = READ_ONCE(sk->sk_state);
856 		u = unix_sk(sk);
857 
858 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
859 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
860 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
861 		 */
862 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
863 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
864 		else if (s_state == TCP_LISTEN)
865 			nr_fds = unix_count_nr_fds(sk);
866 
867 		seq_printf(m, "scm_fds: %u\n", nr_fds);
868 	}
869 }
870 #else
871 #define unix_show_fdinfo NULL
872 #endif
873 
874 static const struct proto_ops unix_stream_ops = {
875 	.family =	PF_UNIX,
876 	.owner =	THIS_MODULE,
877 	.release =	unix_release,
878 	.bind =		unix_bind,
879 	.connect =	unix_stream_connect,
880 	.socketpair =	unix_socketpair,
881 	.accept =	unix_accept,
882 	.getname =	unix_getname,
883 	.poll =		unix_poll,
884 	.ioctl =	unix_ioctl,
885 #ifdef CONFIG_COMPAT
886 	.compat_ioctl =	unix_compat_ioctl,
887 #endif
888 	.listen =	unix_listen,
889 	.shutdown =	unix_shutdown,
890 	.sendmsg =	unix_stream_sendmsg,
891 	.recvmsg =	unix_stream_recvmsg,
892 	.read_skb =	unix_stream_read_skb,
893 	.mmap =		sock_no_mmap,
894 	.splice_read =	unix_stream_splice_read,
895 	.set_peek_off =	sk_set_peek_off,
896 	.show_fdinfo =	unix_show_fdinfo,
897 };
898 
899 static const struct proto_ops unix_dgram_ops = {
900 	.family =	PF_UNIX,
901 	.owner =	THIS_MODULE,
902 	.release =	unix_release,
903 	.bind =		unix_bind,
904 	.connect =	unix_dgram_connect,
905 	.socketpair =	unix_socketpair,
906 	.accept =	sock_no_accept,
907 	.getname =	unix_getname,
908 	.poll =		unix_dgram_poll,
909 	.ioctl =	unix_ioctl,
910 #ifdef CONFIG_COMPAT
911 	.compat_ioctl =	unix_compat_ioctl,
912 #endif
913 	.listen =	sock_no_listen,
914 	.shutdown =	unix_shutdown,
915 	.sendmsg =	unix_dgram_sendmsg,
916 	.read_skb =	unix_read_skb,
917 	.recvmsg =	unix_dgram_recvmsg,
918 	.mmap =		sock_no_mmap,
919 	.set_peek_off =	sk_set_peek_off,
920 	.show_fdinfo =	unix_show_fdinfo,
921 };
922 
923 static const struct proto_ops unix_seqpacket_ops = {
924 	.family =	PF_UNIX,
925 	.owner =	THIS_MODULE,
926 	.release =	unix_release,
927 	.bind =		unix_bind,
928 	.connect =	unix_stream_connect,
929 	.socketpair =	unix_socketpair,
930 	.accept =	unix_accept,
931 	.getname =	unix_getname,
932 	.poll =		unix_dgram_poll,
933 	.ioctl =	unix_ioctl,
934 #ifdef CONFIG_COMPAT
935 	.compat_ioctl =	unix_compat_ioctl,
936 #endif
937 	.listen =	unix_listen,
938 	.shutdown =	unix_shutdown,
939 	.sendmsg =	unix_seqpacket_sendmsg,
940 	.recvmsg =	unix_seqpacket_recvmsg,
941 	.mmap =		sock_no_mmap,
942 	.set_peek_off =	sk_set_peek_off,
943 	.show_fdinfo =	unix_show_fdinfo,
944 };
945 
946 static void unix_close(struct sock *sk, long timeout)
947 {
948 	/* Nothing to do here, unix socket does not need a ->close().
949 	 * This is merely for sockmap.
950 	 */
951 }
952 
953 static void unix_unhash(struct sock *sk)
954 {
955 	/* Nothing to do here, unix socket does not need a ->unhash().
956 	 * This is merely for sockmap.
957 	 */
958 }
959 
960 static bool unix_bpf_bypass_getsockopt(int level, int optname)
961 {
962 	if (level == SOL_SOCKET) {
963 		switch (optname) {
964 		case SO_PEERPIDFD:
965 			return true;
966 		default:
967 			return false;
968 		}
969 	}
970 
971 	return false;
972 }
973 
974 struct proto unix_dgram_proto = {
975 	.name			= "UNIX",
976 	.owner			= THIS_MODULE,
977 	.obj_size		= sizeof(struct unix_sock),
978 	.close			= unix_close,
979 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
980 #ifdef CONFIG_BPF_SYSCALL
981 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
982 #endif
983 };
984 
985 struct proto unix_stream_proto = {
986 	.name			= "UNIX-STREAM",
987 	.owner			= THIS_MODULE,
988 	.obj_size		= sizeof(struct unix_sock),
989 	.close			= unix_close,
990 	.unhash			= unix_unhash,
991 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
992 #ifdef CONFIG_BPF_SYSCALL
993 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
994 #endif
995 };
996 
997 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
998 {
999 	struct unix_sock *u;
1000 	struct sock *sk;
1001 	int err;
1002 
1003 	atomic_long_inc(&unix_nr_socks);
1004 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1005 		err = -ENFILE;
1006 		goto err;
1007 	}
1008 
1009 	if (type == SOCK_STREAM)
1010 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1011 	else /*dgram and  seqpacket */
1012 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1013 
1014 	if (!sk) {
1015 		err = -ENOMEM;
1016 		goto err;
1017 	}
1018 
1019 	sock_init_data(sock, sk);
1020 
1021 	sk->sk_hash		= unix_unbound_hash(sk);
1022 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1023 	sk->sk_write_space	= unix_write_space;
1024 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1025 	sk->sk_destruct		= unix_sock_destructor;
1026 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1027 
1028 	u = unix_sk(sk);
1029 	u->listener = NULL;
1030 	u->vertex = NULL;
1031 	u->path.dentry = NULL;
1032 	u->path.mnt = NULL;
1033 	spin_lock_init(&u->lock);
1034 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1035 	mutex_init(&u->iolock); /* single task reading lock */
1036 	mutex_init(&u->bindlock); /* single task binding lock */
1037 	init_waitqueue_head(&u->peer_wait);
1038 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1039 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1040 	unix_insert_unbound_socket(net, sk);
1041 
1042 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1043 
1044 	return sk;
1045 
1046 err:
1047 	atomic_long_dec(&unix_nr_socks);
1048 	return ERR_PTR(err);
1049 }
1050 
1051 static int unix_create(struct net *net, struct socket *sock, int protocol,
1052 		       int kern)
1053 {
1054 	struct sock *sk;
1055 
1056 	if (protocol && protocol != PF_UNIX)
1057 		return -EPROTONOSUPPORT;
1058 
1059 	sock->state = SS_UNCONNECTED;
1060 
1061 	switch (sock->type) {
1062 	case SOCK_STREAM:
1063 		sock->ops = &unix_stream_ops;
1064 		break;
1065 		/*
1066 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1067 		 *	nothing uses it.
1068 		 */
1069 	case SOCK_RAW:
1070 		sock->type = SOCK_DGRAM;
1071 		fallthrough;
1072 	case SOCK_DGRAM:
1073 		sock->ops = &unix_dgram_ops;
1074 		break;
1075 	case SOCK_SEQPACKET:
1076 		sock->ops = &unix_seqpacket_ops;
1077 		break;
1078 	default:
1079 		return -ESOCKTNOSUPPORT;
1080 	}
1081 
1082 	sk = unix_create1(net, sock, kern, sock->type);
1083 	if (IS_ERR(sk))
1084 		return PTR_ERR(sk);
1085 
1086 	return 0;
1087 }
1088 
1089 static int unix_release(struct socket *sock)
1090 {
1091 	struct sock *sk = sock->sk;
1092 
1093 	if (!sk)
1094 		return 0;
1095 
1096 	sk->sk_prot->close(sk, 0);
1097 	unix_release_sock(sk, 0);
1098 	sock->sk = NULL;
1099 
1100 	return 0;
1101 }
1102 
1103 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1104 				  int type)
1105 {
1106 	struct inode *inode;
1107 	struct path path;
1108 	struct sock *sk;
1109 	int err;
1110 
1111 	unix_mkname_bsd(sunaddr, addr_len);
1112 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1113 	if (err)
1114 		goto fail;
1115 
1116 	err = path_permission(&path, MAY_WRITE);
1117 	if (err)
1118 		goto path_put;
1119 
1120 	err = -ECONNREFUSED;
1121 	inode = d_backing_inode(path.dentry);
1122 	if (!S_ISSOCK(inode->i_mode))
1123 		goto path_put;
1124 
1125 	sk = unix_find_socket_byinode(inode);
1126 	if (!sk)
1127 		goto path_put;
1128 
1129 	err = -EPROTOTYPE;
1130 	if (sk->sk_type == type)
1131 		touch_atime(&path);
1132 	else
1133 		goto sock_put;
1134 
1135 	path_put(&path);
1136 
1137 	return sk;
1138 
1139 sock_put:
1140 	sock_put(sk);
1141 path_put:
1142 	path_put(&path);
1143 fail:
1144 	return ERR_PTR(err);
1145 }
1146 
1147 static struct sock *unix_find_abstract(struct net *net,
1148 				       struct sockaddr_un *sunaddr,
1149 				       int addr_len, int type)
1150 {
1151 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1152 	struct dentry *dentry;
1153 	struct sock *sk;
1154 
1155 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1156 	if (!sk)
1157 		return ERR_PTR(-ECONNREFUSED);
1158 
1159 	dentry = unix_sk(sk)->path.dentry;
1160 	if (dentry)
1161 		touch_atime(&unix_sk(sk)->path);
1162 
1163 	return sk;
1164 }
1165 
1166 static struct sock *unix_find_other(struct net *net,
1167 				    struct sockaddr_un *sunaddr,
1168 				    int addr_len, int type)
1169 {
1170 	struct sock *sk;
1171 
1172 	if (sunaddr->sun_path[0])
1173 		sk = unix_find_bsd(sunaddr, addr_len, type);
1174 	else
1175 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1176 
1177 	return sk;
1178 }
1179 
1180 static int unix_autobind(struct sock *sk)
1181 {
1182 	struct unix_sock *u = unix_sk(sk);
1183 	unsigned int new_hash, old_hash;
1184 	struct net *net = sock_net(sk);
1185 	struct unix_address *addr;
1186 	u32 lastnum, ordernum;
1187 	int err;
1188 
1189 	err = mutex_lock_interruptible(&u->bindlock);
1190 	if (err)
1191 		return err;
1192 
1193 	if (u->addr)
1194 		goto out;
1195 
1196 	err = -ENOMEM;
1197 	addr = kzalloc(sizeof(*addr) +
1198 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1199 	if (!addr)
1200 		goto out;
1201 
1202 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1203 	addr->name->sun_family = AF_UNIX;
1204 	refcount_set(&addr->refcnt, 1);
1205 
1206 	old_hash = sk->sk_hash;
1207 	ordernum = get_random_u32();
1208 	lastnum = ordernum & 0xFFFFF;
1209 retry:
1210 	ordernum = (ordernum + 1) & 0xFFFFF;
1211 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1212 
1213 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1214 	unix_table_double_lock(net, old_hash, new_hash);
1215 
1216 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1217 		unix_table_double_unlock(net, old_hash, new_hash);
1218 
1219 		/* __unix_find_socket_byname() may take long time if many names
1220 		 * are already in use.
1221 		 */
1222 		cond_resched();
1223 
1224 		if (ordernum == lastnum) {
1225 			/* Give up if all names seems to be in use. */
1226 			err = -ENOSPC;
1227 			unix_release_addr(addr);
1228 			goto out;
1229 		}
1230 
1231 		goto retry;
1232 	}
1233 
1234 	__unix_set_addr_hash(net, sk, addr, new_hash);
1235 	unix_table_double_unlock(net, old_hash, new_hash);
1236 	err = 0;
1237 
1238 out:	mutex_unlock(&u->bindlock);
1239 	return err;
1240 }
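/* Editor's note: a worked example of the name unix_autobind() generates.
 * addr->len is offsetof(struct sockaddr_un, sun_path) + 6: one leading NUL
 * byte plus five random hex digits, e.g. sun_path = "\0" "a3f21", which
 * tools such as ss(8) would render as the abstract name "@a3f21" (the
 * digits here are made up; they are random per socket).
 */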
1241 
1242 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1243 			 int addr_len)
1244 {
1245 	umode_t mode = S_IFSOCK |
1246 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1247 	struct unix_sock *u = unix_sk(sk);
1248 	unsigned int new_hash, old_hash;
1249 	struct net *net = sock_net(sk);
1250 	struct mnt_idmap *idmap;
1251 	struct unix_address *addr;
1252 	struct dentry *dentry;
1253 	struct path parent;
1254 	int err;
1255 
1256 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1257 	addr = unix_create_addr(sunaddr, addr_len);
1258 	if (!addr)
1259 		return -ENOMEM;
1260 
1261 	/*
1262 	 * Get the parent directory, calculate the hash for last
1263 	 * component.
1264 	 */
1265 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1266 	if (IS_ERR(dentry)) {
1267 		err = PTR_ERR(dentry);
1268 		goto out;
1269 	}
1270 
1271 	/*
1272 	 * All right, let's create it.
1273 	 */
1274 	idmap = mnt_idmap(parent.mnt);
1275 	err = security_path_mknod(&parent, dentry, mode, 0);
1276 	if (!err)
1277 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1278 	if (err)
1279 		goto out_path;
1280 	err = mutex_lock_interruptible(&u->bindlock);
1281 	if (err)
1282 		goto out_unlink;
1283 	if (u->addr)
1284 		goto out_unlock;
1285 
1286 	old_hash = sk->sk_hash;
1287 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1288 	unix_table_double_lock(net, old_hash, new_hash);
1289 	u->path.mnt = mntget(parent.mnt);
1290 	u->path.dentry = dget(dentry);
1291 	__unix_set_addr_hash(net, sk, addr, new_hash);
1292 	unix_table_double_unlock(net, old_hash, new_hash);
1293 	unix_insert_bsd_socket(sk);
1294 	mutex_unlock(&u->bindlock);
1295 	done_path_create(&parent, dentry);
1296 	return 0;
1297 
1298 out_unlock:
1299 	mutex_unlock(&u->bindlock);
1300 	err = -EINVAL;
1301 out_unlink:
1302 	/* failed after successful mknod?  unlink what we'd created... */
1303 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1304 out_path:
1305 	done_path_create(&parent, dentry);
1306 out:
1307 	unix_release_addr(addr);
1308 	return err == -EEXIST ? -EADDRINUSE : err;
1309 }
1310 
1311 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1312 			      int addr_len)
1313 {
1314 	struct unix_sock *u = unix_sk(sk);
1315 	unsigned int new_hash, old_hash;
1316 	struct net *net = sock_net(sk);
1317 	struct unix_address *addr;
1318 	int err;
1319 
1320 	addr = unix_create_addr(sunaddr, addr_len);
1321 	if (!addr)
1322 		return -ENOMEM;
1323 
1324 	err = mutex_lock_interruptible(&u->bindlock);
1325 	if (err)
1326 		goto out;
1327 
1328 	if (u->addr) {
1329 		err = -EINVAL;
1330 		goto out_mutex;
1331 	}
1332 
1333 	old_hash = sk->sk_hash;
1334 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1335 	unix_table_double_lock(net, old_hash, new_hash);
1336 
1337 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1338 		goto out_spin;
1339 
1340 	__unix_set_addr_hash(net, sk, addr, new_hash);
1341 	unix_table_double_unlock(net, old_hash, new_hash);
1342 	mutex_unlock(&u->bindlock);
1343 	return 0;
1344 
1345 out_spin:
1346 	unix_table_double_unlock(net, old_hash, new_hash);
1347 	err = -EADDRINUSE;
1348 out_mutex:
1349 	mutex_unlock(&u->bindlock);
1350 out:
1351 	unix_release_addr(addr);
1352 	return err;
1353 }
1354 
1355 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1356 {
1357 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1358 	struct sock *sk = sock->sk;
1359 	int err;
1360 
1361 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1362 	    sunaddr->sun_family == AF_UNIX)
1363 		return unix_autobind(sk);
1364 
1365 	err = unix_validate_addr(sunaddr, addr_len);
1366 	if (err)
1367 		return err;
1368 
1369 	if (sunaddr->sun_path[0])
1370 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1371 	else
1372 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1373 
1374 	return err;
1375 }
1376 
1377 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1378 {
1379 	if (unlikely(sk1 == sk2) || !sk2) {
1380 		unix_state_lock(sk1);
1381 		return;
1382 	}
1383 
1384 	if (sk1 > sk2)
1385 		swap(sk1, sk2);
1386 
1387 	unix_state_lock(sk1);
1388 	unix_state_lock(sk2);
1389 }
1390 
1391 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1392 {
1393 	if (unlikely(sk1 == sk2) || !sk2) {
1394 		unix_state_unlock(sk1);
1395 		return;
1396 	}
1397 	unix_state_unlock(sk1);
1398 	unix_state_unlock(sk2);
1399 }
1400 
1401 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1402 			      int alen, int flags)
1403 {
1404 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1405 	struct sock *sk = sock->sk;
1406 	struct sock *other;
1407 	int err;
1408 
1409 	err = -EINVAL;
1410 	if (alen < offsetofend(struct sockaddr, sa_family))
1411 		goto out;
1412 
1413 	if (addr->sa_family != AF_UNSPEC) {
1414 		err = unix_validate_addr(sunaddr, alen);
1415 		if (err)
1416 			goto out;
1417 
1418 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1419 		if (err)
1420 			goto out;
1421 
1422 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1423 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1424 		    !READ_ONCE(unix_sk(sk)->addr)) {
1425 			err = unix_autobind(sk);
1426 			if (err)
1427 				goto out;
1428 		}
1429 
1430 restart:
1431 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1432 		if (IS_ERR(other)) {
1433 			err = PTR_ERR(other);
1434 			goto out;
1435 		}
1436 
1437 		unix_state_double_lock(sk, other);
1438 
1439 		/* Apparently VFS overslept socket death. Retry. */
1440 		if (sock_flag(other, SOCK_DEAD)) {
1441 			unix_state_double_unlock(sk, other);
1442 			sock_put(other);
1443 			goto restart;
1444 		}
1445 
1446 		err = -EPERM;
1447 		if (!unix_may_send(sk, other))
1448 			goto out_unlock;
1449 
1450 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1451 		if (err)
1452 			goto out_unlock;
1453 
1454 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1455 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1456 	} else {
1457 		/*
1458 		 *	1003.1g breaking connected state with AF_UNSPEC
1459 		 */
1460 		other = NULL;
1461 		unix_state_double_lock(sk, other);
1462 	}
1463 
1464 	/*
1465 	 * If it was connected, reconnect.
1466 	 */
1467 	if (unix_peer(sk)) {
1468 		struct sock *old_peer = unix_peer(sk);
1469 
1470 		unix_peer(sk) = other;
1471 		if (!other)
1472 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1473 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1474 
1475 		unix_state_double_unlock(sk, other);
1476 
1477 		if (other != old_peer) {
1478 			unix_dgram_disconnected(sk, old_peer);
1479 
1480 			unix_state_lock(old_peer);
1481 			if (!unix_peer(old_peer))
1482 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1483 			unix_state_unlock(old_peer);
1484 		}
1485 
1486 		sock_put(old_peer);
1487 	} else {
1488 		unix_peer(sk) = other;
1489 		unix_state_double_unlock(sk, other);
1490 	}
1491 
1492 	return 0;
1493 
1494 out_unlock:
1495 	unix_state_double_unlock(sk, other);
1496 	sock_put(other);
1497 out:
1498 	return err;
1499 }
1500 
1501 static long unix_wait_for_peer(struct sock *other, long timeo)
1502 {
1503 	struct unix_sock *u = unix_sk(other);
1504 	int sched;
1505 	DEFINE_WAIT(wait);
1506 
1507 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1508 
1509 	sched = !sock_flag(other, SOCK_DEAD) &&
1510 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1511 		unix_recvq_full_lockless(other);
1512 
1513 	unix_state_unlock(other);
1514 
1515 	if (sched)
1516 		timeo = schedule_timeout(timeo);
1517 
1518 	finish_wait(&u->peer_wait, &wait);
1519 	return timeo;
1520 }
1521 
1522 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1523 			       int addr_len, int flags)
1524 {
1525 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1526 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1527 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1528 	struct net *net = sock_net(sk);
1529 	struct sk_buff *skb = NULL;
1530 	unsigned char state;
1531 	long timeo;
1532 	int err;
1533 
1534 	err = unix_validate_addr(sunaddr, addr_len);
1535 	if (err)
1536 		goto out;
1537 
1538 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1539 	if (err)
1540 		goto out;
1541 
1542 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1543 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1544 	    !READ_ONCE(u->addr)) {
1545 		err = unix_autobind(sk);
1546 		if (err)
1547 			goto out;
1548 	}
1549 
1550 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1551 
1552 	/* First of all allocate resources.
1553 	 * If we will make it after state is locked,
1554 	 * we will have to recheck all again in any case.
1555 	 */
1556 
1557 	/* create new sock for complete connection */
1558 	newsk = unix_create1(net, NULL, 0, sock->type);
1559 	if (IS_ERR(newsk)) {
1560 		err = PTR_ERR(newsk);
1561 		goto out;
1562 	}
1563 
1564 	/* Allocate skb for sending to listening sock */
1565 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1566 	if (!skb) {
1567 		err = -ENOMEM;
1568 		goto out_free_sk;
1569 	}
1570 
1571 restart:
1572 	/*  Find listening sock. */
1573 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1574 	if (IS_ERR(other)) {
1575 		err = PTR_ERR(other);
1576 		goto out_free_skb;
1577 	}
1578 
1579 	unix_state_lock(other);
1580 
1581 	/* Apparently VFS overslept socket death. Retry. */
1582 	if (sock_flag(other, SOCK_DEAD)) {
1583 		unix_state_unlock(other);
1584 		sock_put(other);
1585 		goto restart;
1586 	}
1587 
1588 	if (other->sk_state != TCP_LISTEN ||
1589 	    other->sk_shutdown & RCV_SHUTDOWN) {
1590 		err = -ECONNREFUSED;
1591 		goto out_unlock;
1592 	}
1593 
1594 	if (unix_recvq_full_lockless(other)) {
1595 		if (!timeo) {
1596 			err = -EAGAIN;
1597 			goto out_unlock;
1598 		}
1599 
1600 		timeo = unix_wait_for_peer(other, timeo);
1601 		sock_put(other);
1602 
1603 		err = sock_intr_errno(timeo);
1604 		if (signal_pending(current))
1605 			goto out_free_skb;
1606 
1607 		goto restart;
1608 	}
1609 
1610 	/* self connect and simultaneous connect are eliminated
1611 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1612 	 */
1613 	state = READ_ONCE(sk->sk_state);
1614 	if (unlikely(state != TCP_CLOSE)) {
1615 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1616 		goto out_unlock;
1617 	}
1618 
1619 	unix_state_lock(sk);
1620 
1621 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1622 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1623 		unix_state_unlock(sk);
1624 		goto out_unlock;
1625 	}
1626 
1627 	err = security_unix_stream_connect(sk, other, newsk);
1628 	if (err) {
1629 		unix_state_unlock(sk);
1630 		goto out_unlock;
1631 	}
1632 
1633 	/* The way is open! Fastly set all the necessary fields... */
1634 
1635 	sock_hold(sk);
1636 	unix_peer(newsk)	= sk;
1637 	newsk->sk_state		= TCP_ESTABLISHED;
1638 	newsk->sk_type		= sk->sk_type;
1639 	init_peercred(newsk);
1640 	newu = unix_sk(newsk);
1641 	newu->listener = other;
1642 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1643 	otheru = unix_sk(other);
1644 
1645 	/* copy address information from listening to new sock
1646 	 *
1647 	 * The contents of *(otheru->addr) and otheru->path
1648 	 * are seen fully set up here, since we have found
1649 	 * otheru in hash under its lock.  Insertion into the
1650 	 * hash chain we'd found it in had been done in an
1651 	 * earlier critical area protected by the chain's lock,
1652 	 * the same one where we'd set *(otheru->addr) contents,
1653 	 * as well as otheru->path and otheru->addr itself.
1654 	 *
1655 	 * Using smp_store_release() here to set newu->addr
1656 	 * is enough to make those stores, as well as stores
1657 	 * to newu->path visible to anyone who gets newu->addr
1658 	 * by smp_load_acquire().  IOW, the same warranties
1659 	 * as for unix_sock instances bound in unix_bind() or
1660 	 * in unix_autobind().
1661 	 */
1662 	if (otheru->path.dentry) {
1663 		path_get(&otheru->path);
1664 		newu->path = otheru->path;
1665 	}
1666 	refcount_inc(&otheru->addr->refcnt);
1667 	smp_store_release(&newu->addr, otheru->addr);
1668 
1669 	/* Set credentials */
1670 	copy_peercred(sk, other);
1671 
1672 	sock->state	= SS_CONNECTED;
1673 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1674 	sock_hold(newsk);
1675 
1676 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1677 	unix_peer(sk)	= newsk;
1678 
1679 	unix_state_unlock(sk);
1680 
1681 	/* take ten and send info to listening sock */
1682 	spin_lock(&other->sk_receive_queue.lock);
1683 	__skb_queue_tail(&other->sk_receive_queue, skb);
1684 	spin_unlock(&other->sk_receive_queue.lock);
1685 	unix_state_unlock(other);
1686 	other->sk_data_ready(other);
1687 	sock_put(other);
1688 	return 0;
1689 
1690 out_unlock:
1691 	unix_state_unlock(other);
1692 	sock_put(other);
1693 out_free_skb:
1694 	consume_skb(skb);
1695 out_free_sk:
1696 	unix_release_sock(newsk, 0);
1697 out:
1698 	return err;
1699 }
1700 
1701 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1702 {
1703 	struct sock *ska = socka->sk, *skb = sockb->sk;
1704 
1705 	/* Join our sockets back to back */
1706 	sock_hold(ska);
1707 	sock_hold(skb);
1708 	unix_peer(ska) = skb;
1709 	unix_peer(skb) = ska;
1710 	init_peercred(ska);
1711 	init_peercred(skb);
1712 
1713 	ska->sk_state = TCP_ESTABLISHED;
1714 	skb->sk_state = TCP_ESTABLISHED;
1715 	socka->state  = SS_CONNECTED;
1716 	sockb->state  = SS_CONNECTED;
1717 	return 0;
1718 }
1719 
1720 static void unix_sock_inherit_flags(const struct socket *old,
1721 				    struct socket *new)
1722 {
1723 	if (test_bit(SOCK_PASSCRED, &old->flags))
1724 		set_bit(SOCK_PASSCRED, &new->flags);
1725 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1726 		set_bit(SOCK_PASSPIDFD, &new->flags);
1727 	if (test_bit(SOCK_PASSSEC, &old->flags))
1728 		set_bit(SOCK_PASSSEC, &new->flags);
1729 }
1730 
1731 static int unix_accept(struct socket *sock, struct socket *newsock,
1732 		       struct proto_accept_arg *arg)
1733 {
1734 	struct sock *sk = sock->sk;
1735 	struct sk_buff *skb;
1736 	struct sock *tsk;
1737 
1738 	arg->err = -EOPNOTSUPP;
1739 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1740 		goto out;
1741 
1742 	arg->err = -EINVAL;
1743 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1744 		goto out;
1745 
1746 	/* If the socket state is TCP_LISTEN it cannot change (for now...),
1747 	 * so no locks are necessary.
1748 	 */
1749 
1750 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1751 				&arg->err);
1752 	if (!skb) {
1753 		/* This means receive shutdown. */
1754 		if (arg->err == 0)
1755 			arg->err = -EINVAL;
1756 		goto out;
1757 	}
1758 
1759 	tsk = skb->sk;
1760 	skb_free_datagram(sk, skb);
1761 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1762 
1763 	/* attach accepted sock to socket */
1764 	unix_state_lock(tsk);
1765 	unix_update_edges(unix_sk(tsk));
1766 	newsock->state = SS_CONNECTED;
1767 	unix_sock_inherit_flags(sock, newsock);
1768 	sock_graft(tsk, newsock);
1769 	unix_state_unlock(tsk);
1770 	return 0;
1771 
1772 out:
1773 	return arg->err;
1774 }
1775 
1776 
1777 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1778 {
1779 	struct sock *sk = sock->sk;
1780 	struct unix_address *addr;
1781 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1782 	int err = 0;
1783 
1784 	if (peer) {
1785 		sk = unix_peer_get(sk);
1786 
1787 		err = -ENOTCONN;
1788 		if (!sk)
1789 			goto out;
1790 		err = 0;
1791 	} else {
1792 		sock_hold(sk);
1793 	}
1794 
1795 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1796 	if (!addr) {
1797 		sunaddr->sun_family = AF_UNIX;
1798 		sunaddr->sun_path[0] = 0;
1799 		err = offsetof(struct sockaddr_un, sun_path);
1800 	} else {
1801 		err = addr->len;
1802 		memcpy(sunaddr, addr->name, addr->len);
1803 
1804 		if (peer)
1805 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1806 					       CGROUP_UNIX_GETPEERNAME);
1807 		else
1808 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1809 					       CGROUP_UNIX_GETSOCKNAME);
1810 	}
1811 	sock_put(sk);
1812 out:
1813 	return err;
1814 }
1815 
1816 /* The "user->unix_inflight" variable is protected by the garbage
1817  * collection lock, and we just read it locklessly here. If you go
1818  * over the limit, there might be a tiny race in actually noticing
1819  * it across threads. Tough.
1820  */
1821 static inline bool too_many_unix_fds(struct task_struct *p)
1822 {
1823 	struct user_struct *user = current_user();
1824 
1825 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1826 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1827 	return false;
1828 }
1829 
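/* unix_attach_fds() moves the fd list from the scm cookie into the skb so
 * the passed files travel with the queued message; unix_detach_fds() below
 * hands them back to an scm cookie on the receiving side.
 */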
1830 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1831 {
1832 	if (too_many_unix_fds(current))
1833 		return -ETOOMANYREFS;
1834 
1835 	UNIXCB(skb).fp = scm->fp;
1836 	scm->fp = NULL;
1837 
1838 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1839 		return -ENOMEM;
1840 
1841 	return 0;
1842 }
1843 
1844 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1845 {
1846 	scm->fp = UNIXCB(skb).fp;
1847 	UNIXCB(skb).fp = NULL;
1848 
1849 	unix_destroy_fpl(scm->fp);
1850 }
1851 
1852 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1853 {
1854 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1855 }
1856 
1857 static void unix_destruct_scm(struct sk_buff *skb)
1858 {
1859 	struct scm_cookie scm;
1860 
1861 	memset(&scm, 0, sizeof(scm));
1862 	scm.pid  = UNIXCB(skb).pid;
1863 	if (UNIXCB(skb).fp)
1864 		unix_detach_fds(&scm, skb);
1865 
1866 	/* Alas, it calls into the VFS */
1867 	/* So what? fput() has been SMP-safe for a long time now */
1868 	scm_destroy(&scm);
1869 	sock_wfree(skb);
1870 }
1871 
1872 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1873 {
1874 	int err = 0;
1875 
1876 	UNIXCB(skb).pid  = get_pid(scm->pid);
1877 	UNIXCB(skb).uid = scm->creds.uid;
1878 	UNIXCB(skb).gid = scm->creds.gid;
1879 	UNIXCB(skb).fp = NULL;
1880 	unix_get_secdata(scm, skb);
1881 	if (scm->fp && send_fds)
1882 		err = unix_attach_fds(scm, skb);
1883 
1884 	skb->destructor = unix_destruct_scm;
1885 	return err;
1886 }
1887 
1888 static bool unix_passcred_enabled(const struct socket *sock,
1889 				  const struct sock *other)
1890 {
1891 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1892 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1893 	       !other->sk_socket ||
1894 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1895 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1896 }
1897 
1898 /*
1899  * Some apps rely on write() giving SCM_CREDENTIALS.
1900  * We include credentials if the source or destination socket
1901  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1902  */
1903 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1904 			    const struct sock *other)
1905 {
1906 	if (UNIXCB(skb).pid)
1907 		return;
1908 	if (unix_passcred_enabled(sock, other)) {
1909 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1910 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1911 	}
1912 }
1913 
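/* unix_skb_scm_eq() is used by the stream receive path so that data queued
 * with different credentials or security labels is never merged into a
 * single recvmsg() result ("never glue messages from different writers").
 */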
1914 static bool unix_skb_scm_eq(struct sk_buff *skb,
1915 			    struct scm_cookie *scm)
1916 {
1917 	return UNIXCB(skb).pid == scm->pid &&
1918 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1919 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1920 	       unix_secdata_eq(scm, skb);
1921 }
1922 
1923 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1924 {
1925 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1926 	struct unix_sock *u = unix_sk(sk);
1927 
1928 	if (unlikely(fp && fp->count)) {
1929 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1930 		unix_add_edges(fp, u);
1931 	}
1932 }
1933 
1934 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1935 {
1936 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1937 	struct unix_sock *u = unix_sk(sk);
1938 
1939 	if (unlikely(fp && fp->count)) {
1940 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1941 		unix_del_edges(fp);
1942 	}
1943 }
1944 
1945 /*
1946  *	Send AF_UNIX data.
1947  */
1948 
1949 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1950 			      size_t len)
1951 {
1952 	struct sock *sk = sock->sk, *other = NULL;
1953 	struct unix_sock *u = unix_sk(sk);
1954 	struct scm_cookie scm;
1955 	struct sk_buff *skb;
1956 	int data_len = 0;
1957 	int sk_locked;
1958 	long timeo;
1959 	int err;
1960 
1961 	err = scm_send(sock, msg, &scm, false);
1962 	if (err < 0)
1963 		return err;
1964 
1965 	wait_for_unix_gc(scm.fp);
1966 
1967 	if (msg->msg_flags & MSG_OOB) {
1968 		err = -EOPNOTSUPP;
1969 		goto out;
1970 	}
1971 
1972 	if (msg->msg_namelen) {
1973 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
1974 		if (err)
1975 			goto out;
1976 
1977 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1978 							    msg->msg_name,
1979 							    &msg->msg_namelen,
1980 							    NULL);
1981 		if (err)
1982 			goto out;
1983 	}
1984 
1985 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1986 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1987 	    !READ_ONCE(u->addr)) {
1988 		err = unix_autobind(sk);
1989 		if (err)
1990 			goto out;
1991 	}
1992 
1993 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
1994 		err = -EMSGSIZE;
1995 		goto out;
1996 	}
1997 
1998 	if (len > SKB_MAX_ALLOC) {
1999 		data_len = min_t(size_t,
2000 				 len - SKB_MAX_ALLOC,
2001 				 MAX_SKB_FRAGS * PAGE_SIZE);
2002 		data_len = PAGE_ALIGN(data_len);
2003 
2004 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2005 	}
2006 
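	/* The first (len - data_len) bytes go into the skb's linear head;
	 * the remainder, if any, is carried in page fragments.
	 */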
2007 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2008 				   msg->msg_flags & MSG_DONTWAIT, &err,
2009 				   PAGE_ALLOC_COSTLY_ORDER);
2010 	if (!skb)
2011 		goto out;
2012 
2013 	err = unix_scm_to_skb(&scm, skb, true);
2014 	if (err < 0)
2015 		goto out_free;
2016 
2017 	skb_put(skb, len - data_len);
2018 	skb->data_len = data_len;
2019 	skb->len = len;
2020 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2021 	if (err)
2022 		goto out_free;
2023 
2024 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2025 
2026 	if (msg->msg_namelen) {
2027 lookup:
2028 		other = unix_find_other(sock_net(sk), msg->msg_name,
2029 					msg->msg_namelen, sk->sk_type);
2030 		if (IS_ERR(other)) {
2031 			err = PTR_ERR(other);
2032 			goto out_free;
2033 		}
2034 	} else {
2035 		other = unix_peer_get(sk);
2036 		if (!other) {
2037 			err = -ENOTCONN;
2038 			goto out_free;
2039 		}
2040 	}
2041 
2042 	if (sk_filter(other, skb) < 0) {
2043 		/* Toss the packet but do not return any error to the sender */
2044 		err = len;
2045 		goto out_sock_put;
2046 	}
2047 
2048 restart:
2049 	sk_locked = 0;
2050 	unix_state_lock(other);
2051 restart_locked:
2052 
2053 	if (!unix_may_send(sk, other)) {
2054 		err = -EPERM;
2055 		goto out_unlock;
2056 	}
2057 
2058 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2059 		/* Check with POSIX 1003.1g - what should a datagram error return? */
2060 
2061 		unix_state_unlock(other);
2062 
2063 		if (sk->sk_type == SOCK_SEQPACKET) {
2064 			/* We get here only when racing with unix_release_sock(),
2065 			 * which is clearing @other. Unlike the SOCK_DGRAM case,
2066 			 * never change the state to TCP_CLOSE.
2067 			 */
2068 			err = -EPIPE;
2069 			goto out_sock_put;
2070 		}
2071 
2072 		if (!sk_locked)
2073 			unix_state_lock(sk);
2074 
2075 		if (unix_peer(sk) == other) {
2076 			unix_peer(sk) = NULL;
2077 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2078 
2079 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2080 			unix_state_unlock(sk);
2081 
2082 			unix_dgram_disconnected(sk, other);
2083 			sock_put(other);
2084 			err = -ECONNREFUSED;
2085 			goto out_sock_put;
2086 		}
2087 
2088 		unix_state_unlock(sk);
2089 
2090 		if (!msg->msg_namelen) {
2091 			err = -ECONNRESET;
2092 			goto out_sock_put;
2093 		}
2094 
2095 		sock_put(other);
2096 		goto lookup;
2097 	}
2098 
2099 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2100 		err = -EPIPE;
2101 		goto out_unlock;
2102 	}
2103 
2104 	if (sk->sk_type != SOCK_SEQPACKET) {
2105 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2106 		if (err)
2107 			goto out_unlock;
2108 	}
2109 
2110 	/* other == sk && unix_peer(other) != sk if
2111 	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2112 	 * - unix_peer(sk) == sk at the time of the get, but it was disconnected before the lock
2113 	 */
2114 	if (other != sk &&
2115 	    unlikely(unix_peer(other) != sk &&
2116 	    unix_recvq_full_lockless(other))) {
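		/* The peer's receive queue is full.  Either block in
		 * unix_wait_for_peer() and retry, or, for a connected
		 * non-blocking sender, hook into the peer's wait queue via
		 * unix_dgram_peer_wake_me() so a later poll() is woken when
		 * the receiver drains its queue, and fail with -EAGAIN.
		 */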
2117 		if (timeo) {
2118 			timeo = unix_wait_for_peer(other, timeo);
2119 
2120 			err = sock_intr_errno(timeo);
2121 			if (signal_pending(current))
2122 				goto out_sock_put;
2123 
2124 			goto restart;
2125 		}
2126 
2127 		if (!sk_locked) {
2128 			unix_state_unlock(other);
2129 			unix_state_double_lock(sk, other);
2130 		}
2131 
2132 		if (unix_peer(sk) != other ||
2133 		    unix_dgram_peer_wake_me(sk, other)) {
2134 			err = -EAGAIN;
2135 			sk_locked = 1;
2136 			goto out_unlock;
2137 		}
2138 
2139 		if (!sk_locked) {
2140 			sk_locked = 1;
2141 			goto restart_locked;
2142 		}
2143 	}
2144 
2145 	if (unlikely(sk_locked))
2146 		unix_state_unlock(sk);
2147 
2148 	if (sock_flag(other, SOCK_RCVTSTAMP))
2149 		__net_timestamp(skb);
2150 	maybe_add_creds(skb, sock, other);
2151 	scm_stat_add(other, skb);
2152 	skb_queue_tail(&other->sk_receive_queue, skb);
2153 	unix_state_unlock(other);
2154 	other->sk_data_ready(other);
2155 	sock_put(other);
2156 	scm_destroy(&scm);
2157 	return len;
2158 
2159 out_unlock:
2160 	if (sk_locked)
2161 		unix_state_unlock(sk);
2162 	unix_state_unlock(other);
2163 out_sock_put:
2164 	sock_put(other);
2165 out_free:
2166 	consume_skb(skb);
2167 out:
2168 	scm_destroy(&scm);
2169 	return err;
2170 }
2171 
2172 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2173  * bytes, with a minimum of a full page.
2174  */
2175 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
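/* For example, with 4 KiB pages get_order(32768) is 3, so UNIX_SKB_FRAGS_SZ
 * is 32 KiB; with 64 KiB pages it rounds up to a single full page.
 */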
2176 
2177 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2178 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2179 		     struct scm_cookie *scm, bool fds_sent)
2180 {
2181 	struct unix_sock *ousk = unix_sk(other);
2182 	struct sk_buff *skb;
2183 	int err;
2184 
2185 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2186 
2187 	if (!skb)
2188 		return err;
2189 
2190 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2191 	if (err < 0)
2192 		goto out;
2193 
2194 	skb_put(skb, 1);
2195 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2196 
2197 	if (err)
2198 		goto out;
2199 
2200 	unix_state_lock(other);
2201 
2202 	if (sock_flag(other, SOCK_DEAD) ||
2203 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2204 		unix_state_unlock(other);
2205 		err = -EPIPE;
2206 		goto out;
2207 	}
2208 
2209 	maybe_add_creds(skb, sock, other);
2210 	scm_stat_add(other, skb);
2211 
2212 	spin_lock(&other->sk_receive_queue.lock);
2213 	WRITE_ONCE(ousk->oob_skb, skb);
2214 	__skb_queue_tail(&other->sk_receive_queue, skb);
2215 	spin_unlock(&other->sk_receive_queue.lock);
2216 
2217 	sk_send_sigurg(other);
2218 	unix_state_unlock(other);
2219 	other->sk_data_ready(other);
2220 
2221 	return 0;
2222 out:
2223 	consume_skb(skb);
2224 	return err;
2225 }
2226 #endif
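/* MSG_OOB on a stream socket: the final byte of the send is split off and
 * queued by queue_oob() as its own one-byte skb, recorded in oob_skb so the
 * receive side can locate the "urgent" mark.
 */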
2227 
2228 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2229 			       size_t len)
2230 {
2231 	struct sock *sk = sock->sk;
2232 	struct sk_buff *skb = NULL;
2233 	struct sock *other = NULL;
2234 	struct scm_cookie scm;
2235 	bool fds_sent = false;
2236 	int err, sent = 0;
2237 
2238 	err = scm_send(sock, msg, &scm, false);
2239 	if (err < 0)
2240 		return err;
2241 
2242 	wait_for_unix_gc(scm.fp);
2243 
2244 	if (msg->msg_flags & MSG_OOB) {
2245 		err = -EOPNOTSUPP;
2246 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2247 		if (len)
2248 			len--;
2249 		else
2250 #endif
2251 			goto out_err;
2252 	}
2253 
2254 	if (msg->msg_namelen) {
2255 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2256 		goto out_err;
2257 	} else {
2258 		other = unix_peer(sk);
2259 		if (!other) {
2260 			err = -ENOTCONN;
2261 			goto out_err;
2262 		}
2263 	}
2264 
2265 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2266 		goto out_pipe;
2267 
2268 	while (sent < len) {
2269 		int size = len - sent;
2270 		int data_len;
2271 
2272 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2273 			skb = sock_alloc_send_pskb(sk, 0, 0,
2274 						   msg->msg_flags & MSG_DONTWAIT,
2275 						   &err, 0);
2276 		} else {
2277 			/* Keep two messages in the pipe so it schedules better */
2278 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2279 
2280 			/* allow fallback to order-0 allocations */
2281 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2282 
2283 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2284 
2285 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2286 
2287 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2288 						   msg->msg_flags & MSG_DONTWAIT, &err,
2289 						   get_order(UNIX_SKB_FRAGS_SZ));
2290 		}
2291 		if (!skb)
2292 			goto out_err;
2293 
2294 		/* Only send the fds in the first buffer */
2295 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2296 		if (err < 0)
2297 			goto out_free;
2298 
2299 		fds_sent = true;
2300 
2301 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2302 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2303 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2304 						   sk->sk_allocation);
2305 			if (err < 0)
2306 				goto out_free;
2307 
2308 			size = err;
2309 			refcount_add(size, &sk->sk_wmem_alloc);
2310 		} else {
2311 			skb_put(skb, size - data_len);
2312 			skb->data_len = data_len;
2313 			skb->len = size;
2314 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2315 			if (err)
2316 				goto out_free;
2317 		}
2318 
2319 		unix_state_lock(other);
2320 
2321 		if (sock_flag(other, SOCK_DEAD) ||
2322 		    (other->sk_shutdown & RCV_SHUTDOWN))
2323 			goto out_pipe_unlock;
2324 
2325 		maybe_add_creds(skb, sock, other);
2326 		scm_stat_add(other, skb);
2327 		skb_queue_tail(&other->sk_receive_queue, skb);
2328 		unix_state_unlock(other);
2329 		other->sk_data_ready(other);
2330 		sent += size;
2331 	}
2332 
2333 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2334 	if (msg->msg_flags & MSG_OOB) {
2335 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2336 		if (err)
2337 			goto out_err;
2338 		sent++;
2339 	}
2340 #endif
2341 
2342 	scm_destroy(&scm);
2343 
2344 	return sent;
2345 
2346 out_pipe_unlock:
2347 	unix_state_unlock(other);
2348 out_pipe:
2349 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2350 		send_sig(SIGPIPE, current, 0);
2351 	err = -EPIPE;
2352 out_free:
2353 	consume_skb(skb);
2354 out_err:
2355 	scm_destroy(&scm);
2356 	return sent ? : err;
2357 }
2358 
2359 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2360 				  size_t len)
2361 {
2362 	int err;
2363 	struct sock *sk = sock->sk;
2364 
2365 	err = sock_error(sk);
2366 	if (err)
2367 		return err;
2368 
2369 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2370 		return -ENOTCONN;
2371 
2372 	if (msg->msg_namelen)
2373 		msg->msg_namelen = 0;
2374 
2375 	return unix_dgram_sendmsg(sock, msg, len);
2376 }
2377 
2378 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2379 				  size_t size, int flags)
2380 {
2381 	struct sock *sk = sock->sk;
2382 
2383 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2384 		return -ENOTCONN;
2385 
2386 	return unix_dgram_recvmsg(sock, msg, size, flags);
2387 }
2388 
2389 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2390 {
2391 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2392 
2393 	if (addr) {
2394 		msg->msg_namelen = addr->len;
2395 		memcpy(msg->msg_name, addr->name, addr->len);
2396 	}
2397 }
2398 
2399 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2400 			 int flags)
2401 {
2402 	struct scm_cookie scm;
2403 	struct socket *sock = sk->sk_socket;
2404 	struct unix_sock *u = unix_sk(sk);
2405 	struct sk_buff *skb, *last;
2406 	long timeo;
2407 	int skip;
2408 	int err;
2409 
2410 	err = -EOPNOTSUPP;
2411 	if (flags & MSG_OOB)
2412 		goto out;
2413 
2414 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2415 
2416 	do {
2417 		mutex_lock(&u->iolock);
2418 
2419 		skip = sk_peek_offset(sk, flags);
2420 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2421 					      &skip, &err, &last);
2422 		if (skb) {
2423 			if (!(flags & MSG_PEEK))
2424 				scm_stat_del(sk, skb);
2425 			break;
2426 		}
2427 
2428 		mutex_unlock(&u->iolock);
2429 
2430 		if (err != -EAGAIN)
2431 			break;
2432 	} while (timeo &&
2433 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2434 					      &err, &timeo, last));
2435 
2436 	if (!skb) { /* implies iolock unlocked */
2437 		unix_state_lock(sk);
2438 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2439 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2440 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2441 			err = 0;
2442 		unix_state_unlock(sk);
2443 		goto out;
2444 	}
2445 
2446 	if (wq_has_sleeper(&u->peer_wait))
2447 		wake_up_interruptible_sync_poll(&u->peer_wait,
2448 						EPOLLOUT | EPOLLWRNORM |
2449 						EPOLLWRBAND);
2450 
2451 	if (msg->msg_name) {
2452 		unix_copy_addr(msg, skb->sk);
2453 
2454 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2455 						      msg->msg_name,
2456 						      &msg->msg_namelen);
2457 	}
2458 
2459 	if (size > skb->len - skip)
2460 		size = skb->len - skip;
2461 	else if (size < skb->len - skip)
2462 		msg->msg_flags |= MSG_TRUNC;
2463 
2464 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2465 	if (err)
2466 		goto out_free;
2467 
2468 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2469 		__sock_recv_timestamp(msg, sk, skb);
2470 
2471 	memset(&scm, 0, sizeof(scm));
2472 
2473 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2474 	unix_set_secdata(&scm, skb);
2475 
2476 	if (!(flags & MSG_PEEK)) {
2477 		if (UNIXCB(skb).fp)
2478 			unix_detach_fds(&scm, skb);
2479 
2480 		sk_peek_offset_bwd(sk, skb->len);
2481 	} else {
2482 		/* It is questionable: on PEEK we could:
2483 		   - not return fds - good, but too simple 8)
2484 		   - return fds, and do not return them on read (old strategy,
2485 		     apparently wrong)
2486 		   - clone fds (I chose it for now, it is the most universal
2487 		     solution)
2488 
2489 		   POSIX 1003.1g does not actually define this clearly
2490 		   at all. POSIX 1003.1g doesn't define a lot of things
2491 		   clearly however!
2492 
2493 		*/
2494 
2495 		sk_peek_offset_fwd(sk, size);
2496 
2497 		if (UNIXCB(skb).fp)
2498 			unix_peek_fds(&scm, skb);
2499 	}
2500 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2501 
2502 	scm_recv_unix(sock, msg, &scm, flags);
2503 
2504 out_free:
2505 	skb_free_datagram(sk, skb);
2506 	mutex_unlock(&u->iolock);
2507 out:
2508 	return err;
2509 }
2510 
2511 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2512 			      int flags)
2513 {
2514 	struct sock *sk = sock->sk;
2515 
2516 #ifdef CONFIG_BPF_SYSCALL
2517 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2518 
2519 	if (prot != &unix_dgram_proto)
2520 		return prot->recvmsg(sk, msg, size, flags, NULL);
2521 #endif
2522 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2523 }
2524 
2525 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2526 {
2527 	struct unix_sock *u = unix_sk(sk);
2528 	struct sk_buff *skb;
2529 	int err;
2530 
2531 	mutex_lock(&u->iolock);
2532 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2533 	mutex_unlock(&u->iolock);
2534 	if (!skb)
2535 		return err;
2536 
2537 	return recv_actor(sk, skb);
2538 }
2539 
2540 /*
2541  *	Sleep until more data has arrived. But check for races.
2542  */
2543 static long unix_stream_data_wait(struct sock *sk, long timeo,
2544 				  struct sk_buff *last, unsigned int last_len,
2545 				  bool freezable)
2546 {
2547 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2548 	struct sk_buff *tail;
2549 	DEFINE_WAIT(wait);
2550 
2551 	unix_state_lock(sk);
2552 
2553 	for (;;) {
2554 		prepare_to_wait(sk_sleep(sk), &wait, state);
2555 
2556 		tail = skb_peek_tail(&sk->sk_receive_queue);
2557 		if (tail != last ||
2558 		    (tail && tail->len != last_len) ||
2559 		    sk->sk_err ||
2560 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2561 		    signal_pending(current) ||
2562 		    !timeo)
2563 			break;
2564 
2565 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2566 		unix_state_unlock(sk);
2567 		timeo = schedule_timeout(timeo);
2568 		unix_state_lock(sk);
2569 
2570 		if (sock_flag(sk, SOCK_DEAD))
2571 			break;
2572 
2573 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2574 	}
2575 
2576 	finish_wait(sk_sleep(sk), &wait);
2577 	unix_state_unlock(sk);
2578 	return timeo;
2579 }
2580 
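/* UNIXCB(skb).consumed counts the bytes of this skb that a stream reader has
 * already copied out, so a partially read skb can stay on the queue.
 */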
2581 static unsigned int unix_skb_len(const struct sk_buff *skb)
2582 {
2583 	return skb->len - UNIXCB(skb).consumed;
2584 }
2585 
2586 struct unix_stream_read_state {
2587 	int (*recv_actor)(struct sk_buff *, int, int,
2588 			  struct unix_stream_read_state *);
2589 	struct socket *socket;
2590 	struct msghdr *msg;
2591 	struct pipe_inode_info *pipe;
2592 	size_t size;
2593 	int flags;
2594 	unsigned int splice_flags;
2595 };
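/* unix_stream_read_generic() drives both recvmsg() and splice_read(); the
 * recv_actor callback either copies into state->msg (unix_stream_read_actor)
 * or feeds state->pipe (unix_stream_splice_actor).
 */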
2596 
2597 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2598 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2599 {
2600 	struct socket *sock = state->socket;
2601 	struct sock *sk = sock->sk;
2602 	struct unix_sock *u = unix_sk(sk);
2603 	int chunk = 1;
2604 	struct sk_buff *oob_skb;
2605 
2606 	mutex_lock(&u->iolock);
2607 	unix_state_lock(sk);
2608 	spin_lock(&sk->sk_receive_queue.lock);
2609 
2610 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2611 		spin_unlock(&sk->sk_receive_queue.lock);
2612 		unix_state_unlock(sk);
2613 		mutex_unlock(&u->iolock);
2614 		return -EINVAL;
2615 	}
2616 
2617 	oob_skb = u->oob_skb;
2618 
2619 	if (!(state->flags & MSG_PEEK))
2620 		WRITE_ONCE(u->oob_skb, NULL);
2621 
2622 	spin_unlock(&sk->sk_receive_queue.lock);
2623 	unix_state_unlock(sk);
2624 
2625 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2626 
2627 	if (!(state->flags & MSG_PEEK))
2628 		UNIXCB(oob_skb).consumed += 1;
2629 
2630 	mutex_unlock(&u->iolock);
2631 
2632 	if (chunk < 0)
2633 		return -EFAULT;
2634 
2635 	state->msg->msg_flags |= MSG_OOB;
2636 	return 1;
2637 }
2638 
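/* Decide what a stream reader may consume next when the head of the receive
 * queue is, or sits just before, the out-of-band skb: the OOB byte is only
 * handed out inline under SOCK_URGINLINE, is never appended to data already
 * copied in this call, and when it is skipped by a non-peeking read it is
 * unlinked and dropped so it is not later delivered as ordinary data.
 */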
2639 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2640 				  int flags, int copied)
2641 {
2642 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2643 	struct unix_sock *u = unix_sk(sk);
2644 
2645 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2646 		return skb;
2647 
2648 	spin_lock(&sk->sk_receive_queue.lock);
2649 
2650 	if (!unix_skb_len(skb)) {
2651 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2652 			skb = NULL;
2653 		} else if (flags & MSG_PEEK) {
2654 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2655 		} else {
2656 			read_skb = skb;
2657 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2658 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2659 		}
2660 
2661 		if (!skb)
2662 			goto unlock;
2663 	}
2664 
2665 	if (skb != u->oob_skb)
2666 		goto unlock;
2667 
2668 	if (copied) {
2669 		skb = NULL;
2670 	} else if (!(flags & MSG_PEEK)) {
2671 		WRITE_ONCE(u->oob_skb, NULL);
2672 
2673 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2674 			__skb_unlink(skb, &sk->sk_receive_queue);
2675 			unread_skb = skb;
2676 			skb = skb_peek(&sk->sk_receive_queue);
2677 		}
2678 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2679 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2680 	}
2681 
2682 unlock:
2683 	spin_unlock(&sk->sk_receive_queue.lock);
2684 
2685 	consume_skb(read_skb);
2686 	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2687 
2688 	return skb;
2689 }
2690 #endif
2691 
2692 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2693 {
2694 	struct unix_sock *u = unix_sk(sk);
2695 	struct sk_buff *skb;
2696 	int err;
2697 
2698 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2699 		return -ENOTCONN;
2700 
2701 	mutex_lock(&u->iolock);
2702 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2703 	mutex_unlock(&u->iolock);
2704 	if (!skb)
2705 		return err;
2706 
2707 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2708 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2709 		bool drop = false;
2710 
2711 		unix_state_lock(sk);
2712 
2713 		if (sock_flag(sk, SOCK_DEAD)) {
2714 			unix_state_unlock(sk);
2715 			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2716 			return -ECONNRESET;
2717 		}
2718 
2719 		spin_lock(&sk->sk_receive_queue.lock);
2720 		if (likely(skb == u->oob_skb)) {
2721 			WRITE_ONCE(u->oob_skb, NULL);
2722 			drop = true;
2723 		}
2724 		spin_unlock(&sk->sk_receive_queue.lock);
2725 
2726 		unix_state_unlock(sk);
2727 
2728 		if (drop) {
2729 			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2730 			return -EAGAIN;
2731 		}
2732 	}
2733 #endif
2734 
2735 	return recv_actor(sk, skb);
2736 }
2737 
2738 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2739 				    bool freezable)
2740 {
2741 	struct scm_cookie scm;
2742 	struct socket *sock = state->socket;
2743 	struct sock *sk = sock->sk;
2744 	struct unix_sock *u = unix_sk(sk);
2745 	int copied = 0;
2746 	int flags = state->flags;
2747 	int noblock = flags & MSG_DONTWAIT;
2748 	bool check_creds = false;
2749 	int target;
2750 	int err = 0;
2751 	long timeo;
2752 	int skip;
2753 	size_t size = state->size;
2754 	unsigned int last_len;
2755 
2756 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2757 		err = -EINVAL;
2758 		goto out;
2759 	}
2760 
2761 	if (unlikely(flags & MSG_OOB)) {
2762 		err = -EOPNOTSUPP;
2763 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2764 		err = unix_stream_recv_urg(state);
2765 #endif
2766 		goto out;
2767 	}
2768 
2769 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2770 	timeo = sock_rcvtimeo(sk, noblock);
2771 
2772 	memset(&scm, 0, sizeof(scm));
2773 
2774 	/* Lock the socket to prevent queue disordering
2775 	 * while we sleep copying data out to the message
2776 	 */
2777 	mutex_lock(&u->iolock);
2778 
2779 	skip = max(sk_peek_offset(sk, flags), 0);
2780 
2781 	do {
2782 		struct sk_buff *skb, *last;
2783 		int chunk;
2784 
2785 redo:
2786 		unix_state_lock(sk);
2787 		if (sock_flag(sk, SOCK_DEAD)) {
2788 			err = -ECONNRESET;
2789 			goto unlock;
2790 		}
2791 		last = skb = skb_peek(&sk->sk_receive_queue);
2792 		last_len = last ? last->len : 0;
2793 
2794 again:
2795 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2796 		if (skb) {
2797 			skb = manage_oob(skb, sk, flags, copied);
2798 			if (!skb && copied) {
2799 				unix_state_unlock(sk);
2800 				break;
2801 			}
2802 		}
2803 #endif
2804 		if (skb == NULL) {
2805 			if (copied >= target)
2806 				goto unlock;
2807 
2808 			/*
2809 			 *	POSIX 1003.1g mandates this order.
2810 			 */
2811 
2812 			err = sock_error(sk);
2813 			if (err)
2814 				goto unlock;
2815 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2816 				goto unlock;
2817 
2818 			unix_state_unlock(sk);
2819 			if (!timeo) {
2820 				err = -EAGAIN;
2821 				break;
2822 			}
2823 
2824 			mutex_unlock(&u->iolock);
2825 
2826 			timeo = unix_stream_data_wait(sk, timeo, last,
2827 						      last_len, freezable);
2828 
2829 			if (signal_pending(current)) {
2830 				err = sock_intr_errno(timeo);
2831 				scm_destroy(&scm);
2832 				goto out;
2833 			}
2834 
2835 			mutex_lock(&u->iolock);
2836 			goto redo;
2837 unlock:
2838 			unix_state_unlock(sk);
2839 			break;
2840 		}
2841 
2842 		while (skip >= unix_skb_len(skb)) {
2843 			skip -= unix_skb_len(skb);
2844 			last = skb;
2845 			last_len = skb->len;
2846 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2847 			if (!skb)
2848 				goto again;
2849 		}
2850 
2851 		unix_state_unlock(sk);
2852 
2853 		if (check_creds) {
2854 			/* Never glue messages from different writers */
2855 			if (!unix_skb_scm_eq(skb, &scm))
2856 				break;
2857 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2858 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2859 			/* Copy credentials */
2860 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2861 			unix_set_secdata(&scm, skb);
2862 			check_creds = true;
2863 		}
2864 
2865 		/* Copy address just once */
2866 		if (state->msg && state->msg->msg_name) {
2867 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2868 					 state->msg->msg_name);
2869 			unix_copy_addr(state->msg, skb->sk);
2870 
2871 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2872 							      state->msg->msg_name,
2873 							      &state->msg->msg_namelen);
2874 
2875 			sunaddr = NULL;
2876 		}
2877 
2878 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2879 		chunk = state->recv_actor(skb, skip, chunk, state);
2880 		if (chunk < 0) {
2881 			if (copied == 0)
2882 				copied = -EFAULT;
2883 			break;
2884 		}
2885 		copied += chunk;
2886 		size -= chunk;
2887 
2888 		/* Mark read part of skb as used */
2889 		if (!(flags & MSG_PEEK)) {
2890 			UNIXCB(skb).consumed += chunk;
2891 
2892 			sk_peek_offset_bwd(sk, chunk);
2893 
2894 			if (UNIXCB(skb).fp) {
2895 				scm_stat_del(sk, skb);
2896 				unix_detach_fds(&scm, skb);
2897 			}
2898 
2899 			if (unix_skb_len(skb))
2900 				break;
2901 
2902 			skb_unlink(skb, &sk->sk_receive_queue);
2903 			consume_skb(skb);
2904 
2905 			if (scm.fp)
2906 				break;
2907 		} else {
2908 			/* It is questionable, see note in unix_dgram_recvmsg.
2909 			 */
2910 			if (UNIXCB(skb).fp)
2911 				unix_peek_fds(&scm, skb);
2912 
2913 			sk_peek_offset_fwd(sk, chunk);
2914 
2915 			if (UNIXCB(skb).fp)
2916 				break;
2917 
2918 			skip = 0;
2919 			last = skb;
2920 			last_len = skb->len;
2921 			unix_state_lock(sk);
2922 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2923 			if (skb)
2924 				goto again;
2925 			unix_state_unlock(sk);
2926 			break;
2927 		}
2928 	} while (size);
2929 
2930 	mutex_unlock(&u->iolock);
2931 	if (state->msg)
2932 		scm_recv_unix(sock, state->msg, &scm, flags);
2933 	else
2934 		scm_destroy(&scm);
2935 out:
2936 	return copied ? : err;
2937 }
2938 
2939 static int unix_stream_read_actor(struct sk_buff *skb,
2940 				  int skip, int chunk,
2941 				  struct unix_stream_read_state *state)
2942 {
2943 	int ret;
2944 
2945 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2946 				    state->msg, chunk);
2947 	return ret ?: chunk;
2948 }
2949 
2950 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2951 			  size_t size, int flags)
2952 {
2953 	struct unix_stream_read_state state = {
2954 		.recv_actor = unix_stream_read_actor,
2955 		.socket = sk->sk_socket,
2956 		.msg = msg,
2957 		.size = size,
2958 		.flags = flags
2959 	};
2960 
2961 	return unix_stream_read_generic(&state, true);
2962 }
2963 
2964 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2965 			       size_t size, int flags)
2966 {
2967 	struct unix_stream_read_state state = {
2968 		.recv_actor = unix_stream_read_actor,
2969 		.socket = sock,
2970 		.msg = msg,
2971 		.size = size,
2972 		.flags = flags
2973 	};
2974 
2975 #ifdef CONFIG_BPF_SYSCALL
2976 	struct sock *sk = sock->sk;
2977 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2978 
2979 	if (prot != &unix_stream_proto)
2980 		return prot->recvmsg(sk, msg, size, flags, NULL);
2981 #endif
2982 	return unix_stream_read_generic(&state, true);
2983 }
2984 
2985 static int unix_stream_splice_actor(struct sk_buff *skb,
2986 				    int skip, int chunk,
2987 				    struct unix_stream_read_state *state)
2988 {
2989 	return skb_splice_bits(skb, state->socket->sk,
2990 			       UNIXCB(skb).consumed + skip,
2991 			       state->pipe, chunk, state->splice_flags);
2992 }
2993 
2994 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2995 				       struct pipe_inode_info *pipe,
2996 				       size_t size, unsigned int flags)
2997 {
2998 	struct unix_stream_read_state state = {
2999 		.recv_actor = unix_stream_splice_actor,
3000 		.socket = sock,
3001 		.pipe = pipe,
3002 		.size = size,
3003 		.splice_flags = flags,
3004 	};
3005 
3006 	if (unlikely(*ppos))
3007 		return -ESPIPE;
3008 
3009 	if (sock->file->f_flags & O_NONBLOCK ||
3010 	    flags & SPLICE_F_NONBLOCK)
3011 		state.flags = MSG_DONTWAIT;
3012 
3013 	return unix_stream_read_generic(&state, false);
3014 }
3015 
3016 static int unix_shutdown(struct socket *sock, int mode)
3017 {
3018 	struct sock *sk = sock->sk;
3019 	struct sock *other;
3020 
3021 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3022 		return -EINVAL;
3023 	/* This maps:
3024 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3025 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3026 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3027 	 */
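	/* e.g. shutdown(fd, SHUT_WR): mode 1 becomes SEND_SHUTDOWN (2). */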
3028 	++mode;
3029 
3030 	unix_state_lock(sk);
3031 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3032 	other = unix_peer(sk);
3033 	if (other)
3034 		sock_hold(other);
3035 	unix_state_unlock(sk);
3036 	sk->sk_state_change(sk);
3037 
3038 	if (other &&
3039 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3040 
3041 		int peer_mode = 0;
3042 		const struct proto *prot = READ_ONCE(other->sk_prot);
3043 
3044 		if (prot->unhash)
3045 			prot->unhash(other);
3046 		if (mode & RCV_SHUTDOWN)
3047 			peer_mode |= SEND_SHUTDOWN;
3048 		if (mode & SEND_SHUTDOWN)
3049 			peer_mode |= RCV_SHUTDOWN;
3050 		unix_state_lock(other);
3051 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3052 		unix_state_unlock(other);
3053 		other->sk_state_change(other);
3054 		if (peer_mode == SHUTDOWN_MASK)
3055 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3056 		else if (peer_mode & RCV_SHUTDOWN)
3057 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3058 	}
3059 	if (other)
3060 		sock_put(other);
3061 
3062 	return 0;
3063 }
3064 
3065 long unix_inq_len(struct sock *sk)
3066 {
3067 	struct sk_buff *skb;
3068 	long amount = 0;
3069 
3070 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3071 		return -EINVAL;
3072 
3073 	spin_lock(&sk->sk_receive_queue.lock);
3074 	if (sk->sk_type == SOCK_STREAM ||
3075 	    sk->sk_type == SOCK_SEQPACKET) {
3076 		skb_queue_walk(&sk->sk_receive_queue, skb)
3077 			amount += unix_skb_len(skb);
3078 	} else {
3079 		skb = skb_peek(&sk->sk_receive_queue);
3080 		if (skb)
3081 			amount = skb->len;
3082 	}
3083 	spin_unlock(&sk->sk_receive_queue.lock);
3084 
3085 	return amount;
3086 }
3087 EXPORT_SYMBOL_GPL(unix_inq_len);
3088 
3089 long unix_outq_len(struct sock *sk)
3090 {
3091 	return sk_wmem_alloc_get(sk);
3092 }
3093 EXPORT_SYMBOL_GPL(unix_outq_len);
3094 
3095 static int unix_open_file(struct sock *sk)
3096 {
3097 	struct path path;
3098 	struct file *f;
3099 	int fd;
3100 
3101 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3102 		return -EPERM;
3103 
3104 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3105 		return -ENOENT;
3106 
3107 	path = unix_sk(sk)->path;
3108 	if (!path.dentry)
3109 		return -ENOENT;
3110 
3111 	path_get(&path);
3112 
3113 	fd = get_unused_fd_flags(O_CLOEXEC);
3114 	if (fd < 0)
3115 		goto out;
3116 
3117 	f = dentry_open(&path, O_PATH, current_cred());
3118 	if (IS_ERR(f)) {
3119 		put_unused_fd(fd);
3120 		fd = PTR_ERR(f);
3121 		goto out;
3122 	}
3123 
3124 	fd_install(fd, f);
3125 out:
3126 	path_put(&path);
3127 
3128 	return fd;
3129 }
3130 
3131 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3132 {
3133 	struct sock *sk = sock->sk;
3134 	long amount = 0;
3135 	int err;
3136 
3137 	switch (cmd) {
3138 	case SIOCOUTQ:
3139 		amount = unix_outq_len(sk);
3140 		err = put_user(amount, (int __user *)arg);
3141 		break;
3142 	case SIOCINQ:
3143 		amount = unix_inq_len(sk);
3144 		if (amount < 0)
3145 			err = amount;
3146 		else
3147 			err = put_user(amount, (int __user *)arg);
3148 		break;
3149 	case SIOCUNIXFILE:
3150 		err = unix_open_file(sk);
3151 		break;
3152 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3153 	case SIOCATMARK:
3154 		{
3155 			struct unix_sock *u = unix_sk(sk);
3156 			struct sk_buff *skb;
3157 			int answ = 0;
3158 
3159 			mutex_lock(&u->iolock);
3160 
3161 			skb = skb_peek(&sk->sk_receive_queue);
3162 			if (skb) {
3163 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3164 				struct sk_buff *next_skb;
3165 
3166 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3167 
3168 				if (skb == oob_skb ||
3169 				    (!unix_skb_len(skb) &&
3170 				     (!oob_skb || next_skb == oob_skb)))
3171 					answ = 1;
3172 			}
3173 
3174 			mutex_unlock(&u->iolock);
3175 
3176 			err = put_user(answ, (int __user *)arg);
3177 		}
3178 		break;
3179 #endif
3180 	default:
3181 		err = -ENOIOCTLCMD;
3182 		break;
3183 	}
3184 	return err;
3185 }
3186 
3187 #ifdef CONFIG_COMPAT
3188 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3189 {
3190 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3191 }
3192 #endif
3193 
3194 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3195 {
3196 	struct sock *sk = sock->sk;
3197 	unsigned char state;
3198 	__poll_t mask;
3199 	u8 shutdown;
3200 
3201 	sock_poll_wait(file, sock, wait);
3202 	mask = 0;
3203 	shutdown = READ_ONCE(sk->sk_shutdown);
3204 	state = READ_ONCE(sk->sk_state);
3205 
3206 	/* exceptional events? */
3207 	if (READ_ONCE(sk->sk_err))
3208 		mask |= EPOLLERR;
3209 	if (shutdown == SHUTDOWN_MASK)
3210 		mask |= EPOLLHUP;
3211 	if (shutdown & RCV_SHUTDOWN)
3212 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3213 
3214 	/* readable? */
3215 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3216 		mask |= EPOLLIN | EPOLLRDNORM;
3217 	if (sk_is_readable(sk))
3218 		mask |= EPOLLIN | EPOLLRDNORM;
3219 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3220 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3221 		mask |= EPOLLPRI;
3222 #endif
3223 
3224 	/* Connection-based need to check for termination and startup */
3225 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3226 	    state == TCP_CLOSE)
3227 		mask |= EPOLLHUP;
3228 
3229 	/*
3230 	 * We also report the socket as writable when the other side has shut
3231 	 * down the connection. This prevents stuck sockets.
3232 	 */
3233 	if (unix_writable(sk, state))
3234 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3235 
3236 	return mask;
3237 }
3238 
3239 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3240 				    poll_table *wait)
3241 {
3242 	struct sock *sk = sock->sk, *other;
3243 	unsigned int writable;
3244 	unsigned char state;
3245 	__poll_t mask;
3246 	u8 shutdown;
3247 
3248 	sock_poll_wait(file, sock, wait);
3249 	mask = 0;
3250 	shutdown = READ_ONCE(sk->sk_shutdown);
3251 	state = READ_ONCE(sk->sk_state);
3252 
3253 	/* exceptional events? */
3254 	if (READ_ONCE(sk->sk_err) ||
3255 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3256 		mask |= EPOLLERR |
3257 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3258 
3259 	if (shutdown & RCV_SHUTDOWN)
3260 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3261 	if (shutdown == SHUTDOWN_MASK)
3262 		mask |= EPOLLHUP;
3263 
3264 	/* readable? */
3265 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3266 		mask |= EPOLLIN | EPOLLRDNORM;
3267 	if (sk_is_readable(sk))
3268 		mask |= EPOLLIN | EPOLLRDNORM;
3269 
3270 	/* Connection-based need to check for termination and startup */
3271 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3272 		mask |= EPOLLHUP;
3273 
3274 	/* No write status requested, avoid expensive OUT tests. */
3275 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3276 		return mask;
3277 
3278 	writable = unix_writable(sk, state);
3279 	if (writable) {
3280 		unix_state_lock(sk);
3281 
3282 		other = unix_peer(sk);
3283 		if (other && unix_peer(other) != sk &&
3284 		    unix_recvq_full_lockless(other) &&
3285 		    unix_dgram_peer_wake_me(sk, other))
3286 			writable = 0;
3287 
3288 		unix_state_unlock(sk);
3289 	}
3290 
3291 	if (writable)
3292 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3293 	else
3294 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3295 
3296 	return mask;
3297 }
3298 
3299 #ifdef CONFIG_PROC_FS
3300 
3301 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3302 
3303 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3304 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3305 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
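/* The seq_file position packs (bucket, offset): the upper bits select the
 * hash bucket and the low BUCKET_SPACE bits hold the 1-based offset of the
 * socket within that bucket.
 */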
3306 
3307 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3308 {
3309 	unsigned long offset = get_offset(*pos);
3310 	unsigned long bucket = get_bucket(*pos);
3311 	unsigned long count = 0;
3312 	struct sock *sk;
3313 
3314 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3315 	     sk; sk = sk_next(sk)) {
3316 		if (++count == offset)
3317 			break;
3318 	}
3319 
3320 	return sk;
3321 }
3322 
3323 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3324 {
3325 	unsigned long bucket = get_bucket(*pos);
3326 	struct net *net = seq_file_net(seq);
3327 	struct sock *sk;
3328 
3329 	while (bucket < UNIX_HASH_SIZE) {
3330 		spin_lock(&net->unx.table.locks[bucket]);
3331 
3332 		sk = unix_from_bucket(seq, pos);
3333 		if (sk)
3334 			return sk;
3335 
3336 		spin_unlock(&net->unx.table.locks[bucket]);
3337 
3338 		*pos = set_bucket_offset(++bucket, 1);
3339 	}
3340 
3341 	return NULL;
3342 }
3343 
3344 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3345 				  loff_t *pos)
3346 {
3347 	unsigned long bucket = get_bucket(*pos);
3348 
3349 	sk = sk_next(sk);
3350 	if (sk)
3351 		return sk;
3352 
3354 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3355 
3356 	*pos = set_bucket_offset(++bucket, 1);
3357 
3358 	return unix_get_first(seq, pos);
3359 }
3360 
3361 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3362 {
3363 	if (!*pos)
3364 		return SEQ_START_TOKEN;
3365 
3366 	return unix_get_first(seq, pos);
3367 }
3368 
3369 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3370 {
3371 	++*pos;
3372 
3373 	if (v == SEQ_START_TOKEN)
3374 		return unix_get_first(seq, pos);
3375 
3376 	return unix_get_next(seq, v, pos);
3377 }
3378 
3379 static void unix_seq_stop(struct seq_file *seq, void *v)
3380 {
3381 	struct sock *sk = v;
3382 
3383 	if (sk)
3384 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3385 }
3386 
3387 static int unix_seq_show(struct seq_file *seq, void *v)
3388 {
3390 	if (v == SEQ_START_TOKEN)
3391 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3392 			 "Inode Path\n");
3393 	else {
3394 		struct sock *s = v;
3395 		struct unix_sock *u = unix_sk(s);
3396 		unix_state_lock(s);
3397 
3398 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3399 			s,
3400 			refcount_read(&s->sk_refcnt),
3401 			0,
3402 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3403 			s->sk_type,
3404 			s->sk_socket ?
3405 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3406 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3407 			sock_i_ino(s));
3408 
3409 		if (u->addr) {	// under a hash table lock here
3410 			int i, len;
3411 			seq_putc(seq, ' ');
3412 
3413 			i = 0;
3414 			len = u->addr->len -
3415 				offsetof(struct sockaddr_un, sun_path);
3416 			if (u->addr->name->sun_path[0]) {
3417 				len--;
3418 			} else {
3419 				seq_putc(seq, '@');
3420 				i++;
3421 			}
3422 			for ( ; i < len; i++)
3423 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3424 					 '@');
3425 		}
3426 		unix_state_unlock(s);
3427 		seq_putc(seq, '\n');
3428 	}
3429 
3430 	return 0;
3431 }
3432 
3433 static const struct seq_operations unix_seq_ops = {
3434 	.start  = unix_seq_start,
3435 	.next   = unix_seq_next,
3436 	.stop   = unix_seq_stop,
3437 	.show   = unix_seq_show,
3438 };
3439 
3440 #ifdef CONFIG_BPF_SYSCALL
3441 struct bpf_unix_iter_state {
3442 	struct seq_net_private p;
3443 	unsigned int cur_sk;
3444 	unsigned int end_sk;
3445 	unsigned int max_sk;
3446 	struct sock **batch;
3447 	bool st_bucket_done;
3448 };
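/* The bpf iterator batches all sockets of one hash bucket (taking a
 * reference on each) so the bucket spinlock can be dropped before the bpf
 * program runs; st_bucket_done marks that the whole bucket fit in the batch.
 */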
3449 
3450 struct bpf_iter__unix {
3451 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3452 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3453 	uid_t uid __aligned(8);
3454 };
3455 
3456 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3457 			      struct unix_sock *unix_sk, uid_t uid)
3458 {
3459 	struct bpf_iter__unix ctx;
3460 
3461 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3462 	ctx.meta = meta;
3463 	ctx.unix_sk = unix_sk;
3464 	ctx.uid = uid;
3465 	return bpf_iter_run_prog(prog, &ctx);
3466 }
3467 
3468 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3470 {
3471 	struct bpf_unix_iter_state *iter = seq->private;
3472 	unsigned int expected = 1;
3473 	struct sock *sk;
3474 
3475 	sock_hold(start_sk);
3476 	iter->batch[iter->end_sk++] = start_sk;
3477 
3478 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3479 		if (iter->end_sk < iter->max_sk) {
3480 			sock_hold(sk);
3481 			iter->batch[iter->end_sk++] = sk;
3482 		}
3483 
3484 		expected++;
3485 	}
3486 
3487 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3488 
3489 	return expected;
3490 }
3491 
3492 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3493 {
3494 	while (iter->cur_sk < iter->end_sk)
3495 		sock_put(iter->batch[iter->cur_sk++]);
3496 }
3497 
3498 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3499 				       unsigned int new_batch_sz)
3500 {
3501 	struct sock **new_batch;
3502 
3503 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3504 			     GFP_USER | __GFP_NOWARN);
3505 	if (!new_batch)
3506 		return -ENOMEM;
3507 
3508 	bpf_iter_unix_put_batch(iter);
3509 	kvfree(iter->batch);
3510 	iter->batch = new_batch;
3511 	iter->max_sk = new_batch_sz;
3512 
3513 	return 0;
3514 }
3515 
3516 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3517 					loff_t *pos)
3518 {
3519 	struct bpf_unix_iter_state *iter = seq->private;
3520 	unsigned int expected;
3521 	bool resized = false;
3522 	struct sock *sk;
3523 
3524 	if (iter->st_bucket_done)
3525 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3526 
3527 again:
3528 	/* Get a new batch */
3529 	iter->cur_sk = 0;
3530 	iter->end_sk = 0;
3531 
3532 	sk = unix_get_first(seq, pos);
3533 	if (!sk)
3534 		return NULL; /* Done */
3535 
3536 	expected = bpf_iter_unix_hold_batch(seq, sk);
3537 
3538 	if (iter->end_sk == expected) {
3539 		iter->st_bucket_done = true;
3540 		return sk;
3541 	}
3542 
3543 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3544 		resized = true;
3545 		goto again;
3546 	}
3547 
3548 	return sk;
3549 }
3550 
3551 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3552 {
3553 	if (!*pos)
3554 		return SEQ_START_TOKEN;
3555 
3556 	/* bpf iter does not support lseek, so it always
3557 	 * continues from where it was stop()-ped.
3558 	 */
3559 	return bpf_iter_unix_batch(seq, pos);
3560 }
3561 
3562 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3563 {
3564 	struct bpf_unix_iter_state *iter = seq->private;
3565 	struct sock *sk;
3566 
3567 	/* Whenever seq_next() is called, the iter->cur_sk is
3568 	/* Whenever seq_next() is called, iter->cur_sk has already been
3569 	 * through seq_show(), so release it and advance to the next sk in
3570 	 */
3571 	if (iter->cur_sk < iter->end_sk)
3572 		sock_put(iter->batch[iter->cur_sk++]);
3573 
3574 	++*pos;
3575 
3576 	if (iter->cur_sk < iter->end_sk)
3577 		sk = iter->batch[iter->cur_sk];
3578 	else
3579 		sk = bpf_iter_unix_batch(seq, pos);
3580 
3581 	return sk;
3582 }
3583 
3584 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3585 {
3586 	struct bpf_iter_meta meta;
3587 	struct bpf_prog *prog;
3588 	struct sock *sk = v;
3589 	uid_t uid;
3590 	bool slow;
3591 	int ret;
3592 
3593 	if (v == SEQ_START_TOKEN)
3594 		return 0;
3595 
3596 	slow = lock_sock_fast(sk);
3597 
3598 	if (unlikely(sk_unhashed(sk))) {
3599 		ret = SEQ_SKIP;
3600 		goto unlock;
3601 	}
3602 
3603 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3604 	meta.seq = seq;
3605 	prog = bpf_iter_get_info(&meta, false);
3606 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3607 unlock:
3608 	unlock_sock_fast(sk, slow);
3609 	return ret;
3610 }
3611 
3612 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3613 {
3614 	struct bpf_unix_iter_state *iter = seq->private;
3615 	struct bpf_iter_meta meta;
3616 	struct bpf_prog *prog;
3617 
3618 	if (!v) {
3619 		meta.seq = seq;
3620 		prog = bpf_iter_get_info(&meta, true);
3621 		if (prog)
3622 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3623 	}
3624 
3625 	if (iter->cur_sk < iter->end_sk)
3626 		bpf_iter_unix_put_batch(iter);
3627 }
3628 
3629 static const struct seq_operations bpf_iter_unix_seq_ops = {
3630 	.start	= bpf_iter_unix_seq_start,
3631 	.next	= bpf_iter_unix_seq_next,
3632 	.stop	= bpf_iter_unix_seq_stop,
3633 	.show	= bpf_iter_unix_seq_show,
3634 };
3635 #endif
3636 #endif
3637 
3638 static const struct net_proto_family unix_family_ops = {
3639 	.family = PF_UNIX,
3640 	.create = unix_create,
3641 	.owner	= THIS_MODULE,
3642 };
3643 
3644 
3645 static int __net_init unix_net_init(struct net *net)
3646 {
3647 	int i;
3648 
3649 	net->unx.sysctl_max_dgram_qlen = 10;
3650 	if (unix_sysctl_register(net))
3651 		goto out;
3652 
3653 #ifdef CONFIG_PROC_FS
3654 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3655 			     sizeof(struct seq_net_private)))
3656 		goto err_sysctl;
3657 #endif
3658 
3659 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3660 					      sizeof(spinlock_t), GFP_KERNEL);
3661 	if (!net->unx.table.locks)
3662 		goto err_proc;
3663 
3664 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3665 						sizeof(struct hlist_head),
3666 						GFP_KERNEL);
3667 	if (!net->unx.table.buckets)
3668 		goto free_locks;
3669 
3670 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3671 		spin_lock_init(&net->unx.table.locks[i]);
3672 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3673 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3674 	}
3675 
3676 	return 0;
3677 
3678 free_locks:
3679 	kvfree(net->unx.table.locks);
3680 err_proc:
3681 #ifdef CONFIG_PROC_FS
3682 	remove_proc_entry("unix", net->proc_net);
3683 err_sysctl:
3684 #endif
3685 	unix_sysctl_unregister(net);
3686 out:
3687 	return -ENOMEM;
3688 }
3689 
3690 static void __net_exit unix_net_exit(struct net *net)
3691 {
3692 	kvfree(net->unx.table.buckets);
3693 	kvfree(net->unx.table.locks);
3694 	unix_sysctl_unregister(net);
3695 	remove_proc_entry("unix", net->proc_net);
3696 }
3697 
3698 static struct pernet_operations unix_net_ops = {
3699 	.init = unix_net_init,
3700 	.exit = unix_net_exit,
3701 };
3702 
3703 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3704 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3705 		     struct unix_sock *unix_sk, uid_t uid)
3706 
3707 #define INIT_BATCH_SZ 16
3708 
3709 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3710 {
3711 	struct bpf_unix_iter_state *iter = priv_data;
3712 	int err;
3713 
3714 	err = bpf_iter_init_seq_net(priv_data, aux);
3715 	if (err)
3716 		return err;
3717 
3718 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3719 	if (err) {
3720 		bpf_iter_fini_seq_net(priv_data);
3721 		return err;
3722 	}
3723 
3724 	return 0;
3725 }
3726 
3727 static void bpf_iter_fini_unix(void *priv_data)
3728 {
3729 	struct bpf_unix_iter_state *iter = priv_data;
3730 
3731 	bpf_iter_fini_seq_net(priv_data);
3732 	kvfree(iter->batch);
3733 }
3734 
3735 static const struct bpf_iter_seq_info unix_seq_info = {
3736 	.seq_ops		= &bpf_iter_unix_seq_ops,
3737 	.init_seq_private	= bpf_iter_init_unix,
3738 	.fini_seq_private	= bpf_iter_fini_unix,
3739 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3740 };
3741 
3742 static const struct bpf_func_proto *
3743 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3744 			     const struct bpf_prog *prog)
3745 {
3746 	switch (func_id) {
3747 	case BPF_FUNC_setsockopt:
3748 		return &bpf_sk_setsockopt_proto;
3749 	case BPF_FUNC_getsockopt:
3750 		return &bpf_sk_getsockopt_proto;
3751 	default:
3752 		return NULL;
3753 	}
3754 }
3755 
3756 static struct bpf_iter_reg unix_reg_info = {
3757 	.target			= "unix",
3758 	.ctx_arg_info_size	= 1,
3759 	.ctx_arg_info		= {
3760 		{ offsetof(struct bpf_iter__unix, unix_sk),
3761 		  PTR_TO_BTF_ID_OR_NULL },
3762 	},
3763 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3764 	.seq_info		= &unix_seq_info,
3765 };
3766 
3767 static void __init bpf_iter_register(void)
3768 {
3769 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3770 	if (bpf_iter_reg_target(&unix_reg_info))
3771 		pr_warn("Warning: could not register bpf iterator unix\n");
3772 }
3773 #endif
3774 
3775 static int __init af_unix_init(void)
3776 {
3777 	int i, rc = -1;
3778 
3779 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3780 
3781 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3782 		spin_lock_init(&bsd_socket_locks[i]);
3783 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3784 	}
3785 
3786 	rc = proto_register(&unix_dgram_proto, 1);
3787 	if (rc != 0) {
3788 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3789 		goto out;
3790 	}
3791 
3792 	rc = proto_register(&unix_stream_proto, 1);
3793 	if (rc != 0) {
3794 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3795 		proto_unregister(&unix_dgram_proto);
3796 		goto out;
3797 	}
3798 
3799 	sock_register(&unix_family_ops);
3800 	register_pernet_subsys(&unix_net_ops);
3801 	unix_bpf_build_proto();
3802 
3803 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3804 	bpf_iter_register();
3805 #endif
3806 
3807 out:
3808 	return rc;
3809 }
3810 
3811 /* Later than subsys_initcall() because we depend on stuff initialised there */
3812 fs_initcall(af_unix_init);
3813