xref: /linux/net/unix/af_unix.c (revision 1e15510b71c99c6e49134d756df91069f7d18141)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of hashed socks (for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
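/* Example (userspace, not part of this file): a minimal sketch of binding
 * an abstract address as described above.  The name "example" is purely
 * illustrative; note that the passed length covers only the leading zero
 * byte plus the name bytes, with no trailing terminator.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int bind_abstract(int fd)
 *	{
 *		struct sockaddr_un addr;
 *
 *		memset(&addr, 0, sizeof(addr));
 *		addr.sun_family = AF_UNIX;
 *		addr.sun_path[0] = '\0';	// abstract namespace marker
 *		memcpy(addr.sun_path + 1, "example", 7);
 *
 *		return bind(fd, (struct sockaddr *)&addr,
 *			    offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 *	}
 */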
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 #ifdef CONFIG_PROVE_LOCKING
130 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
131 
132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133 				  const struct lockdep_map *b)
134 {
135 	return cmp_ptr(a, b);
136 }
137 
138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139 				  const struct lockdep_map *_b)
140 {
141 	const struct unix_sock *a, *b;
142 
143 	a = container_of(_a, struct unix_sock, lock.dep_map);
144 	b = container_of(_b, struct unix_sock, lock.dep_map);
145 
146 	if (a->sk.sk_state == TCP_LISTEN) {
147 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
148 		 *
149 		 *   1. a is TCP_LISTEN.
150 		 *   2. b is not a.
151 		 *   3. concurrent connect(b -> a) must fail.
152 		 *
153 		 * Except for 2. & 3., the b's state can be any possible
154 		 * value due to concurrent connect() or listen().
155 		 *
156 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
157 		 * be expressed as lock_cmp_fn.
158 		 */
159 		switch (b->sk.sk_state) {
160 		case TCP_CLOSE:
161 		case TCP_ESTABLISHED:
162 		case TCP_LISTEN:
163 			return -1;
164 		default:
165 			/* Invalid case. */
166 			return 0;
167 		}
168 	}
169 
170 	/* Should never happen.  Just to be symmetric. */
171 	if (b->sk.sk_state == TCP_LISTEN) {
172 		switch (a->sk.sk_state) {
173 		case TCP_CLOSE:
174 		case TCP_ESTABLISHED:
175 			return 1;
176 		default:
177 			return 0;
178 		}
179 	}
180 
181 	/* unix_state_double_lock(): ascending address order. */
182 	return cmp_ptr(a, b);
183 }
184 
185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186 				  const struct lockdep_map *_b)
187 {
188 	const struct sock *a, *b;
189 
190 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192 
193 	/* unix_collect_skb(): listener -> embryo order. */
194 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195 		return -1;
196 
197 	/* Should never happen.  Just to be symmetric. */
198 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199 		return 1;
200 
201 	return 0;
202 }
203 #endif
204 
205 static unsigned int unix_unbound_hash(struct sock *sk)
206 {
207 	unsigned long hash = (unsigned long)sk;
208 
209 	hash ^= hash >> 16;
210 	hash ^= hash >> 8;
211 	hash ^= sk->sk_type;
212 
213 	return hash & UNIX_HASH_MOD;
214 }
215 
216 static unsigned int unix_bsd_hash(struct inode *i)
217 {
218 	return i->i_ino & UNIX_HASH_MOD;
219 }
220 
221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222 				       int addr_len, int type)
223 {
224 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
225 	unsigned int hash;
226 
227 	hash = (__force unsigned int)csum_fold(csum);
228 	hash ^= hash >> 8;
229 	hash ^= type;
230 
231 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232 }
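/* Hash space layout implied by the three helpers above: unbound and
 * pathname (BSD) sockets land in buckets [0, UNIX_HASH_MOD] of the
 * per-netns table (pathname sockets hashed by inode number), while
 * abstract sockets land in [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1],
 * so the pathname and abstract name spaces never share a bucket.
 * Pathname sockets are additionally linked into the global
 * bsd_socket_buckets[] above so that unix_find_socket_byinode() can
 * look them up by inode alone.
 */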
233 
234 static void unix_table_double_lock(struct net *net,
235 				   unsigned int hash1, unsigned int hash2)
236 {
237 	if (hash1 == hash2) {
238 		spin_lock(&net->unx.table.locks[hash1]);
239 		return;
240 	}
241 
242 	if (hash1 > hash2)
243 		swap(hash1, hash2);
244 
245 	spin_lock(&net->unx.table.locks[hash1]);
246 	spin_lock(&net->unx.table.locks[hash2]);
247 }
248 
249 static void unix_table_double_unlock(struct net *net,
250 				     unsigned int hash1, unsigned int hash2)
251 {
252 	if (hash1 == hash2) {
253 		spin_unlock(&net->unx.table.locks[hash1]);
254 		return;
255 	}
256 
257 	spin_unlock(&net->unx.table.locks[hash1]);
258 	spin_unlock(&net->unx.table.locks[hash2]);
259 }
260 
261 #ifdef CONFIG_SECURITY_NETWORK
262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263 {
264 	UNIXCB(skb).secid = scm->secid;
265 }
266 
267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268 {
269 	scm->secid = UNIXCB(skb).secid;
270 }
271 
272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273 {
274 	return (scm->secid == UNIXCB(skb).secid);
275 }
276 #else
277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278 { }
279 
280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281 { }
282 
283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284 {
285 	return true;
286 }
287 #endif /* CONFIG_SECURITY_NETWORK */
288 
289 static inline int unix_may_send(struct sock *sk, struct sock *osk)
290 {
291 	return !unix_peer(osk) || unix_peer(osk) == sk;
292 }
293 
294 static inline int unix_recvq_full_lockless(const struct sock *sk)
295 {
296 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
297 }
298 
299 struct sock *unix_peer_get(struct sock *s)
300 {
301 	struct sock *peer;
302 
303 	unix_state_lock(s);
304 	peer = unix_peer(s);
305 	if (peer)
306 		sock_hold(peer);
307 	unix_state_unlock(s);
308 	return peer;
309 }
310 EXPORT_SYMBOL_GPL(unix_peer_get);
311 
312 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
313 					     int addr_len)
314 {
315 	struct unix_address *addr;
316 
317 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
318 	if (!addr)
319 		return NULL;
320 
321 	refcount_set(&addr->refcnt, 1);
322 	addr->len = addr_len;
323 	memcpy(addr->name, sunaddr, addr_len);
324 
325 	return addr;
326 }
327 
328 static inline void unix_release_addr(struct unix_address *addr)
329 {
330 	if (refcount_dec_and_test(&addr->refcnt))
331 		kfree(addr);
332 }
333 
334 /*
335  *	Check unix socket name:
336  *		- should not be zero length.
337  *		- if it does not start with a zero byte, it should be NUL terminated (FS object)
338  *		- if it starts with a zero byte, it is an abstract name.
339  */
340 
341 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
342 {
343 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
344 	    addr_len > sizeof(*sunaddr))
345 		return -EINVAL;
346 
347 	if (sunaddr->sun_family != AF_UNIX)
348 		return -EINVAL;
349 
350 	return 0;
351 }
352 
353 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
354 {
355 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
356 	short offset = offsetof(struct sockaddr_storage, __data);
357 
358 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
359 
360 	/* This may look like an off by one error but it is a bit more
361 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
362 	 * sun_path[108] doesn't as such exist.  However in kernel space
363 	 * we are guaranteed that it is a valid memory location in our
364 	 * kernel address buffer because syscall functions always pass
365 	 * a pointer of struct sockaddr_storage which has a bigger buffer
366 	 * than 108.  Also, we must terminate sun_path for strlen() in
367 	 * getname_kernel().
368 	 */
369 	addr->__data[addr_len - offset] = 0;
370 
371 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
372 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
373 	 * know the actual buffer.
374 	 */
375 	return strlen(addr->__data) + offset + 1;
376 }
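/* Worked example of the calculation above (values illustrative): for a
 * caller that passed a full, unterminated 108-byte path, addr_len is
 * offsetof(struct sockaddr_un, sun_path) + 108, so the store above puts
 * the terminating NUL at __data[108] -- one byte past sun_path but still
 * inside the caller-supplied struct sockaddr_storage -- and the function
 * returns offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1.
 */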
377 
378 static void __unix_remove_socket(struct sock *sk)
379 {
380 	sk_del_node_init(sk);
381 }
382 
383 static void __unix_insert_socket(struct net *net, struct sock *sk)
384 {
385 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
386 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
387 }
388 
389 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
390 				 struct unix_address *addr, unsigned int hash)
391 {
392 	__unix_remove_socket(sk);
393 	smp_store_release(&unix_sk(sk)->addr, addr);
394 
395 	sk->sk_hash = hash;
396 	__unix_insert_socket(net, sk);
397 }
398 
399 static void unix_remove_socket(struct net *net, struct sock *sk)
400 {
401 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
402 	__unix_remove_socket(sk);
403 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
404 }
405 
406 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
407 {
408 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
409 	__unix_insert_socket(net, sk);
410 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
411 }
412 
413 static void unix_insert_bsd_socket(struct sock *sk)
414 {
415 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
416 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
417 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
418 }
419 
420 static void unix_remove_bsd_socket(struct sock *sk)
421 {
422 	if (!hlist_unhashed(&sk->sk_bind_node)) {
423 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
424 		__sk_del_bind_node(sk);
425 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
426 
427 		sk_node_init(&sk->sk_bind_node);
428 	}
429 }
430 
431 static struct sock *__unix_find_socket_byname(struct net *net,
432 					      struct sockaddr_un *sunname,
433 					      int len, unsigned int hash)
434 {
435 	struct sock *s;
436 
437 	sk_for_each(s, &net->unx.table.buckets[hash]) {
438 		struct unix_sock *u = unix_sk(s);
439 
440 		if (u->addr->len == len &&
441 		    !memcmp(u->addr->name, sunname, len))
442 			return s;
443 	}
444 	return NULL;
445 }
446 
447 static inline struct sock *unix_find_socket_byname(struct net *net,
448 						   struct sockaddr_un *sunname,
449 						   int len, unsigned int hash)
450 {
451 	struct sock *s;
452 
453 	spin_lock(&net->unx.table.locks[hash]);
454 	s = __unix_find_socket_byname(net, sunname, len, hash);
455 	if (s)
456 		sock_hold(s);
457 	spin_unlock(&net->unx.table.locks[hash]);
458 	return s;
459 }
460 
461 static struct sock *unix_find_socket_byinode(struct inode *i)
462 {
463 	unsigned int hash = unix_bsd_hash(i);
464 	struct sock *s;
465 
466 	spin_lock(&bsd_socket_locks[hash]);
467 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
468 		struct dentry *dentry = unix_sk(s)->path.dentry;
469 
470 		if (dentry && d_backing_inode(dentry) == i) {
471 			sock_hold(s);
472 			spin_unlock(&bsd_socket_locks[hash]);
473 			return s;
474 		}
475 	}
476 	spin_unlock(&bsd_socket_locks[hash]);
477 	return NULL;
478 }
479 
480 /* Support code for asymmetrically connected dgram sockets
481  *
482  * If a datagram socket is connected to a socket not itself connected
483  * to the first socket (eg, /dev/log), clients may only enqueue more
484  * messages if the present receive queue of the server socket is not
485  * "too large". This means there's a second writeability condition
486  * poll and sendmsg need to test. The dgram recv code will do a wake
487  * up on the peer_wait wait queue of a socket upon reception of a
488  * datagram which needs to be propagated to sleeping would-be writers
489  * since these might not have sent anything so far. This can't be
490  * accomplished via poll_wait because the lifetime of the server
491  * socket might be less than that of its clients if these break their
492  * association with it or if the server socket is closed while clients
493  * are still connected to it and there's no way to inform "a polling
494  * implementation" that it should let go of a certain wait queue.
495  *
496  * In order to propagate a wake up, a wait_queue_entry_t of the client
497  * socket is enqueued on the peer_wait queue of the server socket
498  * whose wake function does a wake_up on the ordinary client socket
499  * wait queue. This connection is established whenever a write (or
500  * poll for write) hits the flow control condition and is broken when the
501  * association to the server socket is dissolved or after a wake up
502  * was relayed.
503  */
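/* Userspace sketch of the scenario above (illustrative only; srv/srv_len
 * and buf are assumed to be set up elsewhere, and the server is a bound
 * SOCK_DGRAM socket, /dev/log style, that never connect()s back to its
 * clients):
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *
 *	connect(fd, (struct sockaddr *)&srv, srv_len);
 *	while (send(fd, buf, sizeof(buf), MSG_DONTWAIT) > 0)
 *		;	// fill the server's receive queue
 *	// send() now fails with EAGAIN.  POLLOUT must not be reported
 *	// again until the server drains its queue; that wake-up is what
 *	// the peer_wait relaying below provides.
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	poll(&pfd, 1, -1);
 */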
504 
505 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
506 				      void *key)
507 {
508 	struct unix_sock *u;
509 	wait_queue_head_t *u_sleep;
510 
511 	u = container_of(q, struct unix_sock, peer_wake);
512 
513 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
514 			    q);
515 	u->peer_wake.private = NULL;
516 
517 	/* relaying can only happen while the wq still exists */
518 	u_sleep = sk_sleep(&u->sk);
519 	if (u_sleep)
520 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
521 
522 	return 0;
523 }
524 
525 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
526 {
527 	struct unix_sock *u, *u_other;
528 	int rc;
529 
530 	u = unix_sk(sk);
531 	u_other = unix_sk(other);
532 	rc = 0;
533 	spin_lock(&u_other->peer_wait.lock);
534 
535 	if (!u->peer_wake.private) {
536 		u->peer_wake.private = other;
537 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
538 
539 		rc = 1;
540 	}
541 
542 	spin_unlock(&u_other->peer_wait.lock);
543 	return rc;
544 }
545 
546 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
547 					    struct sock *other)
548 {
549 	struct unix_sock *u, *u_other;
550 
551 	u = unix_sk(sk);
552 	u_other = unix_sk(other);
553 	spin_lock(&u_other->peer_wait.lock);
554 
555 	if (u->peer_wake.private == other) {
556 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
557 		u->peer_wake.private = NULL;
558 	}
559 
560 	spin_unlock(&u_other->peer_wait.lock);
561 }
562 
563 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
564 						   struct sock *other)
565 {
566 	unix_dgram_peer_wake_disconnect(sk, other);
567 	wake_up_interruptible_poll(sk_sleep(sk),
568 				   EPOLLOUT |
569 				   EPOLLWRNORM |
570 				   EPOLLWRBAND);
571 }
572 
573 /* preconditions:
574  *	- unix_peer(sk) == other
575  *	- association is stable
576  */
577 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
578 {
579 	int connected;
580 
581 	connected = unix_dgram_peer_wake_connect(sk, other);
582 
583 	/* If other is SOCK_DEAD, we want to make sure we signal
584 	 * POLLOUT, such that a subsequent write() can get a
585 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
586 	 * to other and it's full, we will hang waiting for POLLOUT.
587 	 */
588 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
589 		return 1;
590 
591 	if (connected)
592 		unix_dgram_peer_wake_disconnect(sk, other);
593 
594 	return 0;
595 }
596 
597 static int unix_writable(const struct sock *sk, unsigned char state)
598 {
599 	return state != TCP_LISTEN &&
600 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
601 }
602 
603 static void unix_write_space(struct sock *sk)
604 {
605 	struct socket_wq *wq;
606 
607 	rcu_read_lock();
608 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
609 		wq = rcu_dereference(sk->sk_wq);
610 		if (skwq_has_sleeper(wq))
611 			wake_up_interruptible_sync_poll(&wq->wait,
612 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
613 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
614 	}
615 	rcu_read_unlock();
616 }
617 
618 /* When a dgram socket disconnects (or changes its peer), we clear its receive
619  * queue of packets that arrived from the previous peer. First, this allows
620  * flow control based only on wmem_alloc; second, an sk connected to a peer
621  * may receive messages only from that peer. */
622 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
623 {
624 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
625 		skb_queue_purge_reason(&sk->sk_receive_queue,
626 				       SKB_DROP_REASON_UNIX_DISCONNECT);
627 
628 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
629 
630 		/* If one link of bidirectional dgram pipe is disconnected,
631 		 * we signal error. Messages are lost. Do not make this,
632 		 * when peer was not connected to us.
633 		 */
634 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
635 			WRITE_ONCE(other->sk_err, ECONNRESET);
636 			sk_error_report(other);
637 		}
638 	}
639 }
640 
641 static void unix_sock_destructor(struct sock *sk)
642 {
643 	struct unix_sock *u = unix_sk(sk);
644 
645 	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
646 
647 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
648 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
649 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
650 	if (!sock_flag(sk, SOCK_DEAD)) {
651 		pr_info("Attempt to release alive unix socket: %p\n", sk);
652 		return;
653 	}
654 
655 	if (u->addr)
656 		unix_release_addr(u->addr);
657 
658 	atomic_long_dec(&unix_nr_socks);
659 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
660 #ifdef UNIX_REFCNT_DEBUG
661 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
662 		atomic_long_read(&unix_nr_socks));
663 #endif
664 }
665 
666 static void unix_release_sock(struct sock *sk, int embrion)
667 {
668 	struct unix_sock *u = unix_sk(sk);
669 	struct sock *skpair;
670 	struct sk_buff *skb;
671 	struct path path;
672 	int state;
673 
674 	unix_remove_socket(sock_net(sk), sk);
675 	unix_remove_bsd_socket(sk);
676 
677 	/* Clear state */
678 	unix_state_lock(sk);
679 	sock_orphan(sk);
680 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
681 	path	     = u->path;
682 	u->path.dentry = NULL;
683 	u->path.mnt = NULL;
684 	state = sk->sk_state;
685 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
686 
687 	skpair = unix_peer(sk);
688 	unix_peer(sk) = NULL;
689 
690 	unix_state_unlock(sk);
691 
692 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
693 	u->oob_skb = NULL;
694 #endif
695 
696 	wake_up_interruptible_all(&u->peer_wait);
697 
698 	if (skpair != NULL) {
699 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
700 			unix_state_lock(skpair);
701 			/* No more writes */
702 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
703 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
704 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
705 			unix_state_unlock(skpair);
706 			skpair->sk_state_change(skpair);
707 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
708 		}
709 
710 		unix_dgram_peer_wake_disconnect(sk, skpair);
711 		sock_put(skpair); /* It may now die */
712 	}
713 
714 	/* Try to flush out this socket. Throw out buffers at least */
715 
716 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
717 		if (state == TCP_LISTEN)
718 			unix_release_sock(skb->sk, 1);
719 
720 		/* passed fds are erased in the kfree_skb hook */
721 		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
722 	}
723 
724 	if (path.dentry)
725 		path_put(&path);
726 
727 	sock_put(sk);
728 
729 	/* ---- Socket is dead now and most probably destroyed ---- */
730 
731 	/*
732 	 * Fixme: BSD difference: In BSD all sockets connected to us get
733 	 *	  ECONNRESET and we die on the spot. In Linux we behave
734 	 *	  like files and pipes do and wait for the last
735 	 *	  dereference.
736 	 *
737 	 * Can't we simply set sock->err?
738 	 *
739 	 *	  What the above comment does talk about? --ANK(980817)
740 	 */
741 
742 	if (READ_ONCE(unix_tot_inflight))
743 		unix_gc();		/* Garbage collect fds */
744 }
745 
746 static void init_peercred(struct sock *sk)
747 {
748 	sk->sk_peer_pid = get_pid(task_tgid(current));
749 	sk->sk_peer_cred = get_current_cred();
750 }
751 
752 static void update_peercred(struct sock *sk)
753 {
754 	const struct cred *old_cred;
755 	struct pid *old_pid;
756 
757 	spin_lock(&sk->sk_peer_lock);
758 	old_pid = sk->sk_peer_pid;
759 	old_cred = sk->sk_peer_cred;
760 	init_peercred(sk);
761 	spin_unlock(&sk->sk_peer_lock);
762 
763 	put_pid(old_pid);
764 	put_cred(old_cred);
765 }
766 
767 static void copy_peercred(struct sock *sk, struct sock *peersk)
768 {
769 	lockdep_assert_held(&unix_sk(peersk)->lock);
770 
771 	spin_lock(&sk->sk_peer_lock);
772 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
773 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
774 	spin_unlock(&sk->sk_peer_lock);
775 }
776 
777 static int unix_listen(struct socket *sock, int backlog)
778 {
779 	int err;
780 	struct sock *sk = sock->sk;
781 	struct unix_sock *u = unix_sk(sk);
782 
783 	err = -EOPNOTSUPP;
784 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
785 		goto out;	/* Only stream/seqpacket sockets accept */
786 	err = -EINVAL;
787 	if (!READ_ONCE(u->addr))
788 		goto out;	/* No listens on an unbound socket */
789 	unix_state_lock(sk);
790 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
791 		goto out_unlock;
792 	if (backlog > sk->sk_max_ack_backlog)
793 		wake_up_interruptible_all(&u->peer_wait);
794 	sk->sk_max_ack_backlog	= backlog;
795 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
796 
797 	/* set credentials so connect can copy them */
798 	update_peercred(sk);
799 	err = 0;
800 
801 out_unlock:
802 	unix_state_unlock(sk);
803 out:
804 	return err;
805 }
806 
807 static int unix_release(struct socket *);
808 static int unix_bind(struct socket *, struct sockaddr *, int);
809 static int unix_stream_connect(struct socket *, struct sockaddr *,
810 			       int addr_len, int flags);
811 static int unix_socketpair(struct socket *, struct socket *);
812 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
813 static int unix_getname(struct socket *, struct sockaddr *, int);
814 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
815 static __poll_t unix_dgram_poll(struct file *, struct socket *,
816 				    poll_table *);
817 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
818 #ifdef CONFIG_COMPAT
819 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
820 #endif
821 static int unix_shutdown(struct socket *, int);
822 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
823 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
824 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
825 				       struct pipe_inode_info *, size_t size,
826 				       unsigned int flags);
827 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
828 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
829 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
830 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
831 static int unix_dgram_connect(struct socket *, struct sockaddr *,
832 			      int, int);
833 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
834 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
835 				  int);
836 
837 #ifdef CONFIG_PROC_FS
838 static int unix_count_nr_fds(struct sock *sk)
839 {
840 	struct sk_buff *skb;
841 	struct unix_sock *u;
842 	int nr_fds = 0;
843 
844 	spin_lock(&sk->sk_receive_queue.lock);
845 	skb = skb_peek(&sk->sk_receive_queue);
846 	while (skb) {
847 		u = unix_sk(skb->sk);
848 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
849 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
850 	}
851 	spin_unlock(&sk->sk_receive_queue.lock);
852 
853 	return nr_fds;
854 }
855 
856 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
857 {
858 	struct sock *sk = sock->sk;
859 	unsigned char s_state;
860 	struct unix_sock *u;
861 	int nr_fds = 0;
862 
863 	if (sk) {
864 		s_state = READ_ONCE(sk->sk_state);
865 		u = unix_sk(sk);
866 
867 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
868 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
869 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
870 		 */
871 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
872 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
873 		else if (s_state == TCP_LISTEN)
874 			nr_fds = unix_count_nr_fds(sk);
875 
876 		seq_printf(m, "scm_fds: %u\n", nr_fds);
877 	}
878 }
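/* The counters above surface in procfs: reading /proc/<pid>/fdinfo/<fd>
 * for an AF_UNIX socket yields, among the generic fields, a line such as
 * (value illustrative):
 *
 *	scm_fds: 2
 *
 * i.e. the number of file descriptors sitting in not-yet-received
 * SCM_RIGHTS messages on this socket's (or, for a listener, its embryos')
 * receive queue.
 */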
879 #else
880 #define unix_show_fdinfo NULL
881 #endif
882 
883 static const struct proto_ops unix_stream_ops = {
884 	.family =	PF_UNIX,
885 	.owner =	THIS_MODULE,
886 	.release =	unix_release,
887 	.bind =		unix_bind,
888 	.connect =	unix_stream_connect,
889 	.socketpair =	unix_socketpair,
890 	.accept =	unix_accept,
891 	.getname =	unix_getname,
892 	.poll =		unix_poll,
893 	.ioctl =	unix_ioctl,
894 #ifdef CONFIG_COMPAT
895 	.compat_ioctl =	unix_compat_ioctl,
896 #endif
897 	.listen =	unix_listen,
898 	.shutdown =	unix_shutdown,
899 	.sendmsg =	unix_stream_sendmsg,
900 	.recvmsg =	unix_stream_recvmsg,
901 	.read_skb =	unix_stream_read_skb,
902 	.mmap =		sock_no_mmap,
903 	.splice_read =	unix_stream_splice_read,
904 	.set_peek_off =	sk_set_peek_off,
905 	.show_fdinfo =	unix_show_fdinfo,
906 };
907 
908 static const struct proto_ops unix_dgram_ops = {
909 	.family =	PF_UNIX,
910 	.owner =	THIS_MODULE,
911 	.release =	unix_release,
912 	.bind =		unix_bind,
913 	.connect =	unix_dgram_connect,
914 	.socketpair =	unix_socketpair,
915 	.accept =	sock_no_accept,
916 	.getname =	unix_getname,
917 	.poll =		unix_dgram_poll,
918 	.ioctl =	unix_ioctl,
919 #ifdef CONFIG_COMPAT
920 	.compat_ioctl =	unix_compat_ioctl,
921 #endif
922 	.listen =	sock_no_listen,
923 	.shutdown =	unix_shutdown,
924 	.sendmsg =	unix_dgram_sendmsg,
925 	.read_skb =	unix_read_skb,
926 	.recvmsg =	unix_dgram_recvmsg,
927 	.mmap =		sock_no_mmap,
928 	.set_peek_off =	sk_set_peek_off,
929 	.show_fdinfo =	unix_show_fdinfo,
930 };
931 
932 static const struct proto_ops unix_seqpacket_ops = {
933 	.family =	PF_UNIX,
934 	.owner =	THIS_MODULE,
935 	.release =	unix_release,
936 	.bind =		unix_bind,
937 	.connect =	unix_stream_connect,
938 	.socketpair =	unix_socketpair,
939 	.accept =	unix_accept,
940 	.getname =	unix_getname,
941 	.poll =		unix_dgram_poll,
942 	.ioctl =	unix_ioctl,
943 #ifdef CONFIG_COMPAT
944 	.compat_ioctl =	unix_compat_ioctl,
945 #endif
946 	.listen =	unix_listen,
947 	.shutdown =	unix_shutdown,
948 	.sendmsg =	unix_seqpacket_sendmsg,
949 	.recvmsg =	unix_seqpacket_recvmsg,
950 	.mmap =		sock_no_mmap,
951 	.set_peek_off =	sk_set_peek_off,
952 	.show_fdinfo =	unix_show_fdinfo,
953 };
954 
955 static void unix_close(struct sock *sk, long timeout)
956 {
957 	/* Nothing to do here, unix socket does not need a ->close().
958 	 * This is merely for sockmap.
959 	 */
960 }
961 
962 static void unix_unhash(struct sock *sk)
963 {
964 	/* Nothing to do here, unix socket does not need a ->unhash().
965 	 * This is merely for sockmap.
966 	 */
967 }
968 
969 static bool unix_bpf_bypass_getsockopt(int level, int optname)
970 {
971 	if (level == SOL_SOCKET) {
972 		switch (optname) {
973 		case SO_PEERPIDFD:
974 			return true;
975 		default:
976 			return false;
977 		}
978 	}
979 
980 	return false;
981 }
982 
983 struct proto unix_dgram_proto = {
984 	.name			= "UNIX",
985 	.owner			= THIS_MODULE,
986 	.obj_size		= sizeof(struct unix_sock),
987 	.close			= unix_close,
988 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
989 #ifdef CONFIG_BPF_SYSCALL
990 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
991 #endif
992 };
993 
994 struct proto unix_stream_proto = {
995 	.name			= "UNIX-STREAM",
996 	.owner			= THIS_MODULE,
997 	.obj_size		= sizeof(struct unix_sock),
998 	.close			= unix_close,
999 	.unhash			= unix_unhash,
1000 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1001 #ifdef CONFIG_BPF_SYSCALL
1002 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1003 #endif
1004 };
1005 
1006 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1007 {
1008 	struct unix_sock *u;
1009 	struct sock *sk;
1010 	int err;
1011 
1012 	atomic_long_inc(&unix_nr_socks);
1013 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1014 		err = -ENFILE;
1015 		goto err;
1016 	}
1017 
1018 	if (type == SOCK_STREAM)
1019 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1020 	else /*dgram and  seqpacket */
1021 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1022 
1023 	if (!sk) {
1024 		err = -ENOMEM;
1025 		goto err;
1026 	}
1027 
1028 	sock_init_data(sock, sk);
1029 
1030 	sk->sk_hash		= unix_unbound_hash(sk);
1031 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1032 	sk->sk_write_space	= unix_write_space;
1033 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1034 	sk->sk_destruct		= unix_sock_destructor;
1035 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1036 
1037 	u = unix_sk(sk);
1038 	u->listener = NULL;
1039 	u->vertex = NULL;
1040 	u->path.dentry = NULL;
1041 	u->path.mnt = NULL;
1042 	spin_lock_init(&u->lock);
1043 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1044 	mutex_init(&u->iolock); /* single task reading lock */
1045 	mutex_init(&u->bindlock); /* single task binding lock */
1046 	init_waitqueue_head(&u->peer_wait);
1047 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1048 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1049 	unix_insert_unbound_socket(net, sk);
1050 
1051 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1052 
1053 	return sk;
1054 
1055 err:
1056 	atomic_long_dec(&unix_nr_socks);
1057 	return ERR_PTR(err);
1058 }
1059 
1060 static int unix_create(struct net *net, struct socket *sock, int protocol,
1061 		       int kern)
1062 {
1063 	struct sock *sk;
1064 
1065 	if (protocol && protocol != PF_UNIX)
1066 		return -EPROTONOSUPPORT;
1067 
1068 	sock->state = SS_UNCONNECTED;
1069 
1070 	switch (sock->type) {
1071 	case SOCK_STREAM:
1072 		sock->ops = &unix_stream_ops;
1073 		break;
1074 		/*
1075 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1076 		 *	nothing uses it.
1077 		 */
1078 	case SOCK_RAW:
1079 		sock->type = SOCK_DGRAM;
1080 		fallthrough;
1081 	case SOCK_DGRAM:
1082 		sock->ops = &unix_dgram_ops;
1083 		break;
1084 	case SOCK_SEQPACKET:
1085 		sock->ops = &unix_seqpacket_ops;
1086 		break;
1087 	default:
1088 		return -ESOCKTNOSUPPORT;
1089 	}
1090 
1091 	sk = unix_create1(net, sock, kern, sock->type);
1092 	if (IS_ERR(sk))
1093 		return PTR_ERR(sk);
1094 
1095 	return 0;
1096 }
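/* Illustrative consequence of the SOCK_RAW fallthrough above (userspace,
 * not part of this file): asking for a "raw" unix socket silently yields
 * a datagram socket.
 *
 *	int fd = socket(AF_UNIX, SOCK_RAW, 0);		// succeeds
 *	int type;
 *	socklen_t len = sizeof(type);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_TYPE, &type, &len);
 *	// type == SOCK_DGRAM
 */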
1097 
1098 static int unix_release(struct socket *sock)
1099 {
1100 	struct sock *sk = sock->sk;
1101 
1102 	if (!sk)
1103 		return 0;
1104 
1105 	sk->sk_prot->close(sk, 0);
1106 	unix_release_sock(sk, 0);
1107 	sock->sk = NULL;
1108 
1109 	return 0;
1110 }
1111 
1112 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1113 				  int type)
1114 {
1115 	struct inode *inode;
1116 	struct path path;
1117 	struct sock *sk;
1118 	int err;
1119 
1120 	unix_mkname_bsd(sunaddr, addr_len);
1121 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1122 	if (err)
1123 		goto fail;
1124 
1125 	err = path_permission(&path, MAY_WRITE);
1126 	if (err)
1127 		goto path_put;
1128 
1129 	err = -ECONNREFUSED;
1130 	inode = d_backing_inode(path.dentry);
1131 	if (!S_ISSOCK(inode->i_mode))
1132 		goto path_put;
1133 
1134 	sk = unix_find_socket_byinode(inode);
1135 	if (!sk)
1136 		goto path_put;
1137 
1138 	err = -EPROTOTYPE;
1139 	if (sk->sk_type == type)
1140 		touch_atime(&path);
1141 	else
1142 		goto sock_put;
1143 
1144 	path_put(&path);
1145 
1146 	return sk;
1147 
1148 sock_put:
1149 	sock_put(sk);
1150 path_put:
1151 	path_put(&path);
1152 fail:
1153 	return ERR_PTR(err);
1154 }
1155 
1156 static struct sock *unix_find_abstract(struct net *net,
1157 				       struct sockaddr_un *sunaddr,
1158 				       int addr_len, int type)
1159 {
1160 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1161 	struct dentry *dentry;
1162 	struct sock *sk;
1163 
1164 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1165 	if (!sk)
1166 		return ERR_PTR(-ECONNREFUSED);
1167 
1168 	dentry = unix_sk(sk)->path.dentry;
1169 	if (dentry)
1170 		touch_atime(&unix_sk(sk)->path);
1171 
1172 	return sk;
1173 }
1174 
1175 static struct sock *unix_find_other(struct net *net,
1176 				    struct sockaddr_un *sunaddr,
1177 				    int addr_len, int type)
1178 {
1179 	struct sock *sk;
1180 
1181 	if (sunaddr->sun_path[0])
1182 		sk = unix_find_bsd(sunaddr, addr_len, type);
1183 	else
1184 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1185 
1186 	return sk;
1187 }
1188 
1189 static int unix_autobind(struct sock *sk)
1190 {
1191 	struct unix_sock *u = unix_sk(sk);
1192 	unsigned int new_hash, old_hash;
1193 	struct net *net = sock_net(sk);
1194 	struct unix_address *addr;
1195 	u32 lastnum, ordernum;
1196 	int err;
1197 
1198 	err = mutex_lock_interruptible(&u->bindlock);
1199 	if (err)
1200 		return err;
1201 
1202 	if (u->addr)
1203 		goto out;
1204 
1205 	err = -ENOMEM;
1206 	addr = kzalloc(sizeof(*addr) +
1207 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1208 	if (!addr)
1209 		goto out;
1210 
1211 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1212 	addr->name->sun_family = AF_UNIX;
1213 	refcount_set(&addr->refcnt, 1);
1214 
1215 	old_hash = sk->sk_hash;
1216 	ordernum = get_random_u32();
1217 	lastnum = ordernum & 0xFFFFF;
1218 retry:
1219 	ordernum = (ordernum + 1) & 0xFFFFF;
1220 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1221 
1222 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1223 	unix_table_double_lock(net, old_hash, new_hash);
1224 
1225 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1226 		unix_table_double_unlock(net, old_hash, new_hash);
1227 
1228 		/* __unix_find_socket_byname() may take a long time if many names
1229 		 * are already in use.
1230 		 */
1231 		cond_resched();
1232 
1233 		if (ordernum == lastnum) {
1234 			/* Give up if all names seem to be in use. */
1235 			err = -ENOSPC;
1236 			unix_release_addr(addr);
1237 			goto out;
1238 		}
1239 
1240 		goto retry;
1241 	}
1242 
1243 	__unix_set_addr_hash(net, sk, addr, new_hash);
1244 	unix_table_double_unlock(net, old_hash, new_hash);
1245 	err = 0;
1246 
1247 out:	mutex_unlock(&u->bindlock);
1248 	return err;
1249 }
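/* Userspace view of autobind (illustrative): binding with only the
 * address family present, or sending from an unbound socket while
 * SO_PASSCRED/SO_PASSPIDFD is set, assigns an abstract name made of a
 * zero byte followed by five hex digits taken from the random sequence
 * above.
 *
 *	struct sockaddr_un addr = { .sun_family = AF_UNIX };
 *	socklen_t len = sizeof(addr);
 *
 *	bind(fd, (struct sockaddr *)&addr, sizeof(sa_family_t));
 *	getsockname(fd, (struct sockaddr *)&addr, &len);
 *	// len == offsetof(struct sockaddr_un, sun_path) + 6 and
 *	// addr.sun_path[0] == '\0', e.g. the name is "\0" "a1b2c"
 */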
1250 
1251 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1252 			 int addr_len)
1253 {
1254 	umode_t mode = S_IFSOCK |
1255 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1256 	struct unix_sock *u = unix_sk(sk);
1257 	unsigned int new_hash, old_hash;
1258 	struct net *net = sock_net(sk);
1259 	struct mnt_idmap *idmap;
1260 	struct unix_address *addr;
1261 	struct dentry *dentry;
1262 	struct path parent;
1263 	int err;
1264 
1265 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1266 	addr = unix_create_addr(sunaddr, addr_len);
1267 	if (!addr)
1268 		return -ENOMEM;
1269 
1270 	/*
1271 	 * Get the parent directory, calculate the hash for last
1272 	 * component.
1273 	 */
1274 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1275 	if (IS_ERR(dentry)) {
1276 		err = PTR_ERR(dentry);
1277 		goto out;
1278 	}
1279 
1280 	/*
1281 	 * All right, let's create it.
1282 	 */
1283 	idmap = mnt_idmap(parent.mnt);
1284 	err = security_path_mknod(&parent, dentry, mode, 0);
1285 	if (!err)
1286 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1287 	if (err)
1288 		goto out_path;
1289 	err = mutex_lock_interruptible(&u->bindlock);
1290 	if (err)
1291 		goto out_unlink;
1292 	if (u->addr)
1293 		goto out_unlock;
1294 
1295 	old_hash = sk->sk_hash;
1296 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1297 	unix_table_double_lock(net, old_hash, new_hash);
1298 	u->path.mnt = mntget(parent.mnt);
1299 	u->path.dentry = dget(dentry);
1300 	__unix_set_addr_hash(net, sk, addr, new_hash);
1301 	unix_table_double_unlock(net, old_hash, new_hash);
1302 	unix_insert_bsd_socket(sk);
1303 	mutex_unlock(&u->bindlock);
1304 	done_path_create(&parent, dentry);
1305 	return 0;
1306 
1307 out_unlock:
1308 	mutex_unlock(&u->bindlock);
1309 	err = -EINVAL;
1310 out_unlink:
1311 	/* failed after successful mknod?  unlink what we'd created... */
1312 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1313 out_path:
1314 	done_path_create(&parent, dentry);
1315 out:
1316 	unix_release_addr(addr);
1317 	return err == -EEXIST ? -EADDRINUSE : err;
1318 }
1319 
1320 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1321 			      int addr_len)
1322 {
1323 	struct unix_sock *u = unix_sk(sk);
1324 	unsigned int new_hash, old_hash;
1325 	struct net *net = sock_net(sk);
1326 	struct unix_address *addr;
1327 	int err;
1328 
1329 	addr = unix_create_addr(sunaddr, addr_len);
1330 	if (!addr)
1331 		return -ENOMEM;
1332 
1333 	err = mutex_lock_interruptible(&u->bindlock);
1334 	if (err)
1335 		goto out;
1336 
1337 	if (u->addr) {
1338 		err = -EINVAL;
1339 		goto out_mutex;
1340 	}
1341 
1342 	old_hash = sk->sk_hash;
1343 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1344 	unix_table_double_lock(net, old_hash, new_hash);
1345 
1346 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1347 		goto out_spin;
1348 
1349 	__unix_set_addr_hash(net, sk, addr, new_hash);
1350 	unix_table_double_unlock(net, old_hash, new_hash);
1351 	mutex_unlock(&u->bindlock);
1352 	return 0;
1353 
1354 out_spin:
1355 	unix_table_double_unlock(net, old_hash, new_hash);
1356 	err = -EADDRINUSE;
1357 out_mutex:
1358 	mutex_unlock(&u->bindlock);
1359 out:
1360 	unix_release_addr(addr);
1361 	return err;
1362 }
1363 
1364 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1365 {
1366 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1367 	struct sock *sk = sock->sk;
1368 	int err;
1369 
1370 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1371 	    sunaddr->sun_family == AF_UNIX)
1372 		return unix_autobind(sk);
1373 
1374 	err = unix_validate_addr(sunaddr, addr_len);
1375 	if (err)
1376 		return err;
1377 
1378 	if (sunaddr->sun_path[0])
1379 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1380 	else
1381 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1382 
1383 	return err;
1384 }
1385 
1386 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1387 {
1388 	if (unlikely(sk1 == sk2) || !sk2) {
1389 		unix_state_lock(sk1);
1390 		return;
1391 	}
1392 
1393 	if (sk1 > sk2)
1394 		swap(sk1, sk2);
1395 
1396 	unix_state_lock(sk1);
1397 	unix_state_lock(sk2);
1398 }
1399 
1400 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1401 {
1402 	if (unlikely(sk1 == sk2) || !sk2) {
1403 		unix_state_unlock(sk1);
1404 		return;
1405 	}
1406 	unix_state_unlock(sk1);
1407 	unix_state_unlock(sk2);
1408 }
1409 
1410 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1411 			      int alen, int flags)
1412 {
1413 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1414 	struct sock *sk = sock->sk;
1415 	struct sock *other;
1416 	int err;
1417 
1418 	err = -EINVAL;
1419 	if (alen < offsetofend(struct sockaddr, sa_family))
1420 		goto out;
1421 
1422 	if (addr->sa_family != AF_UNSPEC) {
1423 		err = unix_validate_addr(sunaddr, alen);
1424 		if (err)
1425 			goto out;
1426 
1427 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1428 		if (err)
1429 			goto out;
1430 
1431 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1432 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1433 		    !READ_ONCE(unix_sk(sk)->addr)) {
1434 			err = unix_autobind(sk);
1435 			if (err)
1436 				goto out;
1437 		}
1438 
1439 restart:
1440 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1441 		if (IS_ERR(other)) {
1442 			err = PTR_ERR(other);
1443 			goto out;
1444 		}
1445 
1446 		unix_state_double_lock(sk, other);
1447 
1448 		/* Apparently VFS overslept socket death. Retry. */
1449 		if (sock_flag(other, SOCK_DEAD)) {
1450 			unix_state_double_unlock(sk, other);
1451 			sock_put(other);
1452 			goto restart;
1453 		}
1454 
1455 		err = -EPERM;
1456 		if (!unix_may_send(sk, other))
1457 			goto out_unlock;
1458 
1459 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1460 		if (err)
1461 			goto out_unlock;
1462 
1463 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1464 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1465 	} else {
1466 		/*
1467 		 *	1003.1g breaking connected state with AF_UNSPEC
1468 		 */
1469 		other = NULL;
1470 		unix_state_double_lock(sk, other);
1471 	}
1472 
1473 	/*
1474 	 * If it was connected, reconnect.
1475 	 */
1476 	if (unix_peer(sk)) {
1477 		struct sock *old_peer = unix_peer(sk);
1478 
1479 		unix_peer(sk) = other;
1480 		if (!other)
1481 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1482 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1483 
1484 		unix_state_double_unlock(sk, other);
1485 
1486 		if (other != old_peer) {
1487 			unix_dgram_disconnected(sk, old_peer);
1488 
1489 			unix_state_lock(old_peer);
1490 			if (!unix_peer(old_peer))
1491 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1492 			unix_state_unlock(old_peer);
1493 		}
1494 
1495 		sock_put(old_peer);
1496 	} else {
1497 		unix_peer(sk) = other;
1498 		unix_state_double_unlock(sk, other);
1499 	}
1500 
1501 	return 0;
1502 
1503 out_unlock:
1504 	unix_state_double_unlock(sk, other);
1505 	sock_put(other);
1506 out:
1507 	return err;
1508 }
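/* The AF_UNSPEC branch above gives datagram sockets the usual
 * "dissolve the association" semantics (userspace sketch, illustrative):
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));
 *	// fd is no longer connected; a send() without a destination
 *	// address now fails (typically -1/ENOTCONN) until the socket is
 *	// connected again.
 */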
1509 
1510 static long unix_wait_for_peer(struct sock *other, long timeo)
1511 	__releases(&unix_sk(other)->lock)
1512 {
1513 	struct unix_sock *u = unix_sk(other);
1514 	int sched;
1515 	DEFINE_WAIT(wait);
1516 
1517 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1518 
1519 	sched = !sock_flag(other, SOCK_DEAD) &&
1520 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1521 		unix_recvq_full_lockless(other);
1522 
1523 	unix_state_unlock(other);
1524 
1525 	if (sched)
1526 		timeo = schedule_timeout(timeo);
1527 
1528 	finish_wait(&u->peer_wait, &wait);
1529 	return timeo;
1530 }
1531 
1532 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1533 			       int addr_len, int flags)
1534 {
1535 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1536 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1537 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1538 	struct net *net = sock_net(sk);
1539 	struct sk_buff *skb = NULL;
1540 	unsigned char state;
1541 	long timeo;
1542 	int err;
1543 
1544 	err = unix_validate_addr(sunaddr, addr_len);
1545 	if (err)
1546 		goto out;
1547 
1548 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1549 	if (err)
1550 		goto out;
1551 
1552 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1553 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1554 	    !READ_ONCE(u->addr)) {
1555 		err = unix_autobind(sk);
1556 		if (err)
1557 			goto out;
1558 	}
1559 
1560 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1561 
1562 	/* First of all allocate resources.
1563 	 * If we did it after the state is locked,
1564 	 * we would have to recheck everything again in any case.
1565 	 */
1566 
1567 	/* create new sock for complete connection */
1568 	newsk = unix_create1(net, NULL, 0, sock->type);
1569 	if (IS_ERR(newsk)) {
1570 		err = PTR_ERR(newsk);
1571 		goto out;
1572 	}
1573 
1574 	/* Allocate skb for sending to listening sock */
1575 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1576 	if (!skb) {
1577 		err = -ENOMEM;
1578 		goto out_free_sk;
1579 	}
1580 
1581 restart:
1582 	/*  Find listening sock. */
1583 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1584 	if (IS_ERR(other)) {
1585 		err = PTR_ERR(other);
1586 		goto out_free_skb;
1587 	}
1588 
1589 	unix_state_lock(other);
1590 
1591 	/* Apparently VFS overslept socket death. Retry. */
1592 	if (sock_flag(other, SOCK_DEAD)) {
1593 		unix_state_unlock(other);
1594 		sock_put(other);
1595 		goto restart;
1596 	}
1597 
1598 	if (other->sk_state != TCP_LISTEN ||
1599 	    other->sk_shutdown & RCV_SHUTDOWN) {
1600 		err = -ECONNREFUSED;
1601 		goto out_unlock;
1602 	}
1603 
1604 	if (unix_recvq_full_lockless(other)) {
1605 		if (!timeo) {
1606 			err = -EAGAIN;
1607 			goto out_unlock;
1608 		}
1609 
1610 		timeo = unix_wait_for_peer(other, timeo);
1611 		sock_put(other);
1612 
1613 		err = sock_intr_errno(timeo);
1614 		if (signal_pending(current))
1615 			goto out_free_skb;
1616 
1617 		goto restart;
1618 	}
1619 
1620 	/* self connect and simultaneous connect are eliminated
1621 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1622 	 */
1623 	state = READ_ONCE(sk->sk_state);
1624 	if (unlikely(state != TCP_CLOSE)) {
1625 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1626 		goto out_unlock;
1627 	}
1628 
1629 	unix_state_lock(sk);
1630 
1631 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1632 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1633 		unix_state_unlock(sk);
1634 		goto out_unlock;
1635 	}
1636 
1637 	err = security_unix_stream_connect(sk, other, newsk);
1638 	if (err) {
1639 		unix_state_unlock(sk);
1640 		goto out_unlock;
1641 	}
1642 
1643 	/* The way is open! Quickly set all the necessary fields... */
1644 
1645 	sock_hold(sk);
1646 	unix_peer(newsk)	= sk;
1647 	newsk->sk_state		= TCP_ESTABLISHED;
1648 	newsk->sk_type		= sk->sk_type;
1649 	init_peercred(newsk);
1650 	newu = unix_sk(newsk);
1651 	newu->listener = other;
1652 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1653 	otheru = unix_sk(other);
1654 
1655 	/* copy address information from listening to new sock
1656 	 *
1657 	 * The contents of *(otheru->addr) and otheru->path
1658 	 * are seen fully set up here, since we have found
1659 	 * otheru in hash under its lock.  Insertion into the
1660 	 * hash chain we'd found it in had been done in an
1661 	 * earlier critical area protected by the chain's lock,
1662 	 * the same one where we'd set *(otheru->addr) contents,
1663 	 * as well as otheru->path and otheru->addr itself.
1664 	 *
1665 	 * Using smp_store_release() here to set newu->addr
1666 	 * is enough to make those stores, as well as stores
1667 	 * to newu->path visible to anyone who gets newu->addr
1668 	 * by smp_load_acquire().  IOW, the same warranties
1669 	 * as for unix_sock instances bound in unix_bind() or
1670 	 * in unix_autobind().
1671 	 */
1672 	if (otheru->path.dentry) {
1673 		path_get(&otheru->path);
1674 		newu->path = otheru->path;
1675 	}
1676 	refcount_inc(&otheru->addr->refcnt);
1677 	smp_store_release(&newu->addr, otheru->addr);
1678 
1679 	/* Set credentials */
1680 	copy_peercred(sk, other);
1681 
1682 	sock->state	= SS_CONNECTED;
1683 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1684 	sock_hold(newsk);
1685 
1686 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1687 	unix_peer(sk)	= newsk;
1688 
1689 	unix_state_unlock(sk);
1690 
1691 	/* take ten and send info to listening sock */
1692 	spin_lock(&other->sk_receive_queue.lock);
1693 	__skb_queue_tail(&other->sk_receive_queue, skb);
1694 	spin_unlock(&other->sk_receive_queue.lock);
1695 	unix_state_unlock(other);
1696 	other->sk_data_ready(other);
1697 	sock_put(other);
1698 	return 0;
1699 
1700 out_unlock:
1701 	unix_state_unlock(other);
1702 	sock_put(other);
1703 out_free_skb:
1704 	consume_skb(skb);
1705 out_free_sk:
1706 	unix_release_sock(newsk, 0);
1707 out:
1708 	return err;
1709 }
1710 
1711 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1712 {
1713 	struct sock *ska = socka->sk, *skb = sockb->sk;
1714 
1715 	/* Join our sockets back to back */
1716 	sock_hold(ska);
1717 	sock_hold(skb);
1718 	unix_peer(ska) = skb;
1719 	unix_peer(skb) = ska;
1720 	init_peercred(ska);
1721 	init_peercred(skb);
1722 
1723 	ska->sk_state = TCP_ESTABLISHED;
1724 	skb->sk_state = TCP_ESTABLISHED;
1725 	socka->state  = SS_CONNECTED;
1726 	sockb->state  = SS_CONNECTED;
1727 	return 0;
1728 }
1729 
1730 static void unix_sock_inherit_flags(const struct socket *old,
1731 				    struct socket *new)
1732 {
1733 	if (test_bit(SOCK_PASSCRED, &old->flags))
1734 		set_bit(SOCK_PASSCRED, &new->flags);
1735 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1736 		set_bit(SOCK_PASSPIDFD, &new->flags);
1737 	if (test_bit(SOCK_PASSSEC, &old->flags))
1738 		set_bit(SOCK_PASSSEC, &new->flags);
1739 }
1740 
1741 static int unix_accept(struct socket *sock, struct socket *newsock,
1742 		       struct proto_accept_arg *arg)
1743 {
1744 	struct sock *sk = sock->sk;
1745 	struct sk_buff *skb;
1746 	struct sock *tsk;
1747 
1748 	arg->err = -EOPNOTSUPP;
1749 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1750 		goto out;
1751 
1752 	arg->err = -EINVAL;
1753 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1754 		goto out;
1755 
1756 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1757 	 * so that no locks are necessary.
1758 	 */
1759 
1760 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1761 				&arg->err);
1762 	if (!skb) {
1763 		/* This means receive shutdown. */
1764 		if (arg->err == 0)
1765 			arg->err = -EINVAL;
1766 		goto out;
1767 	}
1768 
1769 	tsk = skb->sk;
1770 	skb_free_datagram(sk, skb);
1771 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1772 
1773 	/* attach accepted sock to socket */
1774 	unix_state_lock(tsk);
1775 	unix_update_edges(unix_sk(tsk));
1776 	newsock->state = SS_CONNECTED;
1777 	unix_sock_inherit_flags(sock, newsock);
1778 	sock_graft(tsk, newsock);
1779 	unix_state_unlock(tsk);
1780 	return 0;
1781 
1782 out:
1783 	return arg->err;
1784 }
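
/* Userspace view of the accept path above, as an illustrative sketch (the
 * socket path is only an example):
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <string.h>
 *
 *	int srv = socket(AF_UNIX, SOCK_STREAM, 0);
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	int conn;
 *
 *	strcpy(sun.sun_path, "/tmp/example.sock");
 *	bind(srv, (struct sockaddr *)&sun, sizeof(sun));
 *	listen(srv, 8);
 *	conn = accept(srv, NULL, NULL);
 *
 * Each accept() pops one already-connected socket from the listener's
 * sk_receive_queue, which is the skb_recv_datagram() step in unix_accept().
 */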
1785 
1786 
1787 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1788 {
1789 	struct sock *sk = sock->sk;
1790 	struct unix_address *addr;
1791 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1792 	int err = 0;
1793 
1794 	if (peer) {
1795 		sk = unix_peer_get(sk);
1796 
1797 		err = -ENOTCONN;
1798 		if (!sk)
1799 			goto out;
1800 		err = 0;
1801 	} else {
1802 		sock_hold(sk);
1803 	}
1804 
1805 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1806 	if (!addr) {
1807 		sunaddr->sun_family = AF_UNIX;
1808 		sunaddr->sun_path[0] = 0;
1809 		err = offsetof(struct sockaddr_un, sun_path);
1810 	} else {
1811 		err = addr->len;
1812 		memcpy(sunaddr, addr->name, addr->len);
1813 
1814 		if (peer)
1815 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1816 					       CGROUP_UNIX_GETPEERNAME);
1817 		else
1818 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1819 					       CGROUP_UNIX_GETSOCKNAME);
1820 	}
1821 	sock_put(sk);
1822 out:
1823 	return err;
1824 }
1825 
1826 /* The "user->unix_inflight" variable is protected by the garbage
1827  * collection lock, and we just read it locklessly here. If you go
1828  * over the limit, there might be a tiny race in actually noticing
1829  * it across threads. Tough.
1830  */
1831 static inline bool too_many_unix_fds(struct task_struct *p)
1832 {
1833 	struct user_struct *user = current_user();
1834 
1835 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1836 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1837 	return false;
1838 }
1839 
1840 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1841 {
1842 	if (too_many_unix_fds(current))
1843 		return -ETOOMANYREFS;
1844 
1845 	UNIXCB(skb).fp = scm->fp;
1846 	scm->fp = NULL;
1847 
1848 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1849 		return -ENOMEM;
1850 
1851 	return 0;
1852 }
1853 
1854 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1855 {
1856 	scm->fp = UNIXCB(skb).fp;
1857 	UNIXCB(skb).fp = NULL;
1858 
1859 	unix_destroy_fpl(scm->fp);
1860 }
1861 
1862 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1863 {
1864 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1865 }
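
/* Userspace half of the fd passing that unix_attach_fds()/unix_detach_fds()
 * implement on the kernel side; an illustrative sketch (the helper name is
 * ours):
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	static int send_fd(int sock, int fd_to_pass)
 *	{
 *		char control[CMSG_SPACE(sizeof(int))] = {};
 *		char ping = 'x';
 *		struct iovec iov = { .iov_base = &ping, .iov_len = 1 };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = control, .msg_controllen = sizeof(control),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *
 *		return sendmsg(sock, &msg, 0);
 *	}
 *
 * The receiver gets the file installed as a new descriptor via the matching
 * SCM_RIGHTS cmsg in recvmsg(); MSG_PEEK only duplicates the references
 * (unix_peek_fds() above).
 */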
1866 
1867 static void unix_destruct_scm(struct sk_buff *skb)
1868 {
1869 	struct scm_cookie scm;
1870 
1871 	memset(&scm, 0, sizeof(scm));
1872 	scm.pid  = UNIXCB(skb).pid;
1873 	if (UNIXCB(skb).fp)
1874 		unix_detach_fds(&scm, skb);
1875 
1876 	/* Alas, it calls VFS */
1877 	/* So fscking what? fput() has been SMP-safe since last summer */
1878 	scm_destroy(&scm);
1879 	sock_wfree(skb);
1880 }
1881 
1882 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1883 {
1884 	int err = 0;
1885 
1886 	UNIXCB(skb).pid  = get_pid(scm->pid);
1887 	UNIXCB(skb).uid = scm->creds.uid;
1888 	UNIXCB(skb).gid = scm->creds.gid;
1889 	UNIXCB(skb).fp = NULL;
1890 	unix_get_secdata(scm, skb);
1891 	if (scm->fp && send_fds)
1892 		err = unix_attach_fds(scm, skb);
1893 
1894 	skb->destructor = unix_destruct_scm;
1895 	return err;
1896 }
1897 
1898 static bool unix_passcred_enabled(const struct socket *sock,
1899 				  const struct sock *other)
1900 {
1901 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1902 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1903 	       !other->sk_socket ||
1904 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1905 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1906 }
1907 
1908 /*
1909  * Some apps rely on write() giving SCM_CREDENTIALS
1910  * We include credentials if source or destination socket
1911  * asserted SOCK_PASSCRED.
1912  */
1913 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1914 			    const struct sock *other)
1915 {
1916 	if (UNIXCB(skb).pid)
1917 		return;
1918 	if (unix_passcred_enabled(sock, other)) {
1919 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1920 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1921 	}
1922 }
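
/* Userspace counterpart, as an illustrative sketch: the receiver opts in and
 * then reads the credentials attached above from the ancillary data (the
 * Linux-specific struct ucred needs _GNU_SOURCE):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &(int){ 1 }, sizeof(int));
 *
 *	... after recvmsg(fd, &msg, 0) ...
 *
 *	for (struct cmsghdr *c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
 *		if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_CREDENTIALS) {
 *			struct ucred *uc = (struct ucred *)CMSG_DATA(c);
 *
 *			printf("pid=%d uid=%u gid=%u\n", uc->pid, uc->uid, uc->gid);
 *		}
 *	}
 */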
1923 
1924 static bool unix_skb_scm_eq(struct sk_buff *skb,
1925 			    struct scm_cookie *scm)
1926 {
1927 	return UNIXCB(skb).pid == scm->pid &&
1928 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1929 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1930 	       unix_secdata_eq(scm, skb);
1931 }
1932 
1933 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1934 {
1935 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1936 	struct unix_sock *u = unix_sk(sk);
1937 
1938 	if (unlikely(fp && fp->count)) {
1939 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1940 		unix_add_edges(fp, u);
1941 	}
1942 }
1943 
1944 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1945 {
1946 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1947 	struct unix_sock *u = unix_sk(sk);
1948 
1949 	if (unlikely(fp && fp->count)) {
1950 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1951 		unix_del_edges(fp);
1952 	}
1953 }
1954 
1955 /*
1956  *	Send AF_UNIX data.
1957  */
1958 
1959 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1960 			      size_t len)
1961 {
1962 	struct sock *sk = sock->sk, *other = NULL;
1963 	struct unix_sock *u = unix_sk(sk);
1964 	struct scm_cookie scm;
1965 	struct sk_buff *skb;
1966 	int data_len = 0;
1967 	int sk_locked;
1968 	long timeo;
1969 	int err;
1970 
1971 	err = scm_send(sock, msg, &scm, false);
1972 	if (err < 0)
1973 		return err;
1974 
1975 	wait_for_unix_gc(scm.fp);
1976 
1977 	if (msg->msg_flags & MSG_OOB) {
1978 		err = -EOPNOTSUPP;
1979 		goto out;
1980 	}
1981 
1982 	if (msg->msg_namelen) {
1983 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
1984 		if (err)
1985 			goto out;
1986 
1987 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1988 							    msg->msg_name,
1989 							    &msg->msg_namelen,
1990 							    NULL);
1991 		if (err)
1992 			goto out;
1993 	}
1994 
1995 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1996 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1997 	    !READ_ONCE(u->addr)) {
1998 		err = unix_autobind(sk);
1999 		if (err)
2000 			goto out;
2001 	}
2002 
2003 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
2004 		err = -EMSGSIZE;
2005 		goto out;
2006 	}
2007 
2008 	if (len > SKB_MAX_ALLOC) {
2009 		data_len = min_t(size_t,
2010 				 len - SKB_MAX_ALLOC,
2011 				 MAX_SKB_FRAGS * PAGE_SIZE);
2012 		data_len = PAGE_ALIGN(data_len);
2013 
2014 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2015 	}
2016 
2017 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2018 				   msg->msg_flags & MSG_DONTWAIT, &err,
2019 				   PAGE_ALLOC_COSTLY_ORDER);
2020 	if (!skb)
2021 		goto out;
2022 
2023 	err = unix_scm_to_skb(&scm, skb, true);
2024 	if (err < 0)
2025 		goto out_free;
2026 
2027 	skb_put(skb, len - data_len);
2028 	skb->data_len = data_len;
2029 	skb->len = len;
2030 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2031 	if (err)
2032 		goto out_free;
2033 
2034 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2035 
2036 	if (msg->msg_namelen) {
2037 lookup:
2038 		other = unix_find_other(sock_net(sk), msg->msg_name,
2039 					msg->msg_namelen, sk->sk_type);
2040 		if (IS_ERR(other)) {
2041 			err = PTR_ERR(other);
2042 			goto out_free;
2043 		}
2044 	} else {
2045 		other = unix_peer_get(sk);
2046 		if (!other) {
2047 			err = -ENOTCONN;
2048 			goto out_free;
2049 		}
2050 	}
2051 
2052 	if (sk_filter(other, skb) < 0) {
2053 		/* Toss the packet but do not return any error to the sender */
2054 		err = len;
2055 		goto out_sock_put;
2056 	}
2057 
2058 restart:
2059 	sk_locked = 0;
2060 	unix_state_lock(other);
2061 restart_locked:
2062 
2063 	if (!unix_may_send(sk, other)) {
2064 		err = -EPERM;
2065 		goto out_unlock;
2066 	}
2067 
2068 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2069 		/* Check with 1003.1g - what should a datagram error do here? */
2070 
2071 		unix_state_unlock(other);
2072 
2073 		if (sk->sk_type == SOCK_SEQPACKET) {
2074 			/* We are here only when racing with unix_release_sock()
2075 			 * is clearing @other. Never change state to TCP_CLOSE
2076 			 * unlike SOCK_DGRAM wants.
2077 			 */
2078 			err = -EPIPE;
2079 			goto out_sock_put;
2080 		}
2081 
2082 		if (!sk_locked)
2083 			unix_state_lock(sk);
2084 
2085 		if (unix_peer(sk) == other) {
2086 			unix_peer(sk) = NULL;
2087 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2088 
2089 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2090 			unix_state_unlock(sk);
2091 
2092 			unix_dgram_disconnected(sk, other);
2093 			sock_put(other);
2094 			err = -ECONNREFUSED;
2095 			goto out_sock_put;
2096 		}
2097 
2098 		unix_state_unlock(sk);
2099 
2100 		if (!msg->msg_namelen) {
2101 			err = -ECONNRESET;
2102 			goto out_sock_put;
2103 		}
2104 
2105 		sock_put(other);
2106 		goto lookup;
2107 	}
2108 
2109 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2110 		err = -EPIPE;
2111 		goto out_unlock;
2112 	}
2113 
2114 	if (sk->sk_type != SOCK_SEQPACKET) {
2115 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2116 		if (err)
2117 			goto out_unlock;
2118 	}
2119 
2120 	/* other == sk && unix_peer(other) != sk if
2121 	 * - unix_peer(sk) == NULL, destination address bound to sk
2122 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2123 	 */
2124 	if (other != sk &&
2125 	    unlikely(unix_peer(other) != sk &&
2126 	    unix_recvq_full_lockless(other))) {
2127 		if (timeo) {
2128 			timeo = unix_wait_for_peer(other, timeo);
2129 
2130 			err = sock_intr_errno(timeo);
2131 			if (signal_pending(current))
2132 				goto out_sock_put;
2133 
2134 			goto restart;
2135 		}
2136 
2137 		if (!sk_locked) {
2138 			unix_state_unlock(other);
2139 			unix_state_double_lock(sk, other);
2140 		}
2141 
2142 		if (unix_peer(sk) != other ||
2143 		    unix_dgram_peer_wake_me(sk, other)) {
2144 			err = -EAGAIN;
2145 			sk_locked = 1;
2146 			goto out_unlock;
2147 		}
2148 
2149 		if (!sk_locked) {
2150 			sk_locked = 1;
2151 			goto restart_locked;
2152 		}
2153 	}
2154 
2155 	if (unlikely(sk_locked))
2156 		unix_state_unlock(sk);
2157 
2158 	if (sock_flag(other, SOCK_RCVTSTAMP))
2159 		__net_timestamp(skb);
2160 	maybe_add_creds(skb, sock, other);
2161 	scm_stat_add(other, skb);
2162 	skb_queue_tail(&other->sk_receive_queue, skb);
2163 	unix_state_unlock(other);
2164 	other->sk_data_ready(other);
2165 	sock_put(other);
2166 	scm_destroy(&scm);
2167 	return len;
2168 
2169 out_unlock:
2170 	if (sk_locked)
2171 		unix_state_unlock(sk);
2172 	unix_state_unlock(other);
2173 out_sock_put:
2174 	sock_put(other);
2175 out_free:
2176 	consume_skb(skb);
2177 out:
2178 	scm_destroy(&scm);
2179 	return err;
2180 }
2181 
2182 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2183  * bytes, and a minimum of a full page.
2184  */
2185 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
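/* Worked example (assuming 4 KiB pages; PAGE_SIZE is arch-dependent):
 * get_order(32768) == 3, so UNIX_SKB_FRAGS_SZ == 8 pages == 32768 bytes.
 * The skb head then carries at most SKB_MAX_HEAD(0) bytes and the remainder
 * is placed in page fragments, see the data_len computation below.
 */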
2186 
2187 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2188 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2189 		     struct scm_cookie *scm, bool fds_sent)
2190 {
2191 	struct unix_sock *ousk = unix_sk(other);
2192 	struct sk_buff *skb;
2193 	int err;
2194 
2195 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2196 
2197 	if (!skb)
2198 		return err;
2199 
2200 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2201 	if (err < 0)
2202 		goto out;
2203 
2204 	skb_put(skb, 1);
2205 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2206 
2207 	if (err)
2208 		goto out;
2209 
2210 	unix_state_lock(other);
2211 
2212 	if (sock_flag(other, SOCK_DEAD) ||
2213 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2214 		unix_state_unlock(other);
2215 		err = -EPIPE;
2216 		goto out;
2217 	}
2218 
2219 	maybe_add_creds(skb, sock, other);
2220 	scm_stat_add(other, skb);
2221 
2222 	spin_lock(&other->sk_receive_queue.lock);
2223 	WRITE_ONCE(ousk->oob_skb, skb);
2224 	__skb_queue_tail(&other->sk_receive_queue, skb);
2225 	spin_unlock(&other->sk_receive_queue.lock);
2226 
2227 	sk_send_sigurg(other);
2228 	unix_state_unlock(other);
2229 	other->sk_data_ready(other);
2230 
2231 	return 0;
2232 out:
2233 	consume_skb(skb);
2234 	return err;
2235 }
2236 #endif
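
/* Out-of-band data from userspace, as an illustrative sketch (modelled on the
 * AF_UNIX MSG_OOB selftests; the last byte of an MSG_OOB send is treated as
 * the out-of-band byte):
 *
 *	send(a, "hello", 5, MSG_OOB);
 *	recv(b, buf, sizeof(buf), 0);		reads "hell", stops at the mark
 *	recv(b, buf, 1, MSG_OOB);		reads the 'o' queued by queue_oob()
 *
 * The receiver also gets SIGURG (sk_send_sigurg() above) and can test the
 * mark with ioctl(b, SIOCATMARK, ...), handled further down in unix_ioctl().
 */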
2237 
2238 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2239 			       size_t len)
2240 {
2241 	struct sock *sk = sock->sk;
2242 	struct sk_buff *skb = NULL;
2243 	struct sock *other = NULL;
2244 	struct scm_cookie scm;
2245 	bool fds_sent = false;
2246 	int err, sent = 0;
2247 
2248 	err = scm_send(sock, msg, &scm, false);
2249 	if (err < 0)
2250 		return err;
2251 
2252 	wait_for_unix_gc(scm.fp);
2253 
2254 	if (msg->msg_flags & MSG_OOB) {
2255 		err = -EOPNOTSUPP;
2256 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2257 		if (len)
2258 			len--;
2259 		else
2260 #endif
2261 			goto out_err;
2262 	}
2263 
2264 	if (msg->msg_namelen) {
2265 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2266 		goto out_err;
2267 	} else {
2268 		other = unix_peer(sk);
2269 		if (!other) {
2270 			err = -ENOTCONN;
2271 			goto out_err;
2272 		}
2273 	}
2274 
2275 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2276 		goto out_pipe;
2277 
2278 	while (sent < len) {
2279 		int size = len - sent;
2280 		int data_len;
2281 
2282 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2283 			skb = sock_alloc_send_pskb(sk, 0, 0,
2284 						   msg->msg_flags & MSG_DONTWAIT,
2285 						   &err, 0);
2286 		} else {
2287 			/* Keep two messages in the pipe so it schedules better */
2288 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2289 
2290 			/* allow fallback to order-0 allocations */
2291 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2292 
2293 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2294 
2295 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2296 
2297 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2298 						   msg->msg_flags & MSG_DONTWAIT, &err,
2299 						   get_order(UNIX_SKB_FRAGS_SZ));
2300 		}
2301 		if (!skb)
2302 			goto out_err;
2303 
2304 		/* Only send the fds in the first buffer */
2305 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2306 		if (err < 0)
2307 			goto out_free;
2308 
2309 		fds_sent = true;
2310 
2311 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2312 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2313 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2314 						   sk->sk_allocation);
2315 			if (err < 0)
2316 				goto out_free;
2317 
2318 			size = err;
2319 			refcount_add(size, &sk->sk_wmem_alloc);
2320 		} else {
2321 			skb_put(skb, size - data_len);
2322 			skb->data_len = data_len;
2323 			skb->len = size;
2324 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2325 			if (err)
2326 				goto out_free;
2327 		}
2328 
2329 		unix_state_lock(other);
2330 
2331 		if (sock_flag(other, SOCK_DEAD) ||
2332 		    (other->sk_shutdown & RCV_SHUTDOWN))
2333 			goto out_pipe_unlock;
2334 
2335 		maybe_add_creds(skb, sock, other);
2336 		scm_stat_add(other, skb);
2337 		skb_queue_tail(&other->sk_receive_queue, skb);
2338 		unix_state_unlock(other);
2339 		other->sk_data_ready(other);
2340 		sent += size;
2341 	}
2342 
2343 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2344 	if (msg->msg_flags & MSG_OOB) {
2345 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2346 		if (err)
2347 			goto out_err;
2348 		sent++;
2349 	}
2350 #endif
2351 
2352 	scm_destroy(&scm);
2353 
2354 	return sent;
2355 
2356 out_pipe_unlock:
2357 	unix_state_unlock(other);
2358 out_pipe:
2359 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2360 		send_sig(SIGPIPE, current, 0);
2361 	err = -EPIPE;
2362 out_free:
2363 	consume_skb(skb);
2364 out_err:
2365 	scm_destroy(&scm);
2366 	return sent ? : err;
2367 }
2368 
2369 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2370 				  size_t len)
2371 {
2372 	int err;
2373 	struct sock *sk = sock->sk;
2374 
2375 	err = sock_error(sk);
2376 	if (err)
2377 		return err;
2378 
2379 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2380 		return -ENOTCONN;
2381 
2382 	if (msg->msg_namelen)
2383 		msg->msg_namelen = 0;
2384 
2385 	return unix_dgram_sendmsg(sock, msg, len);
2386 }
2387 
2388 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2389 				  size_t size, int flags)
2390 {
2391 	struct sock *sk = sock->sk;
2392 
2393 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2394 		return -ENOTCONN;
2395 
2396 	return unix_dgram_recvmsg(sock, msg, size, flags);
2397 }
2398 
2399 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2400 {
2401 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2402 
2403 	if (addr) {
2404 		msg->msg_namelen = addr->len;
2405 		memcpy(msg->msg_name, addr->name, addr->len);
2406 	}
2407 }
2408 
2409 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2410 			 int flags)
2411 {
2412 	struct scm_cookie scm;
2413 	struct socket *sock = sk->sk_socket;
2414 	struct unix_sock *u = unix_sk(sk);
2415 	struct sk_buff *skb, *last;
2416 	long timeo;
2417 	int skip;
2418 	int err;
2419 
2420 	err = -EOPNOTSUPP;
2421 	if (flags&MSG_OOB)
2422 		goto out;
2423 
2424 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2425 
2426 	do {
2427 		mutex_lock(&u->iolock);
2428 
2429 		skip = sk_peek_offset(sk, flags);
2430 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2431 					      &skip, &err, &last);
2432 		if (skb) {
2433 			if (!(flags & MSG_PEEK))
2434 				scm_stat_del(sk, skb);
2435 			break;
2436 		}
2437 
2438 		mutex_unlock(&u->iolock);
2439 
2440 		if (err != -EAGAIN)
2441 			break;
2442 	} while (timeo &&
2443 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2444 					      &err, &timeo, last));
2445 
2446 	if (!skb) { /* implies iolock unlocked */
2447 		unix_state_lock(sk);
2448 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2449 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2450 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2451 			err = 0;
2452 		unix_state_unlock(sk);
2453 		goto out;
2454 	}
2455 
2456 	if (wq_has_sleeper(&u->peer_wait))
2457 		wake_up_interruptible_sync_poll(&u->peer_wait,
2458 						EPOLLOUT | EPOLLWRNORM |
2459 						EPOLLWRBAND);
2460 
2461 	if (msg->msg_name) {
2462 		unix_copy_addr(msg, skb->sk);
2463 
2464 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2465 						      msg->msg_name,
2466 						      &msg->msg_namelen);
2467 	}
2468 
2469 	if (size > skb->len - skip)
2470 		size = skb->len - skip;
2471 	else if (size < skb->len - skip)
2472 		msg->msg_flags |= MSG_TRUNC;
2473 
2474 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2475 	if (err)
2476 		goto out_free;
2477 
2478 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2479 		__sock_recv_timestamp(msg, sk, skb);
2480 
2481 	memset(&scm, 0, sizeof(scm));
2482 
2483 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2484 	unix_set_secdata(&scm, skb);
2485 
2486 	if (!(flags & MSG_PEEK)) {
2487 		if (UNIXCB(skb).fp)
2488 			unix_detach_fds(&scm, skb);
2489 
2490 		sk_peek_offset_bwd(sk, skb->len);
2491 	} else {
2492 		/* It is questionable: on PEEK we could:
2493 		   - do not return fds - good, but too simple 8)
2494 		   - return fds, and do not return them on read (old strategy,
2495 		     apparently wrong)
2496 		   - clone fds (I chose it for now, it is the most universal
2497 		     solution)
2498 
2499 		   POSIX 1003.1g does not actually define this clearly
2500 		   at all. POSIX 1003.1g doesn't define a lot of things
2501 		   clearly however!
2502 
2503 		*/
2504 
2505 		sk_peek_offset_fwd(sk, size);
2506 
2507 		if (UNIXCB(skb).fp)
2508 			unix_peek_fds(&scm, skb);
2509 	}
2510 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2511 
2512 	scm_recv_unix(sock, msg, &scm, flags);
2513 
2514 out_free:
2515 	skb_free_datagram(sk, skb);
2516 	mutex_unlock(&u->iolock);
2517 out:
2518 	return err;
2519 }
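
/* Two recvmsg() flag behaviours handled above, from the userspace side
 * (illustrative sketch):
 *
 *	n = recv(fd, buf, sizeof(buf), MSG_PEEK);
 *		the datagram (and any attached fds, duplicated by
 *		unix_peek_fds()) stays queued for the next read
 *
 *	n = recv(fd, buf, 1, MSG_TRUNC);
 *		copies one byte but returns the full datagram length,
 *		and MSG_TRUNC is set in msg_flags when data was cut
 */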
2520 
2521 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2522 			      int flags)
2523 {
2524 	struct sock *sk = sock->sk;
2525 
2526 #ifdef CONFIG_BPF_SYSCALL
2527 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2528 
2529 	if (prot != &unix_dgram_proto)
2530 		return prot->recvmsg(sk, msg, size, flags, NULL);
2531 #endif
2532 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2533 }
2534 
2535 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2536 {
2537 	struct unix_sock *u = unix_sk(sk);
2538 	struct sk_buff *skb;
2539 	int err;
2540 
2541 	mutex_lock(&u->iolock);
2542 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2543 	mutex_unlock(&u->iolock);
2544 	if (!skb)
2545 		return err;
2546 
2547 	return recv_actor(sk, skb);
2548 }
2549 
2550 /*
2551  *	Sleep until more data has arrived. But check for races..
2552  */
2553 static long unix_stream_data_wait(struct sock *sk, long timeo,
2554 				  struct sk_buff *last, unsigned int last_len,
2555 				  bool freezable)
2556 {
2557 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2558 	struct sk_buff *tail;
2559 	DEFINE_WAIT(wait);
2560 
2561 	unix_state_lock(sk);
2562 
2563 	for (;;) {
2564 		prepare_to_wait(sk_sleep(sk), &wait, state);
2565 
2566 		tail = skb_peek_tail(&sk->sk_receive_queue);
2567 		if (tail != last ||
2568 		    (tail && tail->len != last_len) ||
2569 		    sk->sk_err ||
2570 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2571 		    signal_pending(current) ||
2572 		    !timeo)
2573 			break;
2574 
2575 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2576 		unix_state_unlock(sk);
2577 		timeo = schedule_timeout(timeo);
2578 		unix_state_lock(sk);
2579 
2580 		if (sock_flag(sk, SOCK_DEAD))
2581 			break;
2582 
2583 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2584 	}
2585 
2586 	finish_wait(sk_sleep(sk), &wait);
2587 	unix_state_unlock(sk);
2588 	return timeo;
2589 }
2590 
2591 static unsigned int unix_skb_len(const struct sk_buff *skb)
2592 {
2593 	return skb->len - UNIXCB(skb).consumed;
2594 }
2595 
2596 struct unix_stream_read_state {
2597 	int (*recv_actor)(struct sk_buff *, int, int,
2598 			  struct unix_stream_read_state *);
2599 	struct socket *socket;
2600 	struct msghdr *msg;
2601 	struct pipe_inode_info *pipe;
2602 	size_t size;
2603 	int flags;
2604 	unsigned int splice_flags;
2605 };
2606 
2607 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2608 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2609 {
2610 	struct socket *sock = state->socket;
2611 	struct sock *sk = sock->sk;
2612 	struct unix_sock *u = unix_sk(sk);
2613 	int chunk = 1;
2614 	struct sk_buff *oob_skb;
2615 
2616 	mutex_lock(&u->iolock);
2617 	unix_state_lock(sk);
2618 	spin_lock(&sk->sk_receive_queue.lock);
2619 
2620 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2621 		spin_unlock(&sk->sk_receive_queue.lock);
2622 		unix_state_unlock(sk);
2623 		mutex_unlock(&u->iolock);
2624 		return -EINVAL;
2625 	}
2626 
2627 	oob_skb = u->oob_skb;
2628 
2629 	if (!(state->flags & MSG_PEEK))
2630 		WRITE_ONCE(u->oob_skb, NULL);
2631 
2632 	spin_unlock(&sk->sk_receive_queue.lock);
2633 	unix_state_unlock(sk);
2634 
2635 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2636 
2637 	if (!(state->flags & MSG_PEEK))
2638 		UNIXCB(oob_skb).consumed += 1;
2639 
2640 	mutex_unlock(&u->iolock);
2641 
2642 	if (chunk < 0)
2643 		return -EFAULT;
2644 
2645 	state->msg->msg_flags |= MSG_OOB;
2646 	return 1;
2647 }
2648 
2649 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2650 				  int flags, int copied)
2651 {
2652 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2653 	struct unix_sock *u = unix_sk(sk);
2654 
2655 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2656 		return skb;
2657 
2658 	spin_lock(&sk->sk_receive_queue.lock);
2659 
2660 	if (!unix_skb_len(skb)) {
2661 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2662 			skb = NULL;
2663 		} else if (flags & MSG_PEEK) {
2664 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2665 		} else {
2666 			read_skb = skb;
2667 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2668 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2669 		}
2670 
2671 		if (!skb)
2672 			goto unlock;
2673 	}
2674 
2675 	if (skb != u->oob_skb)
2676 		goto unlock;
2677 
2678 	if (copied) {
2679 		skb = NULL;
2680 	} else if (!(flags & MSG_PEEK)) {
2681 		WRITE_ONCE(u->oob_skb, NULL);
2682 
2683 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2684 			__skb_unlink(skb, &sk->sk_receive_queue);
2685 			unread_skb = skb;
2686 			skb = skb_peek(&sk->sk_receive_queue);
2687 		}
2688 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2689 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2690 	}
2691 
2692 unlock:
2693 	spin_unlock(&sk->sk_receive_queue.lock);
2694 
2695 	consume_skb(read_skb);
2696 	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2697 
2698 	return skb;
2699 }
2700 #endif
2701 
2702 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2703 {
2704 	struct unix_sock *u = unix_sk(sk);
2705 	struct sk_buff *skb;
2706 	int err;
2707 
2708 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2709 		return -ENOTCONN;
2710 
2711 	mutex_lock(&u->iolock);
2712 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2713 	mutex_unlock(&u->iolock);
2714 	if (!skb)
2715 		return err;
2716 
2717 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2718 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2719 		bool drop = false;
2720 
2721 		unix_state_lock(sk);
2722 
2723 		if (sock_flag(sk, SOCK_DEAD)) {
2724 			unix_state_unlock(sk);
2725 			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2726 			return -ECONNRESET;
2727 		}
2728 
2729 		spin_lock(&sk->sk_receive_queue.lock);
2730 		if (likely(skb == u->oob_skb)) {
2731 			WRITE_ONCE(u->oob_skb, NULL);
2732 			drop = true;
2733 		}
2734 		spin_unlock(&sk->sk_receive_queue.lock);
2735 
2736 		unix_state_unlock(sk);
2737 
2738 		if (drop) {
2739 			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2740 			return -EAGAIN;
2741 		}
2742 	}
2743 #endif
2744 
2745 	return recv_actor(sk, skb);
2746 }
2747 
2748 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2749 				    bool freezable)
2750 {
2751 	struct scm_cookie scm;
2752 	struct socket *sock = state->socket;
2753 	struct sock *sk = sock->sk;
2754 	struct unix_sock *u = unix_sk(sk);
2755 	int copied = 0;
2756 	int flags = state->flags;
2757 	int noblock = flags & MSG_DONTWAIT;
2758 	bool check_creds = false;
2759 	int target;
2760 	int err = 0;
2761 	long timeo;
2762 	int skip;
2763 	size_t size = state->size;
2764 	unsigned int last_len;
2765 
2766 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2767 		err = -EINVAL;
2768 		goto out;
2769 	}
2770 
2771 	if (unlikely(flags & MSG_OOB)) {
2772 		err = -EOPNOTSUPP;
2773 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2774 		err = unix_stream_recv_urg(state);
2775 #endif
2776 		goto out;
2777 	}
2778 
2779 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2780 	timeo = sock_rcvtimeo(sk, noblock);
2781 
2782 	memset(&scm, 0, sizeof(scm));
2783 
2784 	/* Lock the socket to prevent queue disordering
2785 	 * while we sleep copying data to the message
2786 	 */
2787 	mutex_lock(&u->iolock);
2788 
2789 	skip = max(sk_peek_offset(sk, flags), 0);
2790 
2791 	do {
2792 		struct sk_buff *skb, *last;
2793 		int chunk;
2794 
2795 redo:
2796 		unix_state_lock(sk);
2797 		if (sock_flag(sk, SOCK_DEAD)) {
2798 			err = -ECONNRESET;
2799 			goto unlock;
2800 		}
2801 		last = skb = skb_peek(&sk->sk_receive_queue);
2802 		last_len = last ? last->len : 0;
2803 
2804 again:
2805 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2806 		if (skb) {
2807 			skb = manage_oob(skb, sk, flags, copied);
2808 			if (!skb && copied) {
2809 				unix_state_unlock(sk);
2810 				break;
2811 			}
2812 		}
2813 #endif
2814 		if (skb == NULL) {
2815 			if (copied >= target)
2816 				goto unlock;
2817 
2818 			/*
2819 			 *	POSIX 1003.1g mandates this order.
2820 			 */
2821 
2822 			err = sock_error(sk);
2823 			if (err)
2824 				goto unlock;
2825 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2826 				goto unlock;
2827 
2828 			unix_state_unlock(sk);
2829 			if (!timeo) {
2830 				err = -EAGAIN;
2831 				break;
2832 			}
2833 
2834 			mutex_unlock(&u->iolock);
2835 
2836 			timeo = unix_stream_data_wait(sk, timeo, last,
2837 						      last_len, freezable);
2838 
2839 			if (signal_pending(current)) {
2840 				err = sock_intr_errno(timeo);
2841 				scm_destroy(&scm);
2842 				goto out;
2843 			}
2844 
2845 			mutex_lock(&u->iolock);
2846 			goto redo;
2847 unlock:
2848 			unix_state_unlock(sk);
2849 			break;
2850 		}
2851 
2852 		while (skip >= unix_skb_len(skb)) {
2853 			skip -= unix_skb_len(skb);
2854 			last = skb;
2855 			last_len = skb->len;
2856 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2857 			if (!skb)
2858 				goto again;
2859 		}
2860 
2861 		unix_state_unlock(sk);
2862 
2863 		if (check_creds) {
2864 			/* Never glue messages from different writers */
2865 			if (!unix_skb_scm_eq(skb, &scm))
2866 				break;
2867 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2868 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2869 			/* Copy credentials */
2870 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2871 			unix_set_secdata(&scm, skb);
2872 			check_creds = true;
2873 		}
2874 
2875 		/* Copy address just once */
2876 		if (state->msg && state->msg->msg_name) {
2877 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2878 					 state->msg->msg_name);
2879 			unix_copy_addr(state->msg, skb->sk);
2880 
2881 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2882 							      state->msg->msg_name,
2883 							      &state->msg->msg_namelen);
2884 
2885 			sunaddr = NULL;
2886 		}
2887 
2888 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2889 		chunk = state->recv_actor(skb, skip, chunk, state);
2890 		if (chunk < 0) {
2891 			if (copied == 0)
2892 				copied = -EFAULT;
2893 			break;
2894 		}
2895 		copied += chunk;
2896 		size -= chunk;
2897 
2898 		/* Mark read part of skb as used */
2899 		if (!(flags & MSG_PEEK)) {
2900 			UNIXCB(skb).consumed += chunk;
2901 
2902 			sk_peek_offset_bwd(sk, chunk);
2903 
2904 			if (UNIXCB(skb).fp) {
2905 				scm_stat_del(sk, skb);
2906 				unix_detach_fds(&scm, skb);
2907 			}
2908 
2909 			if (unix_skb_len(skb))
2910 				break;
2911 
2912 			skb_unlink(skb, &sk->sk_receive_queue);
2913 			consume_skb(skb);
2914 
2915 			if (scm.fp)
2916 				break;
2917 		} else {
2918 			/* It is questionable, see note in unix_dgram_recvmsg.
2919 			 */
2920 			if (UNIXCB(skb).fp)
2921 				unix_peek_fds(&scm, skb);
2922 
2923 			sk_peek_offset_fwd(sk, chunk);
2924 
2925 			if (UNIXCB(skb).fp)
2926 				break;
2927 
2928 			skip = 0;
2929 			last = skb;
2930 			last_len = skb->len;
2931 			unix_state_lock(sk);
2932 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2933 			if (skb)
2934 				goto again;
2935 			unix_state_unlock(sk);
2936 			break;
2937 		}
2938 	} while (size);
2939 
2940 	mutex_unlock(&u->iolock);
2941 	if (state->msg)
2942 		scm_recv_unix(sock, state->msg, &scm, flags);
2943 	else
2944 		scm_destroy(&scm);
2945 out:
2946 	return copied ? : err;
2947 }
2948 
2949 static int unix_stream_read_actor(struct sk_buff *skb,
2950 				  int skip, int chunk,
2951 				  struct unix_stream_read_state *state)
2952 {
2953 	int ret;
2954 
2955 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2956 				    state->msg, chunk);
2957 	return ret ?: chunk;
2958 }
2959 
2960 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2961 			  size_t size, int flags)
2962 {
2963 	struct unix_stream_read_state state = {
2964 		.recv_actor = unix_stream_read_actor,
2965 		.socket = sk->sk_socket,
2966 		.msg = msg,
2967 		.size = size,
2968 		.flags = flags
2969 	};
2970 
2971 	return unix_stream_read_generic(&state, true);
2972 }
2973 
2974 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2975 			       size_t size, int flags)
2976 {
2977 	struct unix_stream_read_state state = {
2978 		.recv_actor = unix_stream_read_actor,
2979 		.socket = sock,
2980 		.msg = msg,
2981 		.size = size,
2982 		.flags = flags
2983 	};
2984 
2985 #ifdef CONFIG_BPF_SYSCALL
2986 	struct sock *sk = sock->sk;
2987 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2988 
2989 	if (prot != &unix_stream_proto)
2990 		return prot->recvmsg(sk, msg, size, flags, NULL);
2991 #endif
2992 	return unix_stream_read_generic(&state, true);
2993 }
2994 
2995 static int unix_stream_splice_actor(struct sk_buff *skb,
2996 				    int skip, int chunk,
2997 				    struct unix_stream_read_state *state)
2998 {
2999 	return skb_splice_bits(skb, state->socket->sk,
3000 			       UNIXCB(skb).consumed + skip,
3001 			       state->pipe, chunk, state->splice_flags);
3002 }
3003 
3004 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3005 				       struct pipe_inode_info *pipe,
3006 				       size_t size, unsigned int flags)
3007 {
3008 	struct unix_stream_read_state state = {
3009 		.recv_actor = unix_stream_splice_actor,
3010 		.socket = sock,
3011 		.pipe = pipe,
3012 		.size = size,
3013 		.splice_flags = flags,
3014 	};
3015 
3016 	if (unlikely(*ppos))
3017 		return -ESPIPE;
3018 
3019 	if (sock->file->f_flags & O_NONBLOCK ||
3020 	    flags & SPLICE_F_NONBLOCK)
3021 		state.flags = MSG_DONTWAIT;
3022 
3023 	return unix_stream_read_generic(&state, false);
3024 }
3025 
3026 static int unix_shutdown(struct socket *sock, int mode)
3027 {
3028 	struct sock *sk = sock->sk;
3029 	struct sock *other;
3030 
3031 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3032 		return -EINVAL;
3033 	/* This maps:
3034 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3035 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3036 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3037 	 */
3038 	++mode;
3039 
3040 	unix_state_lock(sk);
3041 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3042 	other = unix_peer(sk);
3043 	if (other)
3044 		sock_hold(other);
3045 	unix_state_unlock(sk);
3046 	sk->sk_state_change(sk);
3047 
3048 	if (other &&
3049 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3050 
3051 		int peer_mode = 0;
3052 		const struct proto *prot = READ_ONCE(other->sk_prot);
3053 
3054 		if (prot->unhash)
3055 			prot->unhash(other);
3056 		if (mode&RCV_SHUTDOWN)
3057 			peer_mode |= SEND_SHUTDOWN;
3058 		if (mode&SEND_SHUTDOWN)
3059 			peer_mode |= RCV_SHUTDOWN;
3060 		unix_state_lock(other);
3061 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3062 		unix_state_unlock(other);
3063 		other->sk_state_change(other);
3064 		if (peer_mode == SHUTDOWN_MASK)
3065 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3066 		else if (peer_mode & RCV_SHUTDOWN)
3067 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3068 	}
3069 	if (other)
3070 		sock_put(other);
3071 
3072 	return 0;
3073 }
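
/* Userspace view of the peer half-close above (illustrative sketch):
 *
 *	shutdown(a, SHUT_WR);
 *
 * maps to SEND_SHUTDOWN on a and RCV_SHUTDOWN on its peer b, so a blocking
 * read(b, ...) returns 0 (EOF) and poll(b) reports EPOLLIN | EPOLLRDHUP,
 * while b can still write back to a until it shuts down its own write side.
 */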
3074 
3075 long unix_inq_len(struct sock *sk)
3076 {
3077 	struct sk_buff *skb;
3078 	long amount = 0;
3079 
3080 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3081 		return -EINVAL;
3082 
3083 	spin_lock(&sk->sk_receive_queue.lock);
3084 	if (sk->sk_type == SOCK_STREAM ||
3085 	    sk->sk_type == SOCK_SEQPACKET) {
3086 		skb_queue_walk(&sk->sk_receive_queue, skb)
3087 			amount += unix_skb_len(skb);
3088 	} else {
3089 		skb = skb_peek(&sk->sk_receive_queue);
3090 		if (skb)
3091 			amount = skb->len;
3092 	}
3093 	spin_unlock(&sk->sk_receive_queue.lock);
3094 
3095 	return amount;
3096 }
3097 EXPORT_SYMBOL_GPL(unix_inq_len);
3098 
3099 long unix_outq_len(struct sock *sk)
3100 {
3101 	return sk_wmem_alloc_get(sk);
3102 }
3103 EXPORT_SYMBOL_GPL(unix_outq_len);
3104 
3105 static int unix_open_file(struct sock *sk)
3106 {
3107 	struct path path;
3108 	struct file *f;
3109 	int fd;
3110 
3111 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3112 		return -EPERM;
3113 
3114 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3115 		return -ENOENT;
3116 
3117 	path = unix_sk(sk)->path;
3118 	if (!path.dentry)
3119 		return -ENOENT;
3120 
3121 	path_get(&path);
3122 
3123 	fd = get_unused_fd_flags(O_CLOEXEC);
3124 	if (fd < 0)
3125 		goto out;
3126 
3127 	f = dentry_open(&path, O_PATH, current_cred());
3128 	if (IS_ERR(f)) {
3129 		put_unused_fd(fd);
3130 		fd = PTR_ERR(f);
3131 		goto out;
3132 	}
3133 
3134 	fd_install(fd, f);
3135 out:
3136 	path_put(&path);
3137 
3138 	return fd;
3139 }
3140 
3141 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3142 {
3143 	struct sock *sk = sock->sk;
3144 	long amount = 0;
3145 	int err;
3146 
3147 	switch (cmd) {
3148 	case SIOCOUTQ:
3149 		amount = unix_outq_len(sk);
3150 		err = put_user(amount, (int __user *)arg);
3151 		break;
3152 	case SIOCINQ:
3153 		amount = unix_inq_len(sk);
3154 		if (amount < 0)
3155 			err = amount;
3156 		else
3157 			err = put_user(amount, (int __user *)arg);
3158 		break;
3159 	case SIOCUNIXFILE:
3160 		err = unix_open_file(sk);
3161 		break;
3162 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3163 	case SIOCATMARK:
3164 		{
3165 			struct unix_sock *u = unix_sk(sk);
3166 			struct sk_buff *skb;
3167 			int answ = 0;
3168 
3169 			mutex_lock(&u->iolock);
3170 
3171 			skb = skb_peek(&sk->sk_receive_queue);
3172 			if (skb) {
3173 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3174 				struct sk_buff *next_skb;
3175 
3176 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3177 
3178 				if (skb == oob_skb ||
3179 				    (!unix_skb_len(skb) &&
3180 				     (!oob_skb || next_skb == oob_skb)))
3181 					answ = 1;
3182 			}
3183 
3184 			mutex_unlock(&u->iolock);
3185 
3186 			err = put_user(answ, (int __user *)arg);
3187 		}
3188 		break;
3189 #endif
3190 	default:
3191 		err = -ENOIOCTLCMD;
3192 		break;
3193 	}
3194 	return err;
3195 }
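
/* The ioctls above from userspace, as an illustrative sketch:
 *
 *	int n;
 *
 *	ioctl(fd, SIOCINQ, &n);		bytes queued for reading (unix_inq_len())
 *	ioctl(fd, SIOCOUTQ, &n);	bytes sent but not yet consumed by the peer
 *	ioctl(fd, SIOCATMARK, &n);	n == 1 when the next byte to read is the
 *					OOB byte
 *
 * SIOCUNIXFILE instead returns a new O_PATH descriptor for the filesystem
 * inode the socket is bound to and requires CAP_NET_ADMIN (unix_open_file()).
 */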
3196 
3197 #ifdef CONFIG_COMPAT
3198 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3199 {
3200 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3201 }
3202 #endif
3203 
3204 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3205 {
3206 	struct sock *sk = sock->sk;
3207 	unsigned char state;
3208 	__poll_t mask;
3209 	u8 shutdown;
3210 
3211 	sock_poll_wait(file, sock, wait);
3212 	mask = 0;
3213 	shutdown = READ_ONCE(sk->sk_shutdown);
3214 	state = READ_ONCE(sk->sk_state);
3215 
3216 	/* exceptional events? */
3217 	if (READ_ONCE(sk->sk_err))
3218 		mask |= EPOLLERR;
3219 	if (shutdown == SHUTDOWN_MASK)
3220 		mask |= EPOLLHUP;
3221 	if (shutdown & RCV_SHUTDOWN)
3222 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3223 
3224 	/* readable? */
3225 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3226 		mask |= EPOLLIN | EPOLLRDNORM;
3227 	if (sk_is_readable(sk))
3228 		mask |= EPOLLIN | EPOLLRDNORM;
3229 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3230 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3231 		mask |= EPOLLPRI;
3232 #endif
3233 
3234 	/* Connection-based need to check for termination and startup */
3235 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3236 	    state == TCP_CLOSE)
3237 		mask |= EPOLLHUP;
3238 
3239 	/*
3240 	 * we set writable also when the other side has shut down the
3241 	 * connection. This prevents stuck sockets.
3242 	 */
3243 	if (unix_writable(sk, state))
3244 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3245 
3246 	return mask;
3247 }
3248 
3249 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3250 				    poll_table *wait)
3251 {
3252 	struct sock *sk = sock->sk, *other;
3253 	unsigned int writable;
3254 	unsigned char state;
3255 	__poll_t mask;
3256 	u8 shutdown;
3257 
3258 	sock_poll_wait(file, sock, wait);
3259 	mask = 0;
3260 	shutdown = READ_ONCE(sk->sk_shutdown);
3261 	state = READ_ONCE(sk->sk_state);
3262 
3263 	/* exceptional events? */
3264 	if (READ_ONCE(sk->sk_err) ||
3265 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3266 		mask |= EPOLLERR |
3267 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3268 
3269 	if (shutdown & RCV_SHUTDOWN)
3270 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3271 	if (shutdown == SHUTDOWN_MASK)
3272 		mask |= EPOLLHUP;
3273 
3274 	/* readable? */
3275 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3276 		mask |= EPOLLIN | EPOLLRDNORM;
3277 	if (sk_is_readable(sk))
3278 		mask |= EPOLLIN | EPOLLRDNORM;
3279 
3280 	/* Connection-based need to check for termination and startup */
3281 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3282 		mask |= EPOLLHUP;
3283 
3284 	/* No write status requested, avoid expensive OUT tests. */
3285 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3286 		return mask;
3287 
3288 	writable = unix_writable(sk, state);
3289 	if (writable) {
3290 		unix_state_lock(sk);
3291 
3292 		other = unix_peer(sk);
3293 		if (other && unix_peer(other) != sk &&
3294 		    unix_recvq_full_lockless(other) &&
3295 		    unix_dgram_peer_wake_me(sk, other))
3296 			writable = 0;
3297 
3298 		unix_state_unlock(sk);
3299 	}
3300 
3301 	if (writable)
3302 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3303 	else
3304 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3305 
3306 	return mask;
3307 }
3308 
3309 #ifdef CONFIG_PROC_FS
3310 
3311 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3312 
3313 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3314 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3315 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
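/* Worked example: on a 64-bit kernel BUCKET_SPACE is
 * 64 - (UNIX_HASH_BITS + 1) - 1 bits, and *pos packs the bucket index into
 * the high bits with a 1-based offset inside that bucket in the low
 * BUCKET_SPACE bits, so set_bucket_offset(bucket + 1, 1) restarts the walk
 * at the first socket of the next bucket.
 */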
3316 
3317 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3318 {
3319 	unsigned long offset = get_offset(*pos);
3320 	unsigned long bucket = get_bucket(*pos);
3321 	unsigned long count = 0;
3322 	struct sock *sk;
3323 
3324 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3325 	     sk; sk = sk_next(sk)) {
3326 		if (++count == offset)
3327 			break;
3328 	}
3329 
3330 	return sk;
3331 }
3332 
3333 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3334 {
3335 	unsigned long bucket = get_bucket(*pos);
3336 	struct net *net = seq_file_net(seq);
3337 	struct sock *sk;
3338 
3339 	while (bucket < UNIX_HASH_SIZE) {
3340 		spin_lock(&net->unx.table.locks[bucket]);
3341 
3342 		sk = unix_from_bucket(seq, pos);
3343 		if (sk)
3344 			return sk;
3345 
3346 		spin_unlock(&net->unx.table.locks[bucket]);
3347 
3348 		*pos = set_bucket_offset(++bucket, 1);
3349 	}
3350 
3351 	return NULL;
3352 }
3353 
3354 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3355 				  loff_t *pos)
3356 {
3357 	unsigned long bucket = get_bucket(*pos);
3358 
3359 	sk = sk_next(sk);
3360 	if (sk)
3361 		return sk;
3362 
3363 
3364 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3365 
3366 	*pos = set_bucket_offset(++bucket, 1);
3367 
3368 	return unix_get_first(seq, pos);
3369 }
3370 
3371 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3372 {
3373 	if (!*pos)
3374 		return SEQ_START_TOKEN;
3375 
3376 	return unix_get_first(seq, pos);
3377 }
3378 
3379 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3380 {
3381 	++*pos;
3382 
3383 	if (v == SEQ_START_TOKEN)
3384 		return unix_get_first(seq, pos);
3385 
3386 	return unix_get_next(seq, v, pos);
3387 }
3388 
3389 static void unix_seq_stop(struct seq_file *seq, void *v)
3390 {
3391 	struct sock *sk = v;
3392 
3393 	if (sk)
3394 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3395 }
3396 
3397 static int unix_seq_show(struct seq_file *seq, void *v)
3398 {
3399 
3400 	if (v == SEQ_START_TOKEN)
3401 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3402 			 "Inode Path\n");
3403 	else {
3404 		struct sock *s = v;
3405 		struct unix_sock *u = unix_sk(s);
3406 		unix_state_lock(s);
3407 
3408 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3409 			s,
3410 			refcount_read(&s->sk_refcnt),
3411 			0,
3412 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3413 			s->sk_type,
3414 			s->sk_socket ?
3415 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3416 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3417 			sock_i_ino(s));
3418 
3419 		if (u->addr) {	// under a hash table lock here
3420 			int i, len;
3421 			seq_putc(seq, ' ');
3422 
3423 			i = 0;
3424 			len = u->addr->len -
3425 				offsetof(struct sockaddr_un, sun_path);
3426 			if (u->addr->name->sun_path[0]) {
3427 				len--;
3428 			} else {
3429 				seq_putc(seq, '@');
3430 				i++;
3431 			}
3432 			for ( ; i < len; i++)
3433 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3434 					 '@');
3435 		}
3436 		unix_state_unlock(s);
3437 		seq_putc(seq, '\n');
3438 	}
3439 
3440 	return 0;
3441 }
3442 
3443 static const struct seq_operations unix_seq_ops = {
3444 	.start  = unix_seq_start,
3445 	.next   = unix_seq_next,
3446 	.stop   = unix_seq_stop,
3447 	.show   = unix_seq_show,
3448 };
3449 
3450 #ifdef CONFIG_BPF_SYSCALL
3451 struct bpf_unix_iter_state {
3452 	struct seq_net_private p;
3453 	unsigned int cur_sk;
3454 	unsigned int end_sk;
3455 	unsigned int max_sk;
3456 	struct sock **batch;
3457 	bool st_bucket_done;
3458 };
3459 
3460 struct bpf_iter__unix {
3461 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3462 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3463 	uid_t uid __aligned(8);
3464 };
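
/* A minimal BPF iterator program for this target, sketched after the kernel
 * selftests (SEC name "iter/unix" matches the target registered below; the
 * output format is just an example):
 *
 *	SEC("iter/unix")
 *	int dump_unix(struct bpf_iter__unix *ctx)
 *	{
 *		struct unix_sock *unix_sk = ctx->unix_sk;
 *
 *		if (!unix_sk)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "uid=%u\n", ctx->uid);
 *		return 0;
 *	}
 *
 * Pinned with "bpftool iter pin prog.o /sys/fs/bpf/unix_dump", reading the
 * pinned file then drives bpf_iter_unix_seq_show() below for every socket.
 */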
3465 
3466 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3467 			      struct unix_sock *unix_sk, uid_t uid)
3468 {
3469 	struct bpf_iter__unix ctx;
3470 
3471 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3472 	ctx.meta = meta;
3473 	ctx.unix_sk = unix_sk;
3474 	ctx.uid = uid;
3475 	return bpf_iter_run_prog(prog, &ctx);
3476 }
3477 
3478 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3479 
3480 {
3481 	struct bpf_unix_iter_state *iter = seq->private;
3482 	unsigned int expected = 1;
3483 	struct sock *sk;
3484 
3485 	sock_hold(start_sk);
3486 	iter->batch[iter->end_sk++] = start_sk;
3487 
3488 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3489 		if (iter->end_sk < iter->max_sk) {
3490 			sock_hold(sk);
3491 			iter->batch[iter->end_sk++] = sk;
3492 		}
3493 
3494 		expected++;
3495 	}
3496 
3497 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3498 
3499 	return expected;
3500 }
3501 
3502 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3503 {
3504 	while (iter->cur_sk < iter->end_sk)
3505 		sock_put(iter->batch[iter->cur_sk++]);
3506 }
3507 
3508 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3509 				       unsigned int new_batch_sz)
3510 {
3511 	struct sock **new_batch;
3512 
3513 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3514 			     GFP_USER | __GFP_NOWARN);
3515 	if (!new_batch)
3516 		return -ENOMEM;
3517 
3518 	bpf_iter_unix_put_batch(iter);
3519 	kvfree(iter->batch);
3520 	iter->batch = new_batch;
3521 	iter->max_sk = new_batch_sz;
3522 
3523 	return 0;
3524 }
3525 
3526 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3527 					loff_t *pos)
3528 {
3529 	struct bpf_unix_iter_state *iter = seq->private;
3530 	unsigned int expected;
3531 	bool resized = false;
3532 	struct sock *sk;
3533 
3534 	if (iter->st_bucket_done)
3535 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3536 
3537 again:
3538 	/* Get a new batch */
3539 	iter->cur_sk = 0;
3540 	iter->end_sk = 0;
3541 
3542 	sk = unix_get_first(seq, pos);
3543 	if (!sk)
3544 		return NULL; /* Done */
3545 
3546 	expected = bpf_iter_unix_hold_batch(seq, sk);
3547 
3548 	if (iter->end_sk == expected) {
3549 		iter->st_bucket_done = true;
3550 		return sk;
3551 	}
3552 
3553 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3554 		resized = true;
3555 		goto again;
3556 	}
3557 
3558 	return sk;
3559 }
3560 
3561 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3562 {
3563 	if (!*pos)
3564 		return SEQ_START_TOKEN;
3565 
3566 	/* bpf iter does not support lseek, so it always
3567 	 * continues from where it was stop()-ped.
3568 	 */
3569 	return bpf_iter_unix_batch(seq, pos);
3570 }
3571 
3572 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3573 {
3574 	struct bpf_unix_iter_state *iter = seq->private;
3575 	struct sock *sk;
3576 
3577 	/* Whenever seq_next() is called, the iter->cur_sk is
3578 	 * done with seq_show(), so advance to the next sk in
3579 	 * the batch.
3580 	 */
3581 	if (iter->cur_sk < iter->end_sk)
3582 		sock_put(iter->batch[iter->cur_sk++]);
3583 
3584 	++*pos;
3585 
3586 	if (iter->cur_sk < iter->end_sk)
3587 		sk = iter->batch[iter->cur_sk];
3588 	else
3589 		sk = bpf_iter_unix_batch(seq, pos);
3590 
3591 	return sk;
3592 }
3593 
3594 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3595 {
3596 	struct bpf_iter_meta meta;
3597 	struct bpf_prog *prog;
3598 	struct sock *sk = v;
3599 	uid_t uid;
3600 	bool slow;
3601 	int ret;
3602 
3603 	if (v == SEQ_START_TOKEN)
3604 		return 0;
3605 
3606 	slow = lock_sock_fast(sk);
3607 
3608 	if (unlikely(sk_unhashed(sk))) {
3609 		ret = SEQ_SKIP;
3610 		goto unlock;
3611 	}
3612 
3613 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3614 	meta.seq = seq;
3615 	prog = bpf_iter_get_info(&meta, false);
3616 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3617 unlock:
3618 	unlock_sock_fast(sk, slow);
3619 	return ret;
3620 }
3621 
3622 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3623 {
3624 	struct bpf_unix_iter_state *iter = seq->private;
3625 	struct bpf_iter_meta meta;
3626 	struct bpf_prog *prog;
3627 
3628 	if (!v) {
3629 		meta.seq = seq;
3630 		prog = bpf_iter_get_info(&meta, true);
3631 		if (prog)
3632 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3633 	}
3634 
3635 	if (iter->cur_sk < iter->end_sk)
3636 		bpf_iter_unix_put_batch(iter);
3637 }
3638 
3639 static const struct seq_operations bpf_iter_unix_seq_ops = {
3640 	.start	= bpf_iter_unix_seq_start,
3641 	.next	= bpf_iter_unix_seq_next,
3642 	.stop	= bpf_iter_unix_seq_stop,
3643 	.show	= bpf_iter_unix_seq_show,
3644 };
3645 #endif
3646 #endif
3647 
3648 static const struct net_proto_family unix_family_ops = {
3649 	.family = PF_UNIX,
3650 	.create = unix_create,
3651 	.owner	= THIS_MODULE,
3652 };
3653 
3654 
3655 static int __net_init unix_net_init(struct net *net)
3656 {
3657 	int i;
3658 
3659 	net->unx.sysctl_max_dgram_qlen = 10;
3660 	if (unix_sysctl_register(net))
3661 		goto out;
3662 
3663 #ifdef CONFIG_PROC_FS
3664 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3665 			     sizeof(struct seq_net_private)))
3666 		goto err_sysctl;
3667 #endif
3668 
3669 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3670 					      sizeof(spinlock_t), GFP_KERNEL);
3671 	if (!net->unx.table.locks)
3672 		goto err_proc;
3673 
3674 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3675 						sizeof(struct hlist_head),
3676 						GFP_KERNEL);
3677 	if (!net->unx.table.buckets)
3678 		goto free_locks;
3679 
3680 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3681 		spin_lock_init(&net->unx.table.locks[i]);
3682 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3683 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3684 	}
3685 
3686 	return 0;
3687 
3688 free_locks:
3689 	kvfree(net->unx.table.locks);
3690 err_proc:
3691 #ifdef CONFIG_PROC_FS
3692 	remove_proc_entry("unix", net->proc_net);
3693 err_sysctl:
3694 #endif
3695 	unix_sysctl_unregister(net);
3696 out:
3697 	return -ENOMEM;
3698 }
3699 
3700 static void __net_exit unix_net_exit(struct net *net)
3701 {
3702 	kvfree(net->unx.table.buckets);
3703 	kvfree(net->unx.table.locks);
3704 	unix_sysctl_unregister(net);
3705 	remove_proc_entry("unix", net->proc_net);
3706 }
3707 
3708 static struct pernet_operations unix_net_ops = {
3709 	.init = unix_net_init,
3710 	.exit = unix_net_exit,
3711 };
3712 
3713 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3714 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3715 		     struct unix_sock *unix_sk, uid_t uid)
3716 
3717 #define INIT_BATCH_SZ 16
3718 
3719 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3720 {
3721 	struct bpf_unix_iter_state *iter = priv_data;
3722 	int err;
3723 
3724 	err = bpf_iter_init_seq_net(priv_data, aux);
3725 	if (err)
3726 		return err;
3727 
3728 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3729 	if (err) {
3730 		bpf_iter_fini_seq_net(priv_data);
3731 		return err;
3732 	}
3733 
3734 	return 0;
3735 }
3736 
3737 static void bpf_iter_fini_unix(void *priv_data)
3738 {
3739 	struct bpf_unix_iter_state *iter = priv_data;
3740 
3741 	bpf_iter_fini_seq_net(priv_data);
3742 	kvfree(iter->batch);
3743 }
3744 
3745 static const struct bpf_iter_seq_info unix_seq_info = {
3746 	.seq_ops		= &bpf_iter_unix_seq_ops,
3747 	.init_seq_private	= bpf_iter_init_unix,
3748 	.fini_seq_private	= bpf_iter_fini_unix,
3749 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3750 };
3751 
3752 static const struct bpf_func_proto *
3753 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3754 			     const struct bpf_prog *prog)
3755 {
3756 	switch (func_id) {
3757 	case BPF_FUNC_setsockopt:
3758 		return &bpf_sk_setsockopt_proto;
3759 	case BPF_FUNC_getsockopt:
3760 		return &bpf_sk_getsockopt_proto;
3761 	default:
3762 		return NULL;
3763 	}
3764 }
3765 
3766 static struct bpf_iter_reg unix_reg_info = {
3767 	.target			= "unix",
3768 	.ctx_arg_info_size	= 1,
3769 	.ctx_arg_info		= {
3770 		{ offsetof(struct bpf_iter__unix, unix_sk),
3771 		  PTR_TO_BTF_ID_OR_NULL },
3772 	},
3773 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3774 	.seq_info		= &unix_seq_info,
3775 };
3776 
3777 static void __init bpf_iter_register(void)
3778 {
3779 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3780 	if (bpf_iter_reg_target(&unix_reg_info))
3781 		pr_warn("Warning: could not register bpf iterator unix\n");
3782 }
3783 #endif
3784 
3785 static int __init af_unix_init(void)
3786 {
3787 	int i, rc = -1;
3788 
3789 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3790 
3791 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3792 		spin_lock_init(&bsd_socket_locks[i]);
3793 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3794 	}
3795 
3796 	rc = proto_register(&unix_dgram_proto, 1);
3797 	if (rc != 0) {
3798 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3799 		goto out;
3800 	}
3801 
3802 	rc = proto_register(&unix_stream_proto, 1);
3803 	if (rc != 0) {
3804 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3805 		proto_unregister(&unix_dgram_proto);
3806 		goto out;
3807 	}
3808 
3809 	sock_register(&unix_family_ops);
3810 	register_pernet_subsys(&unix_net_ops);
3811 	unix_bpf_build_proto();
3812 
3813 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3814 	bpf_iter_register();
3815 #endif
3816 
3817 out:
3818 	return rc;
3819 }
3820 
3821 /* Later than subsys_initcall() because we depend on stuff initialised there */
3822 fs_initcall(af_unix_init);
3823