xref: /linux/net/unix/af_unix.c (revision 15ecd83dc06277385ad71dc7ea26911d9a79acaf)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with a 0 byte, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/bpf-cgroup.h>
81 #include <linux/btf_ids.h>
82 #include <linux/dcache.h>
83 #include <linux/errno.h>
84 #include <linux/fcntl.h>
85 #include <linux/file.h>
86 #include <linux/filter.h>
87 #include <linux/fs.h>
88 #include <linux/fs_struct.h>
89 #include <linux/init.h>
90 #include <linux/kernel.h>
91 #include <linux/mount.h>
92 #include <linux/namei.h>
93 #include <linux/net.h>
94 #include <linux/pidfs.h>
95 #include <linux/poll.h>
96 #include <linux/proc_fs.h>
97 #include <linux/sched/signal.h>
98 #include <linux/security.h>
99 #include <linux/seq_file.h>
100 #include <linux/skbuff.h>
101 #include <linux/slab.h>
102 #include <linux/socket.h>
103 #include <linux/splice.h>
104 #include <linux/string.h>
105 #include <linux/uaccess.h>
106 #include <net/af_unix.h>
107 #include <net/net_namespace.h>
108 #include <net/scm.h>
109 #include <net/tcp_states.h>
110 #include <uapi/linux/sockios.h>
111 #include <uapi/linux/termios.h>
112 
113 #include "af_unix.h"
114 
115 static atomic_long_t unix_nr_socks;
116 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
117 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
118 
119 /* SMP locking strategy:
120  *    hash table is protected with spinlock.
121  *    each socket state is protected by separate spinlock.
122  */
123 #ifdef CONFIG_PROVE_LOCKING
124 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
125 
126 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
127 				  const struct lockdep_map *b)
128 {
129 	return cmp_ptr(a, b);
130 }
131 
132 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
133 				  const struct lockdep_map *_b)
134 {
135 	const struct unix_sock *a, *b;
136 
137 	a = container_of(_a, struct unix_sock, lock.dep_map);
138 	b = container_of(_b, struct unix_sock, lock.dep_map);
139 
140 	if (a->sk.sk_state == TCP_LISTEN) {
141 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
142 		 *
143 		 *   1. a is TCP_LISTEN.
144 		 *   2. b is not a.
145 		 *   3. concurrent connect(b -> a) must fail.
146 		 *
147 		 * Except for 2. & 3., the b's state can be any possible
148 		 * value due to concurrent connect() or listen().
149 		 *
150 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
151 		 * be expressed as lock_cmp_fn.
152 		 */
153 		switch (b->sk.sk_state) {
154 		case TCP_CLOSE:
155 		case TCP_ESTABLISHED:
156 		case TCP_LISTEN:
157 			return -1;
158 		default:
159 			/* Invalid case. */
160 			return 0;
161 		}
162 	}
163 
164 	/* Should never happen.  Just to be symmetric. */
165 	if (b->sk.sk_state == TCP_LISTEN) {
166 		switch (a->sk.sk_state) {
167 		case TCP_CLOSE:
168 		case TCP_ESTABLISHED:
169 			return 1;
170 		default:
171 			return 0;
172 		}
173 	}
174 
175 	/* unix_state_double_lock(): ascending address order. */
176 	return cmp_ptr(a, b);
177 }
178 
179 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
180 				  const struct lockdep_map *_b)
181 {
182 	const struct sock *a, *b;
183 
184 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
185 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
186 
187 	/* unix_collect_skb(): listener -> embryo order. */
188 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
189 		return -1;
190 
191 	/* Should never happen.  Just to be symmetric. */
192 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
193 		return 1;
194 
195 	return 0;
196 }
197 #endif
198 
199 static unsigned int unix_unbound_hash(struct sock *sk)
200 {
201 	unsigned long hash = (unsigned long)sk;
202 
203 	hash ^= hash >> 16;
204 	hash ^= hash >> 8;
205 	hash ^= sk->sk_type;
206 
207 	return hash & UNIX_HASH_MOD;
208 }
209 
210 static unsigned int unix_bsd_hash(struct inode *i)
211 {
212 	return i->i_ino & UNIX_HASH_MOD;
213 }
214 
215 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
216 				       int addr_len, int type)
217 {
218 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
219 	unsigned int hash;
220 
221 	hash = (__force unsigned int)csum_fold(csum);
222 	hash ^= hash >> 8;
223 	hash ^= type;
224 
225 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
226 }
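
/* Note on the helpers above: with the UNIX_HASH_MOD / UNIX_HASH_SIZE
 * constants this file is built with, unix_unbound_hash() and unix_bsd_hash()
 * land in buckets [0, UNIX_HASH_MOD], while unix_abstract_hash() lands in
 * [UNIX_HASH_MOD + 1, UNIX_HASH_SIZE - 1].  Unbound/pathname sockets and
 * abstract sockets therefore never share a bucket or a bucket lock.
 */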
227 
228 static void unix_table_double_lock(struct net *net,
229 				   unsigned int hash1, unsigned int hash2)
230 {
231 	if (hash1 == hash2) {
232 		spin_lock(&net->unx.table.locks[hash1]);
233 		return;
234 	}
235 
236 	if (hash1 > hash2)
237 		swap(hash1, hash2);
238 
239 	spin_lock(&net->unx.table.locks[hash1]);
240 	spin_lock(&net->unx.table.locks[hash2]);
241 }
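
/* unix_table_double_lock() always takes the lower-indexed bucket lock first,
 * so two tasks re-hashing sockets between the same pair of buckets in
 * opposite directions cannot deadlock; unix_table_lock_cmp_fn() above is
 * intended to encode the same ordering for lockdep.
 */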
242 
243 static void unix_table_double_unlock(struct net *net,
244 				     unsigned int hash1, unsigned int hash2)
245 {
246 	if (hash1 == hash2) {
247 		spin_unlock(&net->unx.table.locks[hash1]);
248 		return;
249 	}
250 
251 	spin_unlock(&net->unx.table.locks[hash1]);
252 	spin_unlock(&net->unx.table.locks[hash2]);
253 }
254 
255 #ifdef CONFIG_SECURITY_NETWORK
256 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
257 {
258 	UNIXCB(skb).secid = scm->secid;
259 }
260 
261 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
262 {
263 	scm->secid = UNIXCB(skb).secid;
264 }
265 
266 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
267 {
268 	return (scm->secid == UNIXCB(skb).secid);
269 }
270 #else
271 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
272 { }
273 
274 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
275 { }
276 
277 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
278 {
279 	return true;
280 }
281 #endif /* CONFIG_SECURITY_NETWORK */
282 
283 static inline int unix_may_send(struct sock *sk, struct sock *osk)
284 {
285 	return !unix_peer(osk) || unix_peer(osk) == sk;
286 }
287 
288 static inline int unix_recvq_full_lockless(const struct sock *sk)
289 {
290 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
291 }
292 
293 struct sock *unix_peer_get(struct sock *s)
294 {
295 	struct sock *peer;
296 
297 	unix_state_lock(s);
298 	peer = unix_peer(s);
299 	if (peer)
300 		sock_hold(peer);
301 	unix_state_unlock(s);
302 	return peer;
303 }
304 EXPORT_SYMBOL_GPL(unix_peer_get);
305 
306 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
307 					     int addr_len)
308 {
309 	struct unix_address *addr;
310 
311 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
312 	if (!addr)
313 		return NULL;
314 
315 	refcount_set(&addr->refcnt, 1);
316 	addr->len = addr_len;
317 	memcpy(addr->name, sunaddr, addr_len);
318 
319 	return addr;
320 }
321 
322 static inline void unix_release_addr(struct unix_address *addr)
323 {
324 	if (refcount_dec_and_test(&addr->refcnt))
325 		kfree(addr);
326 }
327 
328 /*
329  *	Check unix socket name:
330  *		- should not be zero length.
331  *	        - if it does not start with a 0 byte, it should be NUL-terminated (FS object).
332  *		- if it starts with a 0 byte, it is an abstract name.
333  */
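
/* For illustration only (hypothetical userspace snippet, not part of this
 * file): the two named forms above would typically be bound like so, given
 * an fd from socket(AF_UNIX, SOCK_STREAM, 0).  Pathname form:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	strcpy(sun.sun_path, "/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&sun, sizeof(sun));
 *
 * or, abstract form, with a leading 0 byte and an addr_len covering exactly
 * the name:
 *
 *	memcpy(sun.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */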
334 
335 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
336 {
337 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
338 	    addr_len > sizeof(*sunaddr))
339 		return -EINVAL;
340 
341 	if (sunaddr->sun_family != AF_UNIX)
342 		return -EINVAL;
343 
344 	return 0;
345 }
346 
347 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
348 {
349 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
350 	short offset = offsetof(struct sockaddr_storage, __data);
351 
352 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
353 
354 	/* This may look like an off by one error but it is a bit more
355 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
356 	 * sun_path[108] doesn't as such exist.  However in kernel space
357 	 * we are guaranteed that it is a valid memory location in our
358 	 * kernel address buffer because syscall functions always pass
359 	 * a pointer of struct sockaddr_storage which has a bigger buffer
360 	 * than 108.  Also, we must terminate sun_path for strlen() in
361 	 * getname_kernel().
362 	 */
363 	addr->__data[addr_len - offset] = 0;
364 
365 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
366 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
367 	 * know the actual buffer.
368 	 */
369 	return strlen(addr->__data) + offset + 1;
370 }
371 
372 static void __unix_remove_socket(struct sock *sk)
373 {
374 	sk_del_node_init(sk);
375 }
376 
377 static void __unix_insert_socket(struct net *net, struct sock *sk)
378 {
379 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
380 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
381 }
382 
383 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
384 				 struct unix_address *addr, unsigned int hash)
385 {
386 	__unix_remove_socket(sk);
387 	smp_store_release(&unix_sk(sk)->addr, addr);
388 
389 	sk->sk_hash = hash;
390 	__unix_insert_socket(net, sk);
391 }
392 
393 static void unix_remove_socket(struct net *net, struct sock *sk)
394 {
395 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
396 	__unix_remove_socket(sk);
397 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
398 }
399 
400 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
401 {
402 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
403 	__unix_insert_socket(net, sk);
404 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
405 }
406 
407 static void unix_insert_bsd_socket(struct sock *sk)
408 {
409 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
410 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
411 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
412 }
413 
414 static void unix_remove_bsd_socket(struct sock *sk)
415 {
416 	if (!hlist_unhashed(&sk->sk_bind_node)) {
417 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
418 		__sk_del_bind_node(sk);
419 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
420 
421 		sk_node_init(&sk->sk_bind_node);
422 	}
423 }
424 
425 static struct sock *__unix_find_socket_byname(struct net *net,
426 					      struct sockaddr_un *sunname,
427 					      int len, unsigned int hash)
428 {
429 	struct sock *s;
430 
431 	sk_for_each(s, &net->unx.table.buckets[hash]) {
432 		struct unix_sock *u = unix_sk(s);
433 
434 		if (u->addr->len == len &&
435 		    !memcmp(u->addr->name, sunname, len))
436 			return s;
437 	}
438 	return NULL;
439 }
440 
441 static inline struct sock *unix_find_socket_byname(struct net *net,
442 						   struct sockaddr_un *sunname,
443 						   int len, unsigned int hash)
444 {
445 	struct sock *s;
446 
447 	spin_lock(&net->unx.table.locks[hash]);
448 	s = __unix_find_socket_byname(net, sunname, len, hash);
449 	if (s)
450 		sock_hold(s);
451 	spin_unlock(&net->unx.table.locks[hash]);
452 	return s;
453 }
454 
455 static struct sock *unix_find_socket_byinode(struct inode *i)
456 {
457 	unsigned int hash = unix_bsd_hash(i);
458 	struct sock *s;
459 
460 	spin_lock(&bsd_socket_locks[hash]);
461 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
462 		struct dentry *dentry = unix_sk(s)->path.dentry;
463 
464 		if (dentry && d_backing_inode(dentry) == i) {
465 			sock_hold(s);
466 			spin_unlock(&bsd_socket_locks[hash]);
467 			return s;
468 		}
469 	}
470 	spin_unlock(&bsd_socket_locks[hash]);
471 	return NULL;
472 }
473 
474 /* Support code for asymmetrically connected dgram sockets
475  *
476  * If a datagram socket is connected to a socket not itself connected
477  * to the first socket (e.g., /dev/log), clients may only enqueue more
478  * messages if the present receive queue of the server socket is not
479  * "too large". This means there's a second writeability condition
480  * poll and sendmsg need to test. The dgram recv code will do a wake
481  * up on the peer_wait wait queue of a socket upon reception of a
482  * datagram which needs to be propagated to sleeping would-be writers
483  * since these might not have sent anything so far. This can't be
484  * accomplished via poll_wait because the lifetime of the server
485  * socket might be less than that of its clients if these break their
486  * association with it or if the server socket is closed while clients
487  * are still connected to it and there's no way to inform "a polling
488  * implementation" that it should let go of a certain wait queue.
489  *
490  * In order to propagate a wake up, a wait_queue_entry_t of the client
491  * socket is enqueued on the peer_wait queue of the server socket
492  * whose wake function does a wake_up on the ordinary client socket
493  * wait queue. This connection is established whenever a write (or
494  * poll for write) hits the flow control condition and is broken when the
495  * association to the server socket is dissolved or after a wake up
496  * was relayed.
497  */
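
/* A concrete (hypothetical) example of the situation described above: a
 * SOCK_DGRAM client connect()ed to a logging daemon's socket polls for
 * EPOLLOUT while the daemon's receive queue is at its limit.  Nothing the
 * client does can make progress on its own; only the daemon reading (and
 * the dgram recv code waking peer_wait, as noted above) can.  The relay
 * entry set up below is what turns that wake-up on the server's peer_wait
 * into a wake-up on the client's own wait queue.
 */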
498 
499 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
500 				      void *key)
501 {
502 	struct unix_sock *u;
503 	wait_queue_head_t *u_sleep;
504 
505 	u = container_of(q, struct unix_sock, peer_wake);
506 
507 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
508 			    q);
509 	u->peer_wake.private = NULL;
510 
511 	/* relaying can only happen while the wq still exists */
512 	u_sleep = sk_sleep(&u->sk);
513 	if (u_sleep)
514 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
515 
516 	return 0;
517 }
518 
519 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
520 {
521 	struct unix_sock *u, *u_other;
522 	int rc;
523 
524 	u = unix_sk(sk);
525 	u_other = unix_sk(other);
526 	rc = 0;
527 	spin_lock(&u_other->peer_wait.lock);
528 
529 	if (!u->peer_wake.private) {
530 		u->peer_wake.private = other;
531 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
532 
533 		rc = 1;
534 	}
535 
536 	spin_unlock(&u_other->peer_wait.lock);
537 	return rc;
538 }
539 
540 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
541 					    struct sock *other)
542 {
543 	struct unix_sock *u, *u_other;
544 
545 	u = unix_sk(sk);
546 	u_other = unix_sk(other);
547 	spin_lock(&u_other->peer_wait.lock);
548 
549 	if (u->peer_wake.private == other) {
550 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
551 		u->peer_wake.private = NULL;
552 	}
553 
554 	spin_unlock(&u_other->peer_wait.lock);
555 }
556 
557 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
558 						   struct sock *other)
559 {
560 	unix_dgram_peer_wake_disconnect(sk, other);
561 	wake_up_interruptible_poll(sk_sleep(sk),
562 				   EPOLLOUT |
563 				   EPOLLWRNORM |
564 				   EPOLLWRBAND);
565 }
566 
567 /* preconditions:
568  *	- unix_peer(sk) == other
569  *	- association is stable
570  */
571 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
572 {
573 	int connected;
574 
575 	connected = unix_dgram_peer_wake_connect(sk, other);
576 
577 	/* If other is SOCK_DEAD, we want to make sure we signal
578 	 * POLLOUT, such that a subsequent write() can get a
579 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
580 	 * to other and its queue is full, we will hang waiting for POLLOUT.
581 	 */
582 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
583 		return 1;
584 
585 	if (connected)
586 		unix_dgram_peer_wake_disconnect(sk, other);
587 
588 	return 0;
589 }
590 
591 static int unix_writable(const struct sock *sk, unsigned char state)
592 {
593 	return state != TCP_LISTEN &&
594 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
595 }
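
/* Worked example (assuming a default sk_sndbuf of 212992 bytes): the
 * condition above keeps the socket writable while outstanding write memory
 * is at most sk_sndbuf / 4, i.e. 53248 bytes; past that, poll() stops
 * reporting EPOLLOUT until enough skbs are freed via sock_wfree().
 */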
596 
597 static void unix_write_space(struct sock *sk)
598 {
599 	struct socket_wq *wq;
600 
601 	rcu_read_lock();
602 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
603 		wq = rcu_dereference(sk->sk_wq);
604 		if (skwq_has_sleeper(wq))
605 			wake_up_interruptible_sync_poll(&wq->wait,
606 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
607 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
608 	}
609 	rcu_read_unlock();
610 }
611 
612 /* When a dgram socket disconnects (or changes its peer), we clear its receive
613  * queue of packets that arrived from the previous peer. First, this allows
614  * flow control based only on wmem_alloc; second, an sk connected to a peer
615  * may receive messages only from that peer. */
616 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
617 {
618 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
619 		skb_queue_purge_reason(&sk->sk_receive_queue,
620 				       SKB_DROP_REASON_UNIX_DISCONNECT);
621 
622 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
623 
624 		/* If one link of a bidirectional dgram pipe is disconnected,
625 		 * we signal an error. Messages are lost. Do not do this
626 		 * when the peer was not connected to us.
627 		 */
628 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
629 			WRITE_ONCE(other->sk_err, ECONNRESET);
630 			sk_error_report(other);
631 		}
632 	}
633 }
634 
635 static void unix_sock_destructor(struct sock *sk)
636 {
637 	struct unix_sock *u = unix_sk(sk);
638 
639 	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
640 
641 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
642 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
643 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
644 	if (!sock_flag(sk, SOCK_DEAD)) {
645 		pr_info("Attempt to release alive unix socket: %p\n", sk);
646 		return;
647 	}
648 
649 	if (sk->sk_peer_pid)
650 		pidfs_put_pid(sk->sk_peer_pid);
651 
652 	if (u->addr)
653 		unix_release_addr(u->addr);
654 
655 	atomic_long_dec(&unix_nr_socks);
656 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
657 #ifdef UNIX_REFCNT_DEBUG
658 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
659 		atomic_long_read(&unix_nr_socks));
660 #endif
661 }
662 
663 static void unix_release_sock(struct sock *sk, int embrion)
664 {
665 	struct unix_sock *u = unix_sk(sk);
666 	struct sock *skpair;
667 	struct sk_buff *skb;
668 	struct path path;
669 	int state;
670 
671 	unix_remove_socket(sock_net(sk), sk);
672 	unix_remove_bsd_socket(sk);
673 
674 	/* Clear state */
675 	unix_state_lock(sk);
676 	sock_orphan(sk);
677 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
678 	path	     = u->path;
679 	u->path.dentry = NULL;
680 	u->path.mnt = NULL;
681 	state = sk->sk_state;
682 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
683 
684 	skpair = unix_peer(sk);
685 	unix_peer(sk) = NULL;
686 
687 	unix_state_unlock(sk);
688 
689 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
690 	u->oob_skb = NULL;
691 #endif
692 
693 	wake_up_interruptible_all(&u->peer_wait);
694 
695 	if (skpair != NULL) {
696 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
697 			unix_state_lock(skpair);
698 			/* No more writes */
699 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
700 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
701 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
702 			unix_state_unlock(skpair);
703 			skpair->sk_state_change(skpair);
704 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
705 		}
706 
707 		unix_dgram_peer_wake_disconnect(sk, skpair);
708 		sock_put(skpair); /* It may now die */
709 	}
710 
711 	/* Try to flush out this socket. Throw out buffers at least */
712 
713 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
714 		if (state == TCP_LISTEN)
715 			unix_release_sock(skb->sk, 1);
716 
717 		/* passed fds are erased in the kfree_skb hook */
718 		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
719 	}
720 
721 	if (path.dentry)
722 		path_put(&path);
723 
724 	sock_put(sk);
725 
726 	/* ---- Socket is dead now and most probably destroyed ---- */
727 
728 	/*
729 	 * Fixme: BSD difference: In BSD all sockets connected to us get
730 	 *	  ECONNRESET and we die on the spot. In Linux we behave
731 	 *	  like files and pipes do and wait for the last
732 	 *	  dereference.
733 	 *
734 	 * Can't we simply set sock->err?
735 	 *
736 	 *	  What the above comment does talk about? --ANK(980817)
737 	 */
738 
739 	if (READ_ONCE(unix_tot_inflight))
740 		unix_gc();		/* Garbage collect fds */
741 }
742 
743 struct unix_peercred {
744 	struct pid *peer_pid;
745 	const struct cred *peer_cred;
746 };
747 
748 static inline int prepare_peercred(struct unix_peercred *peercred)
749 {
750 	struct pid *pid;
751 	int err;
752 
753 	pid = task_tgid(current);
754 	err = pidfs_register_pid(pid);
755 	if (likely(!err)) {
756 		peercred->peer_pid = get_pid(pid);
757 		peercred->peer_cred = get_current_cred();
758 	}
759 	return err;
760 }
761 
762 static void drop_peercred(struct unix_peercred *peercred)
763 {
764 	const struct cred *cred = NULL;
765 	struct pid *pid = NULL;
766 
767 	might_sleep();
768 
769 	swap(peercred->peer_pid, pid);
770 	swap(peercred->peer_cred, cred);
771 
772 	pidfs_put_pid(pid);
773 	put_pid(pid);
774 	put_cred(cred);
775 }
776 
777 static inline void init_peercred(struct sock *sk,
778 				 const struct unix_peercred *peercred)
779 {
780 	sk->sk_peer_pid = peercred->peer_pid;
781 	sk->sk_peer_cred = peercred->peer_cred;
782 }
783 
784 static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
785 {
786 	const struct cred *old_cred;
787 	struct pid *old_pid;
788 
789 	spin_lock(&sk->sk_peer_lock);
790 	old_pid = sk->sk_peer_pid;
791 	old_cred = sk->sk_peer_cred;
792 	init_peercred(sk, peercred);
793 	spin_unlock(&sk->sk_peer_lock);
794 
795 	peercred->peer_pid = old_pid;
796 	peercred->peer_cred = old_cred;
797 }
798 
799 static void copy_peercred(struct sock *sk, struct sock *peersk)
800 {
801 	lockdep_assert_held(&unix_sk(peersk)->lock);
802 
803 	spin_lock(&sk->sk_peer_lock);
804 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
805 	pidfs_get_pid(sk->sk_peer_pid);
806 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
807 	spin_unlock(&sk->sk_peer_lock);
808 }
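
/* The pid/cred installed by the helpers above are what userspace reads back
 * via SO_PEERCRED (and SO_PEERPIDFD); an illustrative snippet, with error
 * handling omitted:
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len);
 *
 * after which peer.pid/peer.uid/peer.gid describe the other end as of
 * listen()/connect()/socketpair() time.
 */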
809 
810 static int unix_listen(struct socket *sock, int backlog)
811 {
812 	int err;
813 	struct sock *sk = sock->sk;
814 	struct unix_sock *u = unix_sk(sk);
815 	struct unix_peercred peercred = {};
816 
817 	err = -EOPNOTSUPP;
818 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
819 		goto out;	/* Only stream/seqpacket sockets accept */
820 	err = -EINVAL;
821 	if (!READ_ONCE(u->addr))
822 		goto out;	/* No listens on an unbound socket */
823 	err = prepare_peercred(&peercred);
824 	if (err)
825 		goto out;
826 	unix_state_lock(sk);
827 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
828 		goto out_unlock;
829 	if (backlog > sk->sk_max_ack_backlog)
830 		wake_up_interruptible_all(&u->peer_wait);
831 	sk->sk_max_ack_backlog	= backlog;
832 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
833 
834 	/* set credentials so connect can copy them */
835 	update_peercred(sk, &peercred);
836 	err = 0;
837 
838 out_unlock:
839 	unix_state_unlock(sk);
840 	drop_peercred(&peercred);
841 out:
842 	return err;
843 }
844 
845 static int unix_release(struct socket *);
846 static int unix_bind(struct socket *, struct sockaddr *, int);
847 static int unix_stream_connect(struct socket *, struct sockaddr *,
848 			       int addr_len, int flags);
849 static int unix_socketpair(struct socket *, struct socket *);
850 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
851 static int unix_getname(struct socket *, struct sockaddr *, int);
852 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
853 static __poll_t unix_dgram_poll(struct file *, struct socket *,
854 				    poll_table *);
855 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
856 #ifdef CONFIG_COMPAT
857 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
858 #endif
859 static int unix_shutdown(struct socket *, int);
860 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
861 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
862 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
863 				       struct pipe_inode_info *, size_t size,
864 				       unsigned int flags);
865 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
866 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
867 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
868 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
869 static int unix_dgram_connect(struct socket *, struct sockaddr *,
870 			      int, int);
871 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
872 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
873 				  int);
874 
875 #ifdef CONFIG_PROC_FS
876 static int unix_count_nr_fds(struct sock *sk)
877 {
878 	struct sk_buff *skb;
879 	struct unix_sock *u;
880 	int nr_fds = 0;
881 
882 	spin_lock(&sk->sk_receive_queue.lock);
883 	skb = skb_peek(&sk->sk_receive_queue);
884 	while (skb) {
885 		u = unix_sk(skb->sk);
886 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
887 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
888 	}
889 	spin_unlock(&sk->sk_receive_queue.lock);
890 
891 	return nr_fds;
892 }
893 
894 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
895 {
896 	struct sock *sk = sock->sk;
897 	unsigned char s_state;
898 	struct unix_sock *u;
899 	int nr_fds = 0;
900 
901 	if (sk) {
902 		s_state = READ_ONCE(sk->sk_state);
903 		u = unix_sk(sk);
904 
905 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
906 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
907 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
908 		 */
909 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
910 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
911 		else if (s_state == TCP_LISTEN)
912 			nr_fds = unix_count_nr_fds(sk);
913 
914 		seq_printf(m, "scm_fds: %u\n", nr_fds);
915 	}
916 }
917 #else
918 #define unix_show_fdinfo NULL
919 #endif
920 
921 static const struct proto_ops unix_stream_ops = {
922 	.family =	PF_UNIX,
923 	.owner =	THIS_MODULE,
924 	.release =	unix_release,
925 	.bind =		unix_bind,
926 	.connect =	unix_stream_connect,
927 	.socketpair =	unix_socketpair,
928 	.accept =	unix_accept,
929 	.getname =	unix_getname,
930 	.poll =		unix_poll,
931 	.ioctl =	unix_ioctl,
932 #ifdef CONFIG_COMPAT
933 	.compat_ioctl =	unix_compat_ioctl,
934 #endif
935 	.listen =	unix_listen,
936 	.shutdown =	unix_shutdown,
937 	.sendmsg =	unix_stream_sendmsg,
938 	.recvmsg =	unix_stream_recvmsg,
939 	.read_skb =	unix_stream_read_skb,
940 	.mmap =		sock_no_mmap,
941 	.splice_read =	unix_stream_splice_read,
942 	.set_peek_off =	sk_set_peek_off,
943 	.show_fdinfo =	unix_show_fdinfo,
944 };
945 
946 static const struct proto_ops unix_dgram_ops = {
947 	.family =	PF_UNIX,
948 	.owner =	THIS_MODULE,
949 	.release =	unix_release,
950 	.bind =		unix_bind,
951 	.connect =	unix_dgram_connect,
952 	.socketpair =	unix_socketpair,
953 	.accept =	sock_no_accept,
954 	.getname =	unix_getname,
955 	.poll =		unix_dgram_poll,
956 	.ioctl =	unix_ioctl,
957 #ifdef CONFIG_COMPAT
958 	.compat_ioctl =	unix_compat_ioctl,
959 #endif
960 	.listen =	sock_no_listen,
961 	.shutdown =	unix_shutdown,
962 	.sendmsg =	unix_dgram_sendmsg,
963 	.read_skb =	unix_read_skb,
964 	.recvmsg =	unix_dgram_recvmsg,
965 	.mmap =		sock_no_mmap,
966 	.set_peek_off =	sk_set_peek_off,
967 	.show_fdinfo =	unix_show_fdinfo,
968 };
969 
970 static const struct proto_ops unix_seqpacket_ops = {
971 	.family =	PF_UNIX,
972 	.owner =	THIS_MODULE,
973 	.release =	unix_release,
974 	.bind =		unix_bind,
975 	.connect =	unix_stream_connect,
976 	.socketpair =	unix_socketpair,
977 	.accept =	unix_accept,
978 	.getname =	unix_getname,
979 	.poll =		unix_dgram_poll,
980 	.ioctl =	unix_ioctl,
981 #ifdef CONFIG_COMPAT
982 	.compat_ioctl =	unix_compat_ioctl,
983 #endif
984 	.listen =	unix_listen,
985 	.shutdown =	unix_shutdown,
986 	.sendmsg =	unix_seqpacket_sendmsg,
987 	.recvmsg =	unix_seqpacket_recvmsg,
988 	.mmap =		sock_no_mmap,
989 	.set_peek_off =	sk_set_peek_off,
990 	.show_fdinfo =	unix_show_fdinfo,
991 };
992 
993 static void unix_close(struct sock *sk, long timeout)
994 {
995 	/* Nothing to do here, unix socket does not need a ->close().
996 	 * This is merely for sockmap.
997 	 */
998 }
999 
1000 static void unix_unhash(struct sock *sk)
1001 {
1002 	/* Nothing to do here, unix socket does not need a ->unhash().
1003 	 * This is merely for sockmap.
1004 	 */
1005 }
1006 
1007 static bool unix_bpf_bypass_getsockopt(int level, int optname)
1008 {
1009 	if (level == SOL_SOCKET) {
1010 		switch (optname) {
1011 		case SO_PEERPIDFD:
1012 			return true;
1013 		default:
1014 			return false;
1015 		}
1016 	}
1017 
1018 	return false;
1019 }
1020 
1021 struct proto unix_dgram_proto = {
1022 	.name			= "UNIX",
1023 	.owner			= THIS_MODULE,
1024 	.obj_size		= sizeof(struct unix_sock),
1025 	.close			= unix_close,
1026 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1027 #ifdef CONFIG_BPF_SYSCALL
1028 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
1029 #endif
1030 };
1031 
1032 struct proto unix_stream_proto = {
1033 	.name			= "UNIX-STREAM",
1034 	.owner			= THIS_MODULE,
1035 	.obj_size		= sizeof(struct unix_sock),
1036 	.close			= unix_close,
1037 	.unhash			= unix_unhash,
1038 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1039 #ifdef CONFIG_BPF_SYSCALL
1040 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1041 #endif
1042 };
1043 
1044 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1045 {
1046 	struct unix_sock *u;
1047 	struct sock *sk;
1048 	int err;
1049 
1050 	atomic_long_inc(&unix_nr_socks);
1051 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1052 		err = -ENFILE;
1053 		goto err;
1054 	}
1055 
1056 	if (type == SOCK_STREAM)
1057 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1058 	else /*dgram and  seqpacket */
1059 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1060 
1061 	if (!sk) {
1062 		err = -ENOMEM;
1063 		goto err;
1064 	}
1065 
1066 	sock_init_data(sock, sk);
1067 
1068 	sk->sk_hash		= unix_unbound_hash(sk);
1069 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1070 	sk->sk_write_space	= unix_write_space;
1071 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1072 	sk->sk_destruct		= unix_sock_destructor;
1073 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1074 
1075 	u = unix_sk(sk);
1076 	u->listener = NULL;
1077 	u->vertex = NULL;
1078 	u->path.dentry = NULL;
1079 	u->path.mnt = NULL;
1080 	spin_lock_init(&u->lock);
1081 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1082 	mutex_init(&u->iolock); /* single task reading lock */
1083 	mutex_init(&u->bindlock); /* single task binding lock */
1084 	init_waitqueue_head(&u->peer_wait);
1085 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1086 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1087 	unix_insert_unbound_socket(net, sk);
1088 
1089 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1090 
1091 	return sk;
1092 
1093 err:
1094 	atomic_long_dec(&unix_nr_socks);
1095 	return ERR_PTR(err);
1096 }
1097 
1098 static int unix_create(struct net *net, struct socket *sock, int protocol,
1099 		       int kern)
1100 {
1101 	struct sock *sk;
1102 
1103 	if (protocol && protocol != PF_UNIX)
1104 		return -EPROTONOSUPPORT;
1105 
1106 	sock->state = SS_UNCONNECTED;
1107 
1108 	switch (sock->type) {
1109 	case SOCK_STREAM:
1110 		sock->ops = &unix_stream_ops;
1111 		break;
1112 		/*
1113 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1114 		 *	nothing uses it.
1115 		 */
1116 	case SOCK_RAW:
1117 		sock->type = SOCK_DGRAM;
1118 		fallthrough;
1119 	case SOCK_DGRAM:
1120 		sock->ops = &unix_dgram_ops;
1121 		break;
1122 	case SOCK_SEQPACKET:
1123 		sock->ops = &unix_seqpacket_ops;
1124 		break;
1125 	default:
1126 		return -ESOCKTNOSUPPORT;
1127 	}
1128 
1129 	sk = unix_create1(net, sock, kern, sock->type);
1130 	if (IS_ERR(sk))
1131 		return PTR_ERR(sk);
1132 
1133 	return 0;
1134 }
1135 
1136 static int unix_release(struct socket *sock)
1137 {
1138 	struct sock *sk = sock->sk;
1139 
1140 	if (!sk)
1141 		return 0;
1142 
1143 	sk->sk_prot->close(sk, 0);
1144 	unix_release_sock(sk, 0);
1145 	sock->sk = NULL;
1146 
1147 	return 0;
1148 }
1149 
1150 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1151 				  int type, int flags)
1152 {
1153 	struct inode *inode;
1154 	struct path path;
1155 	struct sock *sk;
1156 	int err;
1157 
1158 	unix_mkname_bsd(sunaddr, addr_len);
1159 
1160 	if (flags & SOCK_COREDUMP) {
1161 		const struct cred *cred;
1162 		struct cred *kcred;
1163 		struct path root;
1164 
1165 		kcred = prepare_kernel_cred(&init_task);
1166 		if (!kcred) {
1167 			err = -ENOMEM;
1168 			goto fail;
1169 		}
1170 
1171 		task_lock(&init_task);
1172 		get_fs_root(init_task.fs, &root);
1173 		task_unlock(&init_task);
1174 
1175 		cred = override_creds(kcred);
1176 		err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
1177 				      LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
1178 				      LOOKUP_NO_MAGICLINKS, &path);
1179 		put_cred(revert_creds(cred));
1180 		path_put(&root);
1181 		if (err)
1182 			goto fail;
1183 	} else {
1184 		err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1185 		if (err)
1186 			goto fail;
1187 
1188 		err = path_permission(&path, MAY_WRITE);
1189 		if (err)
1190 			goto path_put;
1191 	}
1192 
1193 	err = -ECONNREFUSED;
1194 	inode = d_backing_inode(path.dentry);
1195 	if (!S_ISSOCK(inode->i_mode))
1196 		goto path_put;
1197 
1198 	sk = unix_find_socket_byinode(inode);
1199 	if (!sk)
1200 		goto path_put;
1201 
1202 	err = -EPROTOTYPE;
1203 	if (sk->sk_type == type)
1204 		touch_atime(&path);
1205 	else
1206 		goto sock_put;
1207 
1208 	path_put(&path);
1209 
1210 	return sk;
1211 
1212 sock_put:
1213 	sock_put(sk);
1214 path_put:
1215 	path_put(&path);
1216 fail:
1217 	return ERR_PTR(err);
1218 }
1219 
1220 static struct sock *unix_find_abstract(struct net *net,
1221 				       struct sockaddr_un *sunaddr,
1222 				       int addr_len, int type)
1223 {
1224 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1225 	struct dentry *dentry;
1226 	struct sock *sk;
1227 
1228 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1229 	if (!sk)
1230 		return ERR_PTR(-ECONNREFUSED);
1231 
1232 	dentry = unix_sk(sk)->path.dentry;
1233 	if (dentry)
1234 		touch_atime(&unix_sk(sk)->path);
1235 
1236 	return sk;
1237 }
1238 
1239 static struct sock *unix_find_other(struct net *net,
1240 				    struct sockaddr_un *sunaddr,
1241 				    int addr_len, int type, int flags)
1242 {
1243 	struct sock *sk;
1244 
1245 	if (sunaddr->sun_path[0])
1246 		sk = unix_find_bsd(sunaddr, addr_len, type, flags);
1247 	else
1248 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1249 
1250 	return sk;
1251 }
1252 
1253 static int unix_autobind(struct sock *sk)
1254 {
1255 	struct unix_sock *u = unix_sk(sk);
1256 	unsigned int new_hash, old_hash;
1257 	struct net *net = sock_net(sk);
1258 	struct unix_address *addr;
1259 	u32 lastnum, ordernum;
1260 	int err;
1261 
1262 	err = mutex_lock_interruptible(&u->bindlock);
1263 	if (err)
1264 		return err;
1265 
1266 	if (u->addr)
1267 		goto out;
1268 
1269 	err = -ENOMEM;
1270 	addr = kzalloc(sizeof(*addr) +
1271 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1272 	if (!addr)
1273 		goto out;
1274 
1275 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1276 	addr->name->sun_family = AF_UNIX;
1277 	refcount_set(&addr->refcnt, 1);
1278 
1279 	old_hash = sk->sk_hash;
1280 	ordernum = get_random_u32();
1281 	lastnum = ordernum & 0xFFFFF;
1282 retry:
1283 	ordernum = (ordernum + 1) & 0xFFFFF;
1284 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1285 
1286 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1287 	unix_table_double_lock(net, old_hash, new_hash);
1288 
1289 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1290 		unix_table_double_unlock(net, old_hash, new_hash);
1291 
1292 		/* __unix_find_socket_byname() may take a long time if many names
1293 		 * are already in use.
1294 		 */
1295 		cond_resched();
1296 
1297 		if (ordernum == lastnum) {
1298 			/* Give up if all names seem to be in use. */
1299 			err = -ENOSPC;
1300 			unix_release_addr(addr);
1301 			goto out;
1302 		}
1303 
1304 		goto retry;
1305 	}
1306 
1307 	__unix_set_addr_hash(net, sk, addr, new_hash);
1308 	unix_table_double_unlock(net, old_hash, new_hash);
1309 	err = 0;
1310 
1311 out:	mutex_unlock(&u->bindlock);
1312 	return err;
1313 }
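
/* Illustrative result of unix_autobind(): an abstract address whose name is
 * a single 0 byte followed by five lowercase hex digits, e.g. "\0a1b2c",
 * with addr->len == offsetof(struct sockaddr_un, sun_path) + 6.  Tools such
 * as ss(8) would typically display it as "@a1b2c".
 */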
1314 
1315 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1316 			 int addr_len)
1317 {
1318 	umode_t mode = S_IFSOCK |
1319 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1320 	struct unix_sock *u = unix_sk(sk);
1321 	unsigned int new_hash, old_hash;
1322 	struct net *net = sock_net(sk);
1323 	struct mnt_idmap *idmap;
1324 	struct unix_address *addr;
1325 	struct dentry *dentry;
1326 	struct path parent;
1327 	int err;
1328 
1329 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1330 	addr = unix_create_addr(sunaddr, addr_len);
1331 	if (!addr)
1332 		return -ENOMEM;
1333 
1334 	/*
1335 	 * Get the parent directory, calculate the hash for last
1336 	 * component.
1337 	 */
1338 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1339 	if (IS_ERR(dentry)) {
1340 		err = PTR_ERR(dentry);
1341 		goto out;
1342 	}
1343 
1344 	/*
1345 	 * All right, let's create it.
1346 	 */
1347 	idmap = mnt_idmap(parent.mnt);
1348 	err = security_path_mknod(&parent, dentry, mode, 0);
1349 	if (!err)
1350 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1351 	if (err)
1352 		goto out_path;
1353 	err = mutex_lock_interruptible(&u->bindlock);
1354 	if (err)
1355 		goto out_unlink;
1356 	if (u->addr)
1357 		goto out_unlock;
1358 
1359 	old_hash = sk->sk_hash;
1360 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1361 	unix_table_double_lock(net, old_hash, new_hash);
1362 	u->path.mnt = mntget(parent.mnt);
1363 	u->path.dentry = dget(dentry);
1364 	__unix_set_addr_hash(net, sk, addr, new_hash);
1365 	unix_table_double_unlock(net, old_hash, new_hash);
1366 	unix_insert_bsd_socket(sk);
1367 	mutex_unlock(&u->bindlock);
1368 	done_path_create(&parent, dentry);
1369 	return 0;
1370 
1371 out_unlock:
1372 	mutex_unlock(&u->bindlock);
1373 	err = -EINVAL;
1374 out_unlink:
1375 	/* failed after successful mknod?  unlink what we'd created... */
1376 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1377 out_path:
1378 	done_path_create(&parent, dentry);
1379 out:
1380 	unix_release_addr(addr);
1381 	return err == -EEXIST ? -EADDRINUSE : err;
1382 }
1383 
1384 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1385 			      int addr_len)
1386 {
1387 	struct unix_sock *u = unix_sk(sk);
1388 	unsigned int new_hash, old_hash;
1389 	struct net *net = sock_net(sk);
1390 	struct unix_address *addr;
1391 	int err;
1392 
1393 	addr = unix_create_addr(sunaddr, addr_len);
1394 	if (!addr)
1395 		return -ENOMEM;
1396 
1397 	err = mutex_lock_interruptible(&u->bindlock);
1398 	if (err)
1399 		goto out;
1400 
1401 	if (u->addr) {
1402 		err = -EINVAL;
1403 		goto out_mutex;
1404 	}
1405 
1406 	old_hash = sk->sk_hash;
1407 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1408 	unix_table_double_lock(net, old_hash, new_hash);
1409 
1410 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1411 		goto out_spin;
1412 
1413 	__unix_set_addr_hash(net, sk, addr, new_hash);
1414 	unix_table_double_unlock(net, old_hash, new_hash);
1415 	mutex_unlock(&u->bindlock);
1416 	return 0;
1417 
1418 out_spin:
1419 	unix_table_double_unlock(net, old_hash, new_hash);
1420 	err = -EADDRINUSE;
1421 out_mutex:
1422 	mutex_unlock(&u->bindlock);
1423 out:
1424 	unix_release_addr(addr);
1425 	return err;
1426 }
1427 
1428 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1429 {
1430 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1431 	struct sock *sk = sock->sk;
1432 	int err;
1433 
1434 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1435 	    sunaddr->sun_family == AF_UNIX)
1436 		return unix_autobind(sk);
1437 
1438 	err = unix_validate_addr(sunaddr, addr_len);
1439 	if (err)
1440 		return err;
1441 
1442 	if (sunaddr->sun_path[0])
1443 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1444 	else
1445 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1446 
1447 	return err;
1448 }
1449 
1450 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1451 {
1452 	if (unlikely(sk1 == sk2) || !sk2) {
1453 		unix_state_lock(sk1);
1454 		return;
1455 	}
1456 
1457 	if (sk1 > sk2)
1458 		swap(sk1, sk2);
1459 
1460 	unix_state_lock(sk1);
1461 	unix_state_lock(sk2);
1462 }
1463 
1464 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1465 {
1466 	if (unlikely(sk1 == sk2) || !sk2) {
1467 		unix_state_unlock(sk1);
1468 		return;
1469 	}
1470 	unix_state_unlock(sk1);
1471 	unix_state_unlock(sk2);
1472 }
1473 
1474 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1475 			      int alen, int flags)
1476 {
1477 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1478 	struct sock *sk = sock->sk;
1479 	struct sock *other;
1480 	int err;
1481 
1482 	err = -EINVAL;
1483 	if (alen < offsetofend(struct sockaddr, sa_family))
1484 		goto out;
1485 
1486 	if (addr->sa_family != AF_UNSPEC) {
1487 		err = unix_validate_addr(sunaddr, alen);
1488 		if (err)
1489 			goto out;
1490 
1491 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1492 		if (err)
1493 			goto out;
1494 
1495 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1496 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1497 		    !READ_ONCE(unix_sk(sk)->addr)) {
1498 			err = unix_autobind(sk);
1499 			if (err)
1500 				goto out;
1501 		}
1502 
1503 restart:
1504 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0);
1505 		if (IS_ERR(other)) {
1506 			err = PTR_ERR(other);
1507 			goto out;
1508 		}
1509 
1510 		unix_state_double_lock(sk, other);
1511 
1512 		/* Apparently VFS overslept socket death. Retry. */
1513 		if (sock_flag(other, SOCK_DEAD)) {
1514 			unix_state_double_unlock(sk, other);
1515 			sock_put(other);
1516 			goto restart;
1517 		}
1518 
1519 		err = -EPERM;
1520 		if (!unix_may_send(sk, other))
1521 			goto out_unlock;
1522 
1523 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1524 		if (err)
1525 			goto out_unlock;
1526 
1527 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1528 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1529 	} else {
1530 		/*
1531 		 *	1003.1g breaking connected state with AF_UNSPEC
1532 		 */
1533 		other = NULL;
1534 		unix_state_double_lock(sk, other);
1535 	}
1536 
1537 	/*
1538 	 * If it was connected, reconnect.
1539 	 */
1540 	if (unix_peer(sk)) {
1541 		struct sock *old_peer = unix_peer(sk);
1542 
1543 		unix_peer(sk) = other;
1544 		if (!other)
1545 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1546 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1547 
1548 		unix_state_double_unlock(sk, other);
1549 
1550 		if (other != old_peer) {
1551 			unix_dgram_disconnected(sk, old_peer);
1552 
1553 			unix_state_lock(old_peer);
1554 			if (!unix_peer(old_peer))
1555 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1556 			unix_state_unlock(old_peer);
1557 		}
1558 
1559 		sock_put(old_peer);
1560 	} else {
1561 		unix_peer(sk) = other;
1562 		unix_state_double_unlock(sk, other);
1563 	}
1564 
1565 	return 0;
1566 
1567 out_unlock:
1568 	unix_state_double_unlock(sk, other);
1569 	sock_put(other);
1570 out:
1571 	return err;
1572 }
1573 
1574 static long unix_wait_for_peer(struct sock *other, long timeo)
1575 {
1576 	struct unix_sock *u = unix_sk(other);
1577 	int sched;
1578 	DEFINE_WAIT(wait);
1579 
1580 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1581 
1582 	sched = !sock_flag(other, SOCK_DEAD) &&
1583 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1584 		unix_recvq_full_lockless(other);
1585 
1586 	unix_state_unlock(other);
1587 
1588 	if (sched)
1589 		timeo = schedule_timeout(timeo);
1590 
1591 	finish_wait(&u->peer_wait, &wait);
1592 	return timeo;
1593 }
1594 
1595 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1596 			       int addr_len, int flags)
1597 {
1598 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1599 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1600 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1601 	struct unix_peercred peercred = {};
1602 	struct net *net = sock_net(sk);
1603 	struct sk_buff *skb = NULL;
1604 	unsigned char state;
1605 	long timeo;
1606 	int err;
1607 
1608 	err = unix_validate_addr(sunaddr, addr_len);
1609 	if (err)
1610 		goto out;
1611 
1612 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1613 	if (err)
1614 		goto out;
1615 
1616 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1617 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1618 	    !READ_ONCE(u->addr)) {
1619 		err = unix_autobind(sk);
1620 		if (err)
1621 			goto out;
1622 	}
1623 
1624 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1625 
1626 	/* First of all allocate resources.
1627 	 * If we will make it after state is locked,
1628 	 * we will have to recheck all again in any case.
1629 	 */
1630 
1631 	/* create new sock for complete connection */
1632 	newsk = unix_create1(net, NULL, 0, sock->type);
1633 	if (IS_ERR(newsk)) {
1634 		err = PTR_ERR(newsk);
1635 		goto out;
1636 	}
1637 
1638 	err = prepare_peercred(&peercred);
1639 	if (err)
1640 		goto out;
1641 
1642 	/* Allocate skb for sending to listening sock */
1643 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1644 	if (!skb) {
1645 		err = -ENOMEM;
1646 		goto out_free_sk;
1647 	}
1648 
1649 restart:
1650 	/*  Find listening sock. */
1651 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags);
1652 	if (IS_ERR(other)) {
1653 		err = PTR_ERR(other);
1654 		goto out_free_skb;
1655 	}
1656 
1657 	unix_state_lock(other);
1658 
1659 	/* Apparently VFS overslept socket death. Retry. */
1660 	if (sock_flag(other, SOCK_DEAD)) {
1661 		unix_state_unlock(other);
1662 		sock_put(other);
1663 		goto restart;
1664 	}
1665 
1666 	if (other->sk_state != TCP_LISTEN ||
1667 	    other->sk_shutdown & RCV_SHUTDOWN) {
1668 		err = -ECONNREFUSED;
1669 		goto out_unlock;
1670 	}
1671 
1672 	if (unix_recvq_full_lockless(other)) {
1673 		if (!timeo) {
1674 			err = -EAGAIN;
1675 			goto out_unlock;
1676 		}
1677 
1678 		timeo = unix_wait_for_peer(other, timeo);
1679 		sock_put(other);
1680 
1681 		err = sock_intr_errno(timeo);
1682 		if (signal_pending(current))
1683 			goto out_free_skb;
1684 
1685 		goto restart;
1686 	}
1687 
1688 	/* self connect and simultaneous connect are eliminated
1689 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1690 	 */
1691 	state = READ_ONCE(sk->sk_state);
1692 	if (unlikely(state != TCP_CLOSE)) {
1693 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1694 		goto out_unlock;
1695 	}
1696 
1697 	unix_state_lock(sk);
1698 
1699 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1700 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1701 		unix_state_unlock(sk);
1702 		goto out_unlock;
1703 	}
1704 
1705 	err = security_unix_stream_connect(sk, other, newsk);
1706 	if (err) {
1707 		unix_state_unlock(sk);
1708 		goto out_unlock;
1709 	}
1710 
1711 	/* The way is open! Quickly set all the necessary fields... */
1712 
1713 	sock_hold(sk);
1714 	unix_peer(newsk)	= sk;
1715 	newsk->sk_state		= TCP_ESTABLISHED;
1716 	newsk->sk_type		= sk->sk_type;
1717 	init_peercred(newsk, &peercred);
1718 	newu = unix_sk(newsk);
1719 	newu->listener = other;
1720 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1721 	otheru = unix_sk(other);
1722 
1723 	/* copy address information from listening to new sock
1724 	 *
1725 	 * The contents of *(otheru->addr) and otheru->path
1726 	 * are seen fully set up here, since we have found
1727 	 * otheru in hash under its lock.  Insertion into the
1728 	 * hash chain we'd found it in had been done in an
1729 	 * earlier critical area protected by the chain's lock,
1730 	 * the same one where we'd set *(otheru->addr) contents,
1731 	 * as well as otheru->path and otheru->addr itself.
1732 	 *
1733 	 * Using smp_store_release() here to set newu->addr
1734 	 * is enough to make those stores, as well as stores
1735 	 * to newu->path visible to anyone who gets newu->addr
1736 	 * by smp_load_acquire().  IOW, the same guarantees
1737 	 * as for unix_sock instances bound in unix_bind() or
1738 	 * in unix_autobind().
1739 	 */
1740 	if (otheru->path.dentry) {
1741 		path_get(&otheru->path);
1742 		newu->path = otheru->path;
1743 	}
1744 	refcount_inc(&otheru->addr->refcnt);
1745 	smp_store_release(&newu->addr, otheru->addr);
1746 
1747 	/* Set credentials */
1748 	copy_peercred(sk, other);
1749 
1750 	sock->state	= SS_CONNECTED;
1751 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1752 	sock_hold(newsk);
1753 
1754 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1755 	unix_peer(sk)	= newsk;
1756 
1757 	unix_state_unlock(sk);
1758 
1759 	/* take ten and send info to listening sock */
1760 	spin_lock(&other->sk_receive_queue.lock);
1761 	__skb_queue_tail(&other->sk_receive_queue, skb);
1762 	spin_unlock(&other->sk_receive_queue.lock);
1763 	unix_state_unlock(other);
1764 	other->sk_data_ready(other);
1765 	sock_put(other);
1766 	return 0;
1767 
1768 out_unlock:
1769 	unix_state_unlock(other);
1770 	sock_put(other);
1771 out_free_skb:
1772 	consume_skb(skb);
1773 out_free_sk:
1774 	unix_release_sock(newsk, 0);
1775 out:
1776 	drop_peercred(&peercred);
1777 	return err;
1778 }
1779 
1780 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1781 {
1782 	struct unix_peercred ska_peercred = {}, skb_peercred = {};
1783 	struct sock *ska = socka->sk, *skb = sockb->sk;
1784 	int err;
1785 
1786 	err = prepare_peercred(&ska_peercred);
1787 	if (err)
1788 		return err;
1789 
1790 	err = prepare_peercred(&skb_peercred);
1791 	if (err) {
1792 		drop_peercred(&ska_peercred);
1793 		return err;
1794 	}
1795 
1796 	/* Join our sockets back to back */
1797 	sock_hold(ska);
1798 	sock_hold(skb);
1799 	unix_peer(ska) = skb;
1800 	unix_peer(skb) = ska;
1801 	init_peercred(ska, &ska_peercred);
1802 	init_peercred(skb, &skb_peercred);
1803 
1804 	ska->sk_state = TCP_ESTABLISHED;
1805 	skb->sk_state = TCP_ESTABLISHED;
1806 	socka->state  = SS_CONNECTED;
1807 	sockb->state  = SS_CONNECTED;
1808 	return 0;
1809 }
1810 
1811 static void unix_sock_inherit_flags(const struct socket *old,
1812 				    struct socket *new)
1813 {
1814 	if (test_bit(SOCK_PASSCRED, &old->flags))
1815 		set_bit(SOCK_PASSCRED, &new->flags);
1816 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1817 		set_bit(SOCK_PASSPIDFD, &new->flags);
1818 	if (test_bit(SOCK_PASSSEC, &old->flags))
1819 		set_bit(SOCK_PASSSEC, &new->flags);
1820 }
1821 
1822 static int unix_accept(struct socket *sock, struct socket *newsock,
1823 		       struct proto_accept_arg *arg)
1824 {
1825 	struct sock *sk = sock->sk;
1826 	struct sk_buff *skb;
1827 	struct sock *tsk;
1828 
1829 	arg->err = -EOPNOTSUPP;
1830 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1831 		goto out;
1832 
1833 	arg->err = -EINVAL;
1834 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1835 		goto out;
1836 
1837 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1838 	 * so that no locks are necessary.
1839 	 */
1840 
1841 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1842 				&arg->err);
1843 	if (!skb) {
1844 		/* This means receive shutdown. */
1845 		if (arg->err == 0)
1846 			arg->err = -EINVAL;
1847 		goto out;
1848 	}
1849 
1850 	tsk = skb->sk;
1851 	skb_free_datagram(sk, skb);
1852 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1853 
1854 	/* attach accepted sock to socket */
1855 	unix_state_lock(tsk);
1856 	unix_update_edges(unix_sk(tsk));
1857 	newsock->state = SS_CONNECTED;
1858 	unix_sock_inherit_flags(sock, newsock);
1859 	sock_graft(tsk, newsock);
1860 	unix_state_unlock(tsk);
1861 	return 0;
1862 
1863 out:
1864 	return arg->err;
1865 }
1866 
1867 
1868 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1869 {
1870 	struct sock *sk = sock->sk;
1871 	struct unix_address *addr;
1872 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1873 	int err = 0;
1874 
1875 	if (peer) {
1876 		sk = unix_peer_get(sk);
1877 
1878 		err = -ENOTCONN;
1879 		if (!sk)
1880 			goto out;
1881 		err = 0;
1882 	} else {
1883 		sock_hold(sk);
1884 	}
1885 
1886 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1887 	if (!addr) {
1888 		sunaddr->sun_family = AF_UNIX;
1889 		sunaddr->sun_path[0] = 0;
1890 		err = offsetof(struct sockaddr_un, sun_path);
1891 	} else {
1892 		err = addr->len;
1893 		memcpy(sunaddr, addr->name, addr->len);
1894 
1895 		if (peer)
1896 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1897 					       CGROUP_UNIX_GETPEERNAME);
1898 		else
1899 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1900 					       CGROUP_UNIX_GETSOCKNAME);
1901 	}
1902 	sock_put(sk);
1903 out:
1904 	return err;
1905 }
1906 
1907 /* The "user->unix_inflight" variable is protected by the garbage
1908  * collection lock, and we just read it locklessly here. If you go
1909  * over the limit, there might be a tiny race in actually noticing
1910  * it across threads. Tough.
1911  */
1912 static inline bool too_many_unix_fds(struct task_struct *p)
1913 {
1914 	struct user_struct *user = current_user();
1915 
1916 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1917 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1918 	return false;
1919 }
1920 
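/* Move the file references attached to the message from the scm cookie into
 * the skb's control block so they travel with the queued data, subject to the
 * per-user in-flight fd limit.
 */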
1921 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1922 {
1923 	if (too_many_unix_fds(current))
1924 		return -ETOOMANYREFS;
1925 
1926 	UNIXCB(skb).fp = scm->fp;
1927 	scm->fp = NULL;
1928 
1929 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1930 		return -ENOMEM;
1931 
1932 	return 0;
1933 }
1934 
1935 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1936 {
1937 	scm->fp = UNIXCB(skb).fp;
1938 	UNIXCB(skb).fp = NULL;
1939 
1940 	unix_destroy_fpl(scm->fp);
1941 }
1942 
1943 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1944 {
1945 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1946 }
1947 
1948 static void unix_destruct_scm(struct sk_buff *skb)
1949 {
1950 	struct scm_cookie scm;
1951 
1952 	memset(&scm, 0, sizeof(scm));
1953 	scm.pid  = UNIXCB(skb).pid;
1954 	if (UNIXCB(skb).fp)
1955 		unix_detach_fds(&scm, skb);
1956 
1957 	/* Alas, it calls into the VFS */
1958 	/* So fscking what? fput() has been SMP-safe since the last summer */
1959 	scm_destroy(&scm);
1960 	sock_wfree(skb);
1961 }
1962 
1963 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1964 {
1965 	int err = 0;
1966 
1967 	UNIXCB(skb).pid  = get_pid(scm->pid);
1968 	UNIXCB(skb).uid = scm->creds.uid;
1969 	UNIXCB(skb).gid = scm->creds.gid;
1970 	UNIXCB(skb).fp = NULL;
1971 	unix_get_secdata(scm, skb);
1972 	if (scm->fp && send_fds)
1973 		err = unix_attach_fds(scm, skb);
1974 
1975 	skb->destructor = unix_destruct_scm;
1976 	return err;
1977 }
1978 
1979 static bool unix_passcred_enabled(const struct socket *sock,
1980 				  const struct sock *other)
1981 {
1982 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1983 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1984 	       !other->sk_socket ||
1985 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1986 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1987 }
1988 
1989 /*
1990  * Some apps rely on write() giving SCM_CREDENTIALS.
1991  * We include credentials if the source or destination socket
1992  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1993  */
1994 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1995 			    const struct sock *other)
1996 {
1997 	if (UNIXCB(skb).pid)
1998 		return;
1999 	if (unix_passcred_enabled(sock, other)) {
2000 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
2001 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
2002 	}
2003 }
2004 
2005 static bool unix_skb_scm_eq(struct sk_buff *skb,
2006 			    struct scm_cookie *scm)
2007 {
2008 	return UNIXCB(skb).pid == scm->pid &&
2009 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
2010 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
2011 	       unix_secdata_eq(scm, skb);
2012 }
2013 
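/* scm_stat_add()/scm_stat_del() account the file descriptors attached to @skb
 * against the receiving socket and keep the unix garbage collector's edges
 * for in-flight fds up to date.
 */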
2014 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
2015 {
2016 	struct scm_fp_list *fp = UNIXCB(skb).fp;
2017 	struct unix_sock *u = unix_sk(sk);
2018 
2019 	if (unlikely(fp && fp->count)) {
2020 		atomic_add(fp->count, &u->scm_stat.nr_fds);
2021 		unix_add_edges(fp, u);
2022 	}
2023 }
2024 
2025 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
2026 {
2027 	struct scm_fp_list *fp = UNIXCB(skb).fp;
2028 	struct unix_sock *u = unix_sk(sk);
2029 
2030 	if (unlikely(fp && fp->count)) {
2031 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
2032 		unix_del_edges(fp);
2033 	}
2034 }
2035 
2036 /*
2037  *	Send AF_UNIX data.
2038  */
2039 
2040 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
2041 			      size_t len)
2042 {
2043 	struct sock *sk = sock->sk, *other = NULL;
2044 	struct unix_sock *u = unix_sk(sk);
2045 	struct scm_cookie scm;
2046 	struct sk_buff *skb;
2047 	int data_len = 0;
2048 	int sk_locked;
2049 	long timeo;
2050 	int err;
2051 
2052 	err = scm_send(sock, msg, &scm, false);
2053 	if (err < 0)
2054 		return err;
2055 
2056 	wait_for_unix_gc(scm.fp);
2057 
2058 	if (msg->msg_flags & MSG_OOB) {
2059 		err = -EOPNOTSUPP;
2060 		goto out;
2061 	}
2062 
2063 	if (msg->msg_namelen) {
2064 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
2065 		if (err)
2066 			goto out;
2067 
2068 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
2069 							    msg->msg_name,
2070 							    &msg->msg_namelen,
2071 							    NULL);
2072 		if (err)
2073 			goto out;
2074 	}
2075 
2076 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2077 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
2078 	    !READ_ONCE(u->addr)) {
2079 		err = unix_autobind(sk);
2080 		if (err)
2081 			goto out;
2082 	}
2083 
2084 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
2085 		err = -EMSGSIZE;
2086 		goto out;
2087 	}
2088 
2089 	if (len > SKB_MAX_ALLOC) {
2090 		data_len = min_t(size_t,
2091 				 len - SKB_MAX_ALLOC,
2092 				 MAX_SKB_FRAGS * PAGE_SIZE);
2093 		data_len = PAGE_ALIGN(data_len);
2094 
2095 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2096 	}
2097 
2098 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2099 				   msg->msg_flags & MSG_DONTWAIT, &err,
2100 				   PAGE_ALLOC_COSTLY_ORDER);
2101 	if (!skb)
2102 		goto out;
2103 
2104 	err = unix_scm_to_skb(&scm, skb, true);
2105 	if (err < 0)
2106 		goto out_free;
2107 
2108 	skb_put(skb, len - data_len);
2109 	skb->data_len = data_len;
2110 	skb->len = len;
2111 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2112 	if (err)
2113 		goto out_free;
2114 
2115 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2116 
2117 	if (msg->msg_namelen) {
2118 lookup:
2119 		other = unix_find_other(sock_net(sk), msg->msg_name,
2120 					msg->msg_namelen, sk->sk_type, 0);
2121 		if (IS_ERR(other)) {
2122 			err = PTR_ERR(other);
2123 			goto out_free;
2124 		}
2125 	} else {
2126 		other = unix_peer_get(sk);
2127 		if (!other) {
2128 			err = -ENOTCONN;
2129 			goto out_free;
2130 		}
2131 	}
2132 
2133 	if (sk_filter(other, skb) < 0) {
2134 		/* Toss the packet but do not return any error to the sender */
2135 		err = len;
2136 		goto out_sock_put;
2137 	}
2138 
2139 restart:
2140 	sk_locked = 0;
2141 	unix_state_lock(other);
2142 restart_locked:
2143 
2144 	if (!unix_may_send(sk, other)) {
2145 		err = -EPERM;
2146 		goto out_unlock;
2147 	}
2148 
2149 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2150 		/* Check with 1003.1g - what should the datagram error be? */
2151 
2152 		unix_state_unlock(other);
2153 
2154 		if (sk->sk_type == SOCK_SEQPACKET) {
2155 			/* We are here only when racing with unix_release_sock(),
2156 			 * which is clearing @other. Never change the state to
2157 			 * TCP_CLOSE, unlike what SOCK_DGRAM wants.
2158 			 */
2159 			err = -EPIPE;
2160 			goto out_sock_put;
2161 		}
2162 
2163 		if (!sk_locked)
2164 			unix_state_lock(sk);
2165 
2166 		if (unix_peer(sk) == other) {
2167 			unix_peer(sk) = NULL;
2168 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2169 
2170 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2171 			unix_state_unlock(sk);
2172 
2173 			unix_dgram_disconnected(sk, other);
2174 			sock_put(other);
2175 			err = -ECONNREFUSED;
2176 			goto out_sock_put;
2177 		}
2178 
2179 		unix_state_unlock(sk);
2180 
2181 		if (!msg->msg_namelen) {
2182 			err = -ECONNRESET;
2183 			goto out_sock_put;
2184 		}
2185 
2186 		sock_put(other);
2187 		goto lookup;
2188 	}
2189 
2190 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2191 		err = -EPIPE;
2192 		goto out_unlock;
2193 	}
2194 
2195 	if (sk->sk_type != SOCK_SEQPACKET) {
2196 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2197 		if (err)
2198 			goto out_unlock;
2199 	}
2200 
2201 	/* other == sk && unix_peer(other) != sk if
2202 	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2203 	 * - unix_peer(sk) == sk at unix_peer_get() time but disconnected before the lock
2204 	 */
2205 	if (other != sk &&
2206 	    unlikely(unix_peer(other) != sk &&
2207 	    unix_recvq_full_lockless(other))) {
2208 		if (timeo) {
2209 			timeo = unix_wait_for_peer(other, timeo);
2210 
2211 			err = sock_intr_errno(timeo);
2212 			if (signal_pending(current))
2213 				goto out_sock_put;
2214 
2215 			goto restart;
2216 		}
2217 
2218 		if (!sk_locked) {
2219 			unix_state_unlock(other);
2220 			unix_state_double_lock(sk, other);
2221 		}
2222 
2223 		if (unix_peer(sk) != other ||
2224 		    unix_dgram_peer_wake_me(sk, other)) {
2225 			err = -EAGAIN;
2226 			sk_locked = 1;
2227 			goto out_unlock;
2228 		}
2229 
2230 		if (!sk_locked) {
2231 			sk_locked = 1;
2232 			goto restart_locked;
2233 		}
2234 	}
2235 
2236 	if (unlikely(sk_locked))
2237 		unix_state_unlock(sk);
2238 
2239 	if (sock_flag(other, SOCK_RCVTSTAMP))
2240 		__net_timestamp(skb);
2241 	maybe_add_creds(skb, sock, other);
2242 	scm_stat_add(other, skb);
2243 	skb_queue_tail(&other->sk_receive_queue, skb);
2244 	unix_state_unlock(other);
2245 	other->sk_data_ready(other);
2246 	sock_put(other);
2247 	scm_destroy(&scm);
2248 	return len;
2249 
2250 out_unlock:
2251 	if (sk_locked)
2252 		unix_state_unlock(sk);
2253 	unix_state_unlock(other);
2254 out_sock_put:
2255 	sock_put(other);
2256 out_free:
2257 	consume_skb(skb);
2258 out:
2259 	scm_destroy(&scm);
2260 	return err;
2261 }
2262 
2263 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2264  * bytes with a minimum of a full page.
2265  */
2266 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2267 
2268 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
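/* Queue the single out-of-band byte of an MSG_OOB send: the skb is linked on
 * the receiver's queue as usual, but is also remembered in oob_skb and
 * announced with SIGURG.
 */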
2269 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2270 		     struct scm_cookie *scm, bool fds_sent)
2271 {
2272 	struct unix_sock *ousk = unix_sk(other);
2273 	struct sk_buff *skb;
2274 	int err;
2275 
2276 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2277 
2278 	if (!skb)
2279 		return err;
2280 
2281 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2282 	if (err < 0)
2283 		goto out;
2284 
2285 	skb_put(skb, 1);
2286 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2287 
2288 	if (err)
2289 		goto out;
2290 
2291 	unix_state_lock(other);
2292 
2293 	if (sock_flag(other, SOCK_DEAD) ||
2294 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2295 		unix_state_unlock(other);
2296 		err = -EPIPE;
2297 		goto out;
2298 	}
2299 
2300 	maybe_add_creds(skb, sock, other);
2301 	scm_stat_add(other, skb);
2302 
2303 	spin_lock(&other->sk_receive_queue.lock);
2304 	WRITE_ONCE(ousk->oob_skb, skb);
2305 	__skb_queue_tail(&other->sk_receive_queue, skb);
2306 	spin_unlock(&other->sk_receive_queue.lock);
2307 
2308 	sk_send_sigurg(other);
2309 	unix_state_unlock(other);
2310 	other->sk_data_ready(other);
2311 
2312 	return 0;
2313 out:
2314 	consume_skb(skb);
2315 	return err;
2316 }
2317 #endif
2318 
2319 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2320 			       size_t len)
2321 {
2322 	struct sock *sk = sock->sk;
2323 	struct sk_buff *skb = NULL;
2324 	struct sock *other = NULL;
2325 	struct scm_cookie scm;
2326 	bool fds_sent = false;
2327 	int err, sent = 0;
2328 
2329 	err = scm_send(sock, msg, &scm, false);
2330 	if (err < 0)
2331 		return err;
2332 
2333 	wait_for_unix_gc(scm.fp);
2334 
2335 	if (msg->msg_flags & MSG_OOB) {
2336 		err = -EOPNOTSUPP;
2337 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2338 		if (len)
2339 			len--;
2340 		else
2341 #endif
2342 			goto out_err;
2343 	}
2344 
2345 	if (msg->msg_namelen) {
2346 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2347 		goto out_err;
2348 	} else {
2349 		other = unix_peer(sk);
2350 		if (!other) {
2351 			err = -ENOTCONN;
2352 			goto out_err;
2353 		}
2354 	}
2355 
2356 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2357 		goto out_pipe;
2358 
2359 	while (sent < len) {
2360 		int size = len - sent;
2361 		int data_len;
2362 
2363 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2364 			skb = sock_alloc_send_pskb(sk, 0, 0,
2365 						   msg->msg_flags & MSG_DONTWAIT,
2366 						   &err, 0);
2367 		} else {
2368 			/* Keep two messages in the pipe so it schedules better */
2369 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2370 
2371 			/* allow fallback to order-0 allocations */
2372 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2373 
2374 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2375 
2376 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2377 
2378 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2379 						   msg->msg_flags & MSG_DONTWAIT, &err,
2380 						   get_order(UNIX_SKB_FRAGS_SZ));
2381 		}
2382 		if (!skb)
2383 			goto out_err;
2384 
2385 		/* Only send the fds in the first buffer */
2386 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2387 		if (err < 0)
2388 			goto out_free;
2389 
2390 		fds_sent = true;
2391 
2392 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2393 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2394 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2395 						   sk->sk_allocation);
2396 			if (err < 0)
2397 				goto out_free;
2398 
2399 			size = err;
2400 			refcount_add(size, &sk->sk_wmem_alloc);
2401 		} else {
2402 			skb_put(skb, size - data_len);
2403 			skb->data_len = data_len;
2404 			skb->len = size;
2405 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2406 			if (err)
2407 				goto out_free;
2408 		}
2409 
2410 		unix_state_lock(other);
2411 
2412 		if (sock_flag(other, SOCK_DEAD) ||
2413 		    (other->sk_shutdown & RCV_SHUTDOWN))
2414 			goto out_pipe_unlock;
2415 
2416 		maybe_add_creds(skb, sock, other);
2417 		scm_stat_add(other, skb);
2418 		skb_queue_tail(&other->sk_receive_queue, skb);
2419 		unix_state_unlock(other);
2420 		other->sk_data_ready(other);
2421 		sent += size;
2422 	}
2423 
2424 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2425 	if (msg->msg_flags & MSG_OOB) {
2426 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2427 		if (err)
2428 			goto out_err;
2429 		sent++;
2430 	}
2431 #endif
2432 
2433 	scm_destroy(&scm);
2434 
2435 	return sent;
2436 
2437 out_pipe_unlock:
2438 	unix_state_unlock(other);
2439 out_pipe:
2440 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2441 		send_sig(SIGPIPE, current, 0);
2442 	err = -EPIPE;
2443 out_free:
2444 	consume_skb(skb);
2445 out_err:
2446 	scm_destroy(&scm);
2447 	return sent ? : err;
2448 }
2449 
2450 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2451 				  size_t len)
2452 {
2453 	int err;
2454 	struct sock *sk = sock->sk;
2455 
2456 	err = sock_error(sk);
2457 	if (err)
2458 		return err;
2459 
2460 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2461 		return -ENOTCONN;
2462 
2463 	if (msg->msg_namelen)
2464 		msg->msg_namelen = 0;
2465 
2466 	return unix_dgram_sendmsg(sock, msg, len);
2467 }
2468 
2469 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2470 				  size_t size, int flags)
2471 {
2472 	struct sock *sk = sock->sk;
2473 
2474 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2475 		return -ENOTCONN;
2476 
2477 	return unix_dgram_recvmsg(sock, msg, size, flags);
2478 }
2479 
2480 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2481 {
2482 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2483 
2484 	if (addr) {
2485 		msg->msg_namelen = addr->len;
2486 		memcpy(msg->msg_name, addr->name, addr->len);
2487 	}
2488 }
2489 
2490 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2491 			 int flags)
2492 {
2493 	struct scm_cookie scm;
2494 	struct socket *sock = sk->sk_socket;
2495 	struct unix_sock *u = unix_sk(sk);
2496 	struct sk_buff *skb, *last;
2497 	long timeo;
2498 	int skip;
2499 	int err;
2500 
2501 	err = -EOPNOTSUPP;
2502 	if (flags&MSG_OOB)
2503 		goto out;
2504 
2505 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2506 
2507 	do {
2508 		mutex_lock(&u->iolock);
2509 
2510 		skip = sk_peek_offset(sk, flags);
2511 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2512 					      &skip, &err, &last);
2513 		if (skb) {
2514 			if (!(flags & MSG_PEEK))
2515 				scm_stat_del(sk, skb);
2516 			break;
2517 		}
2518 
2519 		mutex_unlock(&u->iolock);
2520 
2521 		if (err != -EAGAIN)
2522 			break;
2523 	} while (timeo &&
2524 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2525 					      &err, &timeo, last));
2526 
2527 	if (!skb) { /* implies iolock unlocked */
2528 		unix_state_lock(sk);
2529 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2530 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2531 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2532 			err = 0;
2533 		unix_state_unlock(sk);
2534 		goto out;
2535 	}
2536 
2537 	if (wq_has_sleeper(&u->peer_wait))
2538 		wake_up_interruptible_sync_poll(&u->peer_wait,
2539 						EPOLLOUT | EPOLLWRNORM |
2540 						EPOLLWRBAND);
2541 
2542 	if (msg->msg_name) {
2543 		unix_copy_addr(msg, skb->sk);
2544 
2545 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2546 						      msg->msg_name,
2547 						      &msg->msg_namelen);
2548 	}
2549 
2550 	if (size > skb->len - skip)
2551 		size = skb->len - skip;
2552 	else if (size < skb->len - skip)
2553 		msg->msg_flags |= MSG_TRUNC;
2554 
2555 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2556 	if (err)
2557 		goto out_free;
2558 
2559 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2560 		__sock_recv_timestamp(msg, sk, skb);
2561 
2562 	memset(&scm, 0, sizeof(scm));
2563 
2564 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2565 	unix_set_secdata(&scm, skb);
2566 
2567 	if (!(flags & MSG_PEEK)) {
2568 		if (UNIXCB(skb).fp)
2569 			unix_detach_fds(&scm, skb);
2570 
2571 		sk_peek_offset_bwd(sk, skb->len);
2572 	} else {
2573 		/* It is questionable: on PEEK we could:
2574 		   - not return fds - good, but too simple 8)
2575 		   - return fds, and not return them on read (old strategy,
2576 		     apparently wrong)
2577 		   - clone fds (I chose it for now, it is the most universal
2578 		     solution)
2579 
2580 		   POSIX 1003.1g does not actually define this clearly
2581 		   at all. POSIX 1003.1g doesn't define a lot of things
2582 		   clearly, however!
2583 
2584 		*/
2585 
2586 		sk_peek_offset_fwd(sk, size);
2587 
2588 		if (UNIXCB(skb).fp)
2589 			unix_peek_fds(&scm, skb);
2590 	}
2591 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2592 
2593 	scm_recv_unix(sock, msg, &scm, flags);
2594 
2595 out_free:
2596 	skb_free_datagram(sk, skb);
2597 	mutex_unlock(&u->iolock);
2598 out:
2599 	return err;
2600 }
2601 
2602 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2603 			      int flags)
2604 {
2605 	struct sock *sk = sock->sk;
2606 
2607 #ifdef CONFIG_BPF_SYSCALL
2608 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2609 
2610 	if (prot != &unix_dgram_proto)
2611 		return prot->recvmsg(sk, msg, size, flags, NULL);
2612 #endif
2613 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2614 }
2615 
2616 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2617 {
2618 	struct unix_sock *u = unix_sk(sk);
2619 	struct sk_buff *skb;
2620 	int err;
2621 
2622 	mutex_lock(&u->iolock);
2623 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2624 	mutex_unlock(&u->iolock);
2625 	if (!skb)
2626 		return err;
2627 
2628 	return recv_actor(sk, skb);
2629 }
2630 
2631 /*
2632  *	Sleep until more data has arrived. But check for races.
2633  */
2634 static long unix_stream_data_wait(struct sock *sk, long timeo,
2635 				  struct sk_buff *last, unsigned int last_len,
2636 				  bool freezable)
2637 {
2638 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2639 	struct sk_buff *tail;
2640 	DEFINE_WAIT(wait);
2641 
2642 	unix_state_lock(sk);
2643 
2644 	for (;;) {
2645 		prepare_to_wait(sk_sleep(sk), &wait, state);
2646 
2647 		tail = skb_peek_tail(&sk->sk_receive_queue);
2648 		if (tail != last ||
2649 		    (tail && tail->len != last_len) ||
2650 		    sk->sk_err ||
2651 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2652 		    signal_pending(current) ||
2653 		    !timeo)
2654 			break;
2655 
2656 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2657 		unix_state_unlock(sk);
2658 		timeo = schedule_timeout(timeo);
2659 		unix_state_lock(sk);
2660 
2661 		if (sock_flag(sk, SOCK_DEAD))
2662 			break;
2663 
2664 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2665 	}
2666 
2667 	finish_wait(sk_sleep(sk), &wait);
2668 	unix_state_unlock(sk);
2669 	return timeo;
2670 }
2671 
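/* Bytes of this skb that the stream receiver has not consumed yet. */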
2672 static unsigned int unix_skb_len(const struct sk_buff *skb)
2673 {
2674 	return skb->len - UNIXCB(skb).consumed;
2675 }
2676 
2677 struct unix_stream_read_state {
2678 	int (*recv_actor)(struct sk_buff *, int, int,
2679 			  struct unix_stream_read_state *);
2680 	struct socket *socket;
2681 	struct msghdr *msg;
2682 	struct pipe_inode_info *pipe;
2683 	size_t size;
2684 	int flags;
2685 	unsigned int splice_flags;
2686 };
2687 
2688 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2689 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2690 {
2691 	struct socket *sock = state->socket;
2692 	struct sock *sk = sock->sk;
2693 	struct unix_sock *u = unix_sk(sk);
2694 	int chunk = 1;
2695 	struct sk_buff *oob_skb;
2696 
2697 	mutex_lock(&u->iolock);
2698 	unix_state_lock(sk);
2699 	spin_lock(&sk->sk_receive_queue.lock);
2700 
2701 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2702 		spin_unlock(&sk->sk_receive_queue.lock);
2703 		unix_state_unlock(sk);
2704 		mutex_unlock(&u->iolock);
2705 		return -EINVAL;
2706 	}
2707 
2708 	oob_skb = u->oob_skb;
2709 
2710 	if (!(state->flags & MSG_PEEK))
2711 		WRITE_ONCE(u->oob_skb, NULL);
2712 
2713 	spin_unlock(&sk->sk_receive_queue.lock);
2714 	unix_state_unlock(sk);
2715 
2716 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2717 
2718 	if (!(state->flags & MSG_PEEK))
2719 		UNIXCB(oob_skb).consumed += 1;
2720 
2721 	mutex_unlock(&u->iolock);
2722 
2723 	if (chunk < 0)
2724 		return -EFAULT;
2725 
2726 	state->msg->msg_flags |= MSG_OOB;
2727 	return 1;
2728 }
2729 
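/* Pick the skb the stream receiver should look at next when an OOB byte may
 * be queued: fully consumed skbs are skipped or unlinked, and the OOB skb
 * itself is returned inline, stepped over or dropped depending on
 * SOCK_URGINLINE, MSG_PEEK and whether data has already been copied.
 */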
2730 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2731 				  int flags, int copied)
2732 {
2733 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2734 	struct unix_sock *u = unix_sk(sk);
2735 
2736 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2737 		return skb;
2738 
2739 	spin_lock(&sk->sk_receive_queue.lock);
2740 
2741 	if (!unix_skb_len(skb)) {
2742 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2743 			skb = NULL;
2744 		} else if (flags & MSG_PEEK) {
2745 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2746 		} else {
2747 			read_skb = skb;
2748 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2749 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2750 		}
2751 
2752 		if (!skb)
2753 			goto unlock;
2754 	}
2755 
2756 	if (skb != u->oob_skb)
2757 		goto unlock;
2758 
2759 	if (copied) {
2760 		skb = NULL;
2761 	} else if (!(flags & MSG_PEEK)) {
2762 		WRITE_ONCE(u->oob_skb, NULL);
2763 
2764 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2765 			__skb_unlink(skb, &sk->sk_receive_queue);
2766 			unread_skb = skb;
2767 			skb = skb_peek(&sk->sk_receive_queue);
2768 		}
2769 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2770 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2771 	}
2772 
2773 unlock:
2774 	spin_unlock(&sk->sk_receive_queue.lock);
2775 
2776 	consume_skb(read_skb);
2777 	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2778 
2779 	return skb;
2780 }
2781 #endif
2782 
2783 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2784 {
2785 	struct unix_sock *u = unix_sk(sk);
2786 	struct sk_buff *skb;
2787 	int err;
2788 
2789 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2790 		return -ENOTCONN;
2791 
2792 	mutex_lock(&u->iolock);
2793 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2794 	mutex_unlock(&u->iolock);
2795 	if (!skb)
2796 		return err;
2797 
2798 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2799 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2800 		bool drop = false;
2801 
2802 		unix_state_lock(sk);
2803 
2804 		if (sock_flag(sk, SOCK_DEAD)) {
2805 			unix_state_unlock(sk);
2806 			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2807 			return -ECONNRESET;
2808 		}
2809 
2810 		spin_lock(&sk->sk_receive_queue.lock);
2811 		if (likely(skb == u->oob_skb)) {
2812 			WRITE_ONCE(u->oob_skb, NULL);
2813 			drop = true;
2814 		}
2815 		spin_unlock(&sk->sk_receive_queue.lock);
2816 
2817 		unix_state_unlock(sk);
2818 
2819 		if (drop) {
2820 			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2821 			return -EAGAIN;
2822 		}
2823 	}
2824 #endif
2825 
2826 	return recv_actor(sk, skb);
2827 }
2828 
2829 static int unix_stream_read_generic(struct unix_stream_read_state *state,
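/* Common receive path for SOCK_STREAM data, shared by the recvmsg() and
 * splice_read() paths; copying data out of each skb is delegated to
 * state->recv_actor.
 */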
2830 				    bool freezable)
2831 {
2832 	struct scm_cookie scm;
2833 	struct socket *sock = state->socket;
2834 	struct sock *sk = sock->sk;
2835 	struct unix_sock *u = unix_sk(sk);
2836 	int copied = 0;
2837 	int flags = state->flags;
2838 	int noblock = flags & MSG_DONTWAIT;
2839 	bool check_creds = false;
2840 	int target;
2841 	int err = 0;
2842 	long timeo;
2843 	int skip;
2844 	size_t size = state->size;
2845 	unsigned int last_len;
2846 
2847 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2848 		err = -EINVAL;
2849 		goto out;
2850 	}
2851 
2852 	if (unlikely(flags & MSG_OOB)) {
2853 		err = -EOPNOTSUPP;
2854 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2855 		err = unix_stream_recv_urg(state);
2856 #endif
2857 		goto out;
2858 	}
2859 
2860 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2861 	timeo = sock_rcvtimeo(sk, noblock);
2862 
2863 	memset(&scm, 0, sizeof(scm));
2864 
2865 	/* Lock the socket to prevent the queue from being reordered
2866 	 * while we sleep copying data out to the message.
2867 	 */
2868 	mutex_lock(&u->iolock);
2869 
2870 	skip = max(sk_peek_offset(sk, flags), 0);
2871 
2872 	do {
2873 		struct sk_buff *skb, *last;
2874 		int chunk;
2875 
2876 redo:
2877 		unix_state_lock(sk);
2878 		if (sock_flag(sk, SOCK_DEAD)) {
2879 			err = -ECONNRESET;
2880 			goto unlock;
2881 		}
2882 		last = skb = skb_peek(&sk->sk_receive_queue);
2883 		last_len = last ? last->len : 0;
2884 
2885 again:
2886 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2887 		if (skb) {
2888 			skb = manage_oob(skb, sk, flags, copied);
2889 			if (!skb && copied) {
2890 				unix_state_unlock(sk);
2891 				break;
2892 			}
2893 		}
2894 #endif
2895 		if (skb == NULL) {
2896 			if (copied >= target)
2897 				goto unlock;
2898 
2899 			/*
2900 			 *	POSIX 1003.1g mandates this order.
2901 			 */
2902 
2903 			err = sock_error(sk);
2904 			if (err)
2905 				goto unlock;
2906 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2907 				goto unlock;
2908 
2909 			unix_state_unlock(sk);
2910 			if (!timeo) {
2911 				err = -EAGAIN;
2912 				break;
2913 			}
2914 
2915 			mutex_unlock(&u->iolock);
2916 
2917 			timeo = unix_stream_data_wait(sk, timeo, last,
2918 						      last_len, freezable);
2919 
2920 			if (signal_pending(current)) {
2921 				err = sock_intr_errno(timeo);
2922 				scm_destroy(&scm);
2923 				goto out;
2924 			}
2925 
2926 			mutex_lock(&u->iolock);
2927 			goto redo;
2928 unlock:
2929 			unix_state_unlock(sk);
2930 			break;
2931 		}
2932 
2933 		while (skip >= unix_skb_len(skb)) {
2934 			skip -= unix_skb_len(skb);
2935 			last = skb;
2936 			last_len = skb->len;
2937 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2938 			if (!skb)
2939 				goto again;
2940 		}
2941 
2942 		unix_state_unlock(sk);
2943 
2944 		if (check_creds) {
2945 			/* Never glue messages from different writers */
2946 			if (!unix_skb_scm_eq(skb, &scm))
2947 				break;
2948 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2949 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2950 			/* Copy credentials */
2951 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2952 			unix_set_secdata(&scm, skb);
2953 			check_creds = true;
2954 		}
2955 
2956 		/* Copy address just once */
2957 		if (state->msg && state->msg->msg_name) {
2958 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2959 					 state->msg->msg_name);
2960 			unix_copy_addr(state->msg, skb->sk);
2961 
2962 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2963 							      state->msg->msg_name,
2964 							      &state->msg->msg_namelen);
2965 
2966 			sunaddr = NULL;
2967 		}
2968 
2969 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2970 		chunk = state->recv_actor(skb, skip, chunk, state);
2971 		if (chunk < 0) {
2972 			if (copied == 0)
2973 				copied = -EFAULT;
2974 			break;
2975 		}
2976 		copied += chunk;
2977 		size -= chunk;
2978 
2979 		/* Mark read part of skb as used */
2980 		if (!(flags & MSG_PEEK)) {
2981 			UNIXCB(skb).consumed += chunk;
2982 
2983 			sk_peek_offset_bwd(sk, chunk);
2984 
2985 			if (UNIXCB(skb).fp) {
2986 				scm_stat_del(sk, skb);
2987 				unix_detach_fds(&scm, skb);
2988 			}
2989 
2990 			if (unix_skb_len(skb))
2991 				break;
2992 
2993 			skb_unlink(skb, &sk->sk_receive_queue);
2994 			consume_skb(skb);
2995 
2996 			if (scm.fp)
2997 				break;
2998 		} else {
2999 			/* It is questionable, see note in unix_dgram_recvmsg.
3000 			 */
3001 			if (UNIXCB(skb).fp)
3002 				unix_peek_fds(&scm, skb);
3003 
3004 			sk_peek_offset_fwd(sk, chunk);
3005 
3006 			if (UNIXCB(skb).fp)
3007 				break;
3008 
3009 			skip = 0;
3010 			last = skb;
3011 			last_len = skb->len;
3012 			unix_state_lock(sk);
3013 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
3014 			if (skb)
3015 				goto again;
3016 			unix_state_unlock(sk);
3017 			break;
3018 		}
3019 	} while (size);
3020 
3021 	mutex_unlock(&u->iolock);
3022 	if (state->msg)
3023 		scm_recv_unix(sock, state->msg, &scm, flags);
3024 	else
3025 		scm_destroy(&scm);
3026 out:
3027 	return copied ? : err;
3028 }
3029 
3030 static int unix_stream_read_actor(struct sk_buff *skb,
3031 				  int skip, int chunk,
3032 				  struct unix_stream_read_state *state)
3033 {
3034 	int ret;
3035 
3036 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
3037 				    state->msg, chunk);
3038 	return ret ?: chunk;
3039 }
3040 
3041 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
3042 			  size_t size, int flags)
3043 {
3044 	struct unix_stream_read_state state = {
3045 		.recv_actor = unix_stream_read_actor,
3046 		.socket = sk->sk_socket,
3047 		.msg = msg,
3048 		.size = size,
3049 		.flags = flags
3050 	};
3051 
3052 	return unix_stream_read_generic(&state, true);
3053 }
3054 
3055 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
3056 			       size_t size, int flags)
3057 {
3058 	struct unix_stream_read_state state = {
3059 		.recv_actor = unix_stream_read_actor,
3060 		.socket = sock,
3061 		.msg = msg,
3062 		.size = size,
3063 		.flags = flags
3064 	};
3065 
3066 #ifdef CONFIG_BPF_SYSCALL
3067 	struct sock *sk = sock->sk;
3068 	const struct proto *prot = READ_ONCE(sk->sk_prot);
3069 
3070 	if (prot != &unix_stream_proto)
3071 		return prot->recvmsg(sk, msg, size, flags, NULL);
3072 #endif
3073 	return unix_stream_read_generic(&state, true);
3074 }
3075 
3076 static int unix_stream_splice_actor(struct sk_buff *skb,
3077 				    int skip, int chunk,
3078 				    struct unix_stream_read_state *state)
3079 {
3080 	return skb_splice_bits(skb, state->socket->sk,
3081 			       UNIXCB(skb).consumed + skip,
3082 			       state->pipe, chunk, state->splice_flags);
3083 }
3084 
3085 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3086 				       struct pipe_inode_info *pipe,
3087 				       size_t size, unsigned int flags)
3088 {
3089 	struct unix_stream_read_state state = {
3090 		.recv_actor = unix_stream_splice_actor,
3091 		.socket = sock,
3092 		.pipe = pipe,
3093 		.size = size,
3094 		.splice_flags = flags,
3095 	};
3096 
3097 	if (unlikely(*ppos))
3098 		return -ESPIPE;
3099 
3100 	if (sock->file->f_flags & O_NONBLOCK ||
3101 	    flags & SPLICE_F_NONBLOCK)
3102 		state.flags = MSG_DONTWAIT;
3103 
3104 	return unix_stream_read_generic(&state, false);
3105 }
3106 
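/* shutdown(2): record the requested shutdown bits on this socket and, for
 * connection-oriented types, mirror them onto the peer and wake it up.
 */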
3107 static int unix_shutdown(struct socket *sock, int mode)
3108 {
3109 	struct sock *sk = sock->sk;
3110 	struct sock *other;
3111 
3112 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3113 		return -EINVAL;
3114 	/* This maps:
3115 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3116 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3117 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3118 	 */
3119 	++mode;
3120 
3121 	unix_state_lock(sk);
3122 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3123 	other = unix_peer(sk);
3124 	if (other)
3125 		sock_hold(other);
3126 	unix_state_unlock(sk);
3127 	sk->sk_state_change(sk);
3128 
3129 	if (other &&
3130 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3131 
3132 		int peer_mode = 0;
3133 		const struct proto *prot = READ_ONCE(other->sk_prot);
3134 
3135 		if (prot->unhash)
3136 			prot->unhash(other);
3137 		if (mode&RCV_SHUTDOWN)
3138 			peer_mode |= SEND_SHUTDOWN;
3139 		if (mode&SEND_SHUTDOWN)
3140 			peer_mode |= RCV_SHUTDOWN;
3141 		unix_state_lock(other);
3142 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3143 		unix_state_unlock(other);
3144 		other->sk_state_change(other);
3145 		if (peer_mode == SHUTDOWN_MASK)
3146 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3147 		else if (peer_mode & RCV_SHUTDOWN)
3148 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3149 	}
3150 	if (other)
3151 		sock_put(other);
3152 
3153 	return 0;
3154 }
3155 
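/* SIOCINQ: bytes of unread data for stream/seqpacket sockets, or the size of
 * the next pending datagram for other types.
 */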
3156 long unix_inq_len(struct sock *sk)
3157 {
3158 	struct sk_buff *skb;
3159 	long amount = 0;
3160 
3161 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3162 		return -EINVAL;
3163 
3164 	spin_lock(&sk->sk_receive_queue.lock);
3165 	if (sk->sk_type == SOCK_STREAM ||
3166 	    sk->sk_type == SOCK_SEQPACKET) {
3167 		skb_queue_walk(&sk->sk_receive_queue, skb)
3168 			amount += unix_skb_len(skb);
3169 	} else {
3170 		skb = skb_peek(&sk->sk_receive_queue);
3171 		if (skb)
3172 			amount = skb->len;
3173 	}
3174 	spin_unlock(&sk->sk_receive_queue.lock);
3175 
3176 	return amount;
3177 }
3178 EXPORT_SYMBOL_GPL(unix_inq_len);
3179 
3180 long unix_outq_len(struct sock *sk)
3181 {
3182 	return sk_wmem_alloc_get(sk);
3183 }
3184 EXPORT_SYMBOL_GPL(unix_outq_len);
3185 
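/* SIOCUNIXFILE: open the filesystem object this socket is bound to as an
 * O_PATH file and return the new descriptor (requires CAP_NET_ADMIN).
 */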
3186 static int unix_open_file(struct sock *sk)
3187 {
3188 	struct path path;
3189 	struct file *f;
3190 	int fd;
3191 
3192 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3193 		return -EPERM;
3194 
3195 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3196 		return -ENOENT;
3197 
3198 	path = unix_sk(sk)->path;
3199 	if (!path.dentry)
3200 		return -ENOENT;
3201 
3202 	path_get(&path);
3203 
3204 	fd = get_unused_fd_flags(O_CLOEXEC);
3205 	if (fd < 0)
3206 		goto out;
3207 
3208 	f = dentry_open(&path, O_PATH, current_cred());
3209 	if (IS_ERR(f)) {
3210 		put_unused_fd(fd);
3211 		fd = PTR_ERR(f);
3212 		goto out;
3213 	}
3214 
3215 	fd_install(fd, f);
3216 out:
3217 	path_put(&path);
3218 
3219 	return fd;
3220 }
3221 
3222 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3223 {
3224 	struct sock *sk = sock->sk;
3225 	long amount = 0;
3226 	int err;
3227 
3228 	switch (cmd) {
3229 	case SIOCOUTQ:
3230 		amount = unix_outq_len(sk);
3231 		err = put_user(amount, (int __user *)arg);
3232 		break;
3233 	case SIOCINQ:
3234 		amount = unix_inq_len(sk);
3235 		if (amount < 0)
3236 			err = amount;
3237 		else
3238 			err = put_user(amount, (int __user *)arg);
3239 		break;
3240 	case SIOCUNIXFILE:
3241 		err = unix_open_file(sk);
3242 		break;
3243 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3244 	case SIOCATMARK:
3245 		{
3246 			struct unix_sock *u = unix_sk(sk);
3247 			struct sk_buff *skb;
3248 			int answ = 0;
3249 
3250 			mutex_lock(&u->iolock);
3251 
3252 			skb = skb_peek(&sk->sk_receive_queue);
3253 			if (skb) {
3254 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3255 				struct sk_buff *next_skb;
3256 
3257 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3258 
3259 				if (skb == oob_skb ||
3260 				    (!unix_skb_len(skb) &&
3261 				     (!oob_skb || next_skb == oob_skb)))
3262 					answ = 1;
3263 			}
3264 
3265 			mutex_unlock(&u->iolock);
3266 
3267 			err = put_user(answ, (int __user *)arg);
3268 		}
3269 		break;
3270 #endif
3271 	default:
3272 		err = -ENOIOCTLCMD;
3273 		break;
3274 	}
3275 	return err;
3276 }
3277 
3278 #ifdef CONFIG_COMPAT
3279 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3280 {
3281 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3282 }
3283 #endif
3284 
3285 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3286 {
3287 	struct sock *sk = sock->sk;
3288 	unsigned char state;
3289 	__poll_t mask;
3290 	u8 shutdown;
3291 
3292 	sock_poll_wait(file, sock, wait);
3293 	mask = 0;
3294 	shutdown = READ_ONCE(sk->sk_shutdown);
3295 	state = READ_ONCE(sk->sk_state);
3296 
3297 	/* exceptional events? */
3298 	if (READ_ONCE(sk->sk_err))
3299 		mask |= EPOLLERR;
3300 	if (shutdown == SHUTDOWN_MASK)
3301 		mask |= EPOLLHUP;
3302 	if (shutdown & RCV_SHUTDOWN)
3303 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3304 
3305 	/* readable? */
3306 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3307 		mask |= EPOLLIN | EPOLLRDNORM;
3308 	if (sk_is_readable(sk))
3309 		mask |= EPOLLIN | EPOLLRDNORM;
3310 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3311 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3312 		mask |= EPOLLPRI;
3313 #endif
3314 
3315 	/* Connection-based need to check for termination and startup */
3316 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3317 	    state == TCP_CLOSE)
3318 		mask |= EPOLLHUP;
3319 
3320 	/*
3321 	 * We also report the socket as writable when the other side has
3322 	 * shut down the connection. This prevents stuck sockets.
3323 	 */
3324 	if (unix_writable(sk, state))
3325 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3326 
3327 	return mask;
3328 }
3329 
3330 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3331 				    poll_table *wait)
3332 {
3333 	struct sock *sk = sock->sk, *other;
3334 	unsigned int writable;
3335 	unsigned char state;
3336 	__poll_t mask;
3337 	u8 shutdown;
3338 
3339 	sock_poll_wait(file, sock, wait);
3340 	mask = 0;
3341 	shutdown = READ_ONCE(sk->sk_shutdown);
3342 	state = READ_ONCE(sk->sk_state);
3343 
3344 	/* exceptional events? */
3345 	if (READ_ONCE(sk->sk_err) ||
3346 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3347 		mask |= EPOLLERR |
3348 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3349 
3350 	if (shutdown & RCV_SHUTDOWN)
3351 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3352 	if (shutdown == SHUTDOWN_MASK)
3353 		mask |= EPOLLHUP;
3354 
3355 	/* readable? */
3356 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3357 		mask |= EPOLLIN | EPOLLRDNORM;
3358 	if (sk_is_readable(sk))
3359 		mask |= EPOLLIN | EPOLLRDNORM;
3360 
3361 	/* Connection-based need to check for termination and startup */
3362 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3363 		mask |= EPOLLHUP;
3364 
3365 	/* No write status requested, avoid expensive OUT tests. */
3366 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3367 		return mask;
3368 
3369 	writable = unix_writable(sk, state);
3370 	if (writable) {
3371 		unix_state_lock(sk);
3372 
3373 		other = unix_peer(sk);
3374 		if (other && unix_peer(other) != sk &&
3375 		    unix_recvq_full_lockless(other) &&
3376 		    unix_dgram_peer_wake_me(sk, other))
3377 			writable = 0;
3378 
3379 		unix_state_unlock(sk);
3380 	}
3381 
3382 	if (writable)
3383 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3384 	else
3385 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3386 
3387 	return mask;
3388 }
3389 
3390 #ifdef CONFIG_PROC_FS
3391 
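/* The /proc iterator encodes its position as (bucket, offset): the hash
 * bucket lives in the upper bits of *pos and the 1-based offset into that
 * bucket in the lower BUCKET_SPACE bits.
 */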
3392 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3393 
3394 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3395 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3396 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3397 
3398 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3399 {
3400 	unsigned long offset = get_offset(*pos);
3401 	unsigned long bucket = get_bucket(*pos);
3402 	unsigned long count = 0;
3403 	struct sock *sk;
3404 
3405 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3406 	     sk; sk = sk_next(sk)) {
3407 		if (++count == offset)
3408 			break;
3409 	}
3410 
3411 	return sk;
3412 }
3413 
3414 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3415 {
3416 	unsigned long bucket = get_bucket(*pos);
3417 	struct net *net = seq_file_net(seq);
3418 	struct sock *sk;
3419 
3420 	while (bucket < UNIX_HASH_SIZE) {
3421 		spin_lock(&net->unx.table.locks[bucket]);
3422 
3423 		sk = unix_from_bucket(seq, pos);
3424 		if (sk)
3425 			return sk;
3426 
3427 		spin_unlock(&net->unx.table.locks[bucket]);
3428 
3429 		*pos = set_bucket_offset(++bucket, 1);
3430 	}
3431 
3432 	return NULL;
3433 }
3434 
3435 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3436 				  loff_t *pos)
3437 {
3438 	unsigned long bucket = get_bucket(*pos);
3439 
3440 	sk = sk_next(sk);
3441 	if (sk)
3442 		return sk;
3443 
3445 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3446 
3447 	*pos = set_bucket_offset(++bucket, 1);
3448 
3449 	return unix_get_first(seq, pos);
3450 }
3451 
3452 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3453 {
3454 	if (!*pos)
3455 		return SEQ_START_TOKEN;
3456 
3457 	return unix_get_first(seq, pos);
3458 }
3459 
3460 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3461 {
3462 	++*pos;
3463 
3464 	if (v == SEQ_START_TOKEN)
3465 		return unix_get_first(seq, pos);
3466 
3467 	return unix_get_next(seq, v, pos);
3468 }
3469 
3470 static void unix_seq_stop(struct seq_file *seq, void *v)
3471 {
3472 	struct sock *sk = v;
3473 
3474 	if (sk)
3475 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3476 }
3477 
3478 static int unix_seq_show(struct seq_file *seq, void *v)
3479 {
3481 	if (v == SEQ_START_TOKEN)
3482 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3483 			 "Inode Path\n");
3484 	else {
3485 		struct sock *s = v;
3486 		struct unix_sock *u = unix_sk(s);
3487 		unix_state_lock(s);
3488 
3489 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3490 			s,
3491 			refcount_read(&s->sk_refcnt),
3492 			0,
3493 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3494 			s->sk_type,
3495 			s->sk_socket ?
3496 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3497 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3498 			sock_i_ino(s));
3499 
3500 		if (u->addr) {	/* under a hash table lock here */
3501 			int i, len;
3502 			seq_putc(seq, ' ');
3503 
3504 			i = 0;
3505 			len = u->addr->len -
3506 				offsetof(struct sockaddr_un, sun_path);
3507 			if (u->addr->name->sun_path[0]) {
3508 				len--;
3509 			} else {
3510 				seq_putc(seq, '@');
3511 				i++;
3512 			}
3513 			for ( ; i < len; i++)
3514 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3515 					 '@');
3516 		}
3517 		unix_state_unlock(s);
3518 		seq_putc(seq, '\n');
3519 	}
3520 
3521 	return 0;
3522 }
3523 
3524 static const struct seq_operations unix_seq_ops = {
3525 	.start  = unix_seq_start,
3526 	.next   = unix_seq_next,
3527 	.stop   = unix_seq_stop,
3528 	.show   = unix_seq_show,
3529 };
3530 
3531 #ifdef CONFIG_BPF_SYSCALL
3532 struct bpf_unix_iter_state {
3533 	struct seq_net_private p;
3534 	unsigned int cur_sk;
3535 	unsigned int end_sk;
3536 	unsigned int max_sk;
3537 	struct sock **batch;
3538 	bool st_bucket_done;
3539 };
3540 
3541 struct bpf_iter__unix {
3542 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3543 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3544 	uid_t uid __aligned(8);
3545 };
3546 
3547 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3548 			      struct unix_sock *unix_sk, uid_t uid)
3549 {
3550 	struct bpf_iter__unix ctx;
3551 
3552 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3553 	ctx.meta = meta;
3554 	ctx.unix_sk = unix_sk;
3555 	ctx.uid = uid;
3556 	return bpf_iter_run_prog(prog, &ctx);
3557 }
3558 
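/* Hold a reference on up to iter->max_sk sockets starting at start_sk, drop
 * the bucket lock, and return how many sockets there are from start_sk to the
 * end of the bucket so the caller can grow the batch and retry if they did
 * not all fit.
 */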
3559 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3561 {
3562 	struct bpf_unix_iter_state *iter = seq->private;
3563 	unsigned int expected = 1;
3564 	struct sock *sk;
3565 
3566 	sock_hold(start_sk);
3567 	iter->batch[iter->end_sk++] = start_sk;
3568 
3569 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3570 		if (iter->end_sk < iter->max_sk) {
3571 			sock_hold(sk);
3572 			iter->batch[iter->end_sk++] = sk;
3573 		}
3574 
3575 		expected++;
3576 	}
3577 
3578 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3579 
3580 	return expected;
3581 }
3582 
3583 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3584 {
3585 	while (iter->cur_sk < iter->end_sk)
3586 		sock_put(iter->batch[iter->cur_sk++]);
3587 }
3588 
3589 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3590 				       unsigned int new_batch_sz)
3591 {
3592 	struct sock **new_batch;
3593 
3594 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3595 			     GFP_USER | __GFP_NOWARN);
3596 	if (!new_batch)
3597 		return -ENOMEM;
3598 
3599 	bpf_iter_unix_put_batch(iter);
3600 	kvfree(iter->batch);
3601 	iter->batch = new_batch;
3602 	iter->max_sk = new_batch_sz;
3603 
3604 	return 0;
3605 }
3606 
3607 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3608 					loff_t *pos)
3609 {
3610 	struct bpf_unix_iter_state *iter = seq->private;
3611 	unsigned int expected;
3612 	bool resized = false;
3613 	struct sock *sk;
3614 
3615 	if (iter->st_bucket_done)
3616 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3617 
3618 again:
3619 	/* Get a new batch */
3620 	iter->cur_sk = 0;
3621 	iter->end_sk = 0;
3622 
3623 	sk = unix_get_first(seq, pos);
3624 	if (!sk)
3625 		return NULL; /* Done */
3626 
3627 	expected = bpf_iter_unix_hold_batch(seq, sk);
3628 
3629 	if (iter->end_sk == expected) {
3630 		iter->st_bucket_done = true;
3631 		return sk;
3632 	}
3633 
3634 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3635 		resized = true;
3636 		goto again;
3637 	}
3638 
3639 	return sk;
3640 }
3641 
3642 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3643 {
3644 	if (!*pos)
3645 		return SEQ_START_TOKEN;
3646 
3647 	/* bpf iter does not support lseek, so it always
3648 	 * continues from where it was stop()-ped.
3649 	 */
3650 	return bpf_iter_unix_batch(seq, pos);
3651 }
3652 
3653 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3654 {
3655 	struct bpf_unix_iter_state *iter = seq->private;
3656 	struct sock *sk;
3657 
3658 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
3659 	 * been through seq_show(), so advance to the next sk in
3660 	 * the batch.
3661 	 */
3662 	if (iter->cur_sk < iter->end_sk)
3663 		sock_put(iter->batch[iter->cur_sk++]);
3664 
3665 	++*pos;
3666 
3667 	if (iter->cur_sk < iter->end_sk)
3668 		sk = iter->batch[iter->cur_sk];
3669 	else
3670 		sk = bpf_iter_unix_batch(seq, pos);
3671 
3672 	return sk;
3673 }
3674 
3675 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3676 {
3677 	struct bpf_iter_meta meta;
3678 	struct bpf_prog *prog;
3679 	struct sock *sk = v;
3680 	uid_t uid;
3681 	bool slow;
3682 	int ret;
3683 
3684 	if (v == SEQ_START_TOKEN)
3685 		return 0;
3686 
3687 	slow = lock_sock_fast(sk);
3688 
3689 	if (unlikely(sk_unhashed(sk))) {
3690 		ret = SEQ_SKIP;
3691 		goto unlock;
3692 	}
3693 
3694 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3695 	meta.seq = seq;
3696 	prog = bpf_iter_get_info(&meta, false);
3697 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3698 unlock:
3699 	unlock_sock_fast(sk, slow);
3700 	return ret;
3701 }
3702 
3703 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3704 {
3705 	struct bpf_unix_iter_state *iter = seq->private;
3706 	struct bpf_iter_meta meta;
3707 	struct bpf_prog *prog;
3708 
3709 	if (!v) {
3710 		meta.seq = seq;
3711 		prog = bpf_iter_get_info(&meta, true);
3712 		if (prog)
3713 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3714 	}
3715 
3716 	if (iter->cur_sk < iter->end_sk)
3717 		bpf_iter_unix_put_batch(iter);
3718 }
3719 
3720 static const struct seq_operations bpf_iter_unix_seq_ops = {
3721 	.start	= bpf_iter_unix_seq_start,
3722 	.next	= bpf_iter_unix_seq_next,
3723 	.stop	= bpf_iter_unix_seq_stop,
3724 	.show	= bpf_iter_unix_seq_show,
3725 };
3726 #endif
3727 #endif
3728 
3729 static const struct net_proto_family unix_family_ops = {
3730 	.family = PF_UNIX,
3731 	.create = unix_create,
3732 	.owner	= THIS_MODULE,
3733 };
3734 
3735 
3736 static int __net_init unix_net_init(struct net *net)
3737 {
3738 	int i;
3739 
3740 	net->unx.sysctl_max_dgram_qlen = 10;
3741 	if (unix_sysctl_register(net))
3742 		goto out;
3743 
3744 #ifdef CONFIG_PROC_FS
3745 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3746 			     sizeof(struct seq_net_private)))
3747 		goto err_sysctl;
3748 #endif
3749 
3750 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3751 					      sizeof(spinlock_t), GFP_KERNEL);
3752 	if (!net->unx.table.locks)
3753 		goto err_proc;
3754 
3755 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3756 						sizeof(struct hlist_head),
3757 						GFP_KERNEL);
3758 	if (!net->unx.table.buckets)
3759 		goto free_locks;
3760 
3761 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3762 		spin_lock_init(&net->unx.table.locks[i]);
3763 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3764 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3765 	}
3766 
3767 	return 0;
3768 
3769 free_locks:
3770 	kvfree(net->unx.table.locks);
3771 err_proc:
3772 #ifdef CONFIG_PROC_FS
3773 	remove_proc_entry("unix", net->proc_net);
3774 err_sysctl:
3775 #endif
3776 	unix_sysctl_unregister(net);
3777 out:
3778 	return -ENOMEM;
3779 }
3780 
3781 static void __net_exit unix_net_exit(struct net *net)
3782 {
3783 	kvfree(net->unx.table.buckets);
3784 	kvfree(net->unx.table.locks);
3785 	unix_sysctl_unregister(net);
3786 	remove_proc_entry("unix", net->proc_net);
3787 }
3788 
3789 static struct pernet_operations unix_net_ops = {
3790 	.init = unix_net_init,
3791 	.exit = unix_net_exit,
3792 };
3793 
3794 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3795 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3796 		     struct unix_sock *unix_sk, uid_t uid)
3797 
3798 #define INIT_BATCH_SZ 16
3799 
3800 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3801 {
3802 	struct bpf_unix_iter_state *iter = priv_data;
3803 	int err;
3804 
3805 	err = bpf_iter_init_seq_net(priv_data, aux);
3806 	if (err)
3807 		return err;
3808 
3809 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3810 	if (err) {
3811 		bpf_iter_fini_seq_net(priv_data);
3812 		return err;
3813 	}
3814 
3815 	return 0;
3816 }
3817 
3818 static void bpf_iter_fini_unix(void *priv_data)
3819 {
3820 	struct bpf_unix_iter_state *iter = priv_data;
3821 
3822 	bpf_iter_fini_seq_net(priv_data);
3823 	kvfree(iter->batch);
3824 }
3825 
3826 static const struct bpf_iter_seq_info unix_seq_info = {
3827 	.seq_ops		= &bpf_iter_unix_seq_ops,
3828 	.init_seq_private	= bpf_iter_init_unix,
3829 	.fini_seq_private	= bpf_iter_fini_unix,
3830 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3831 };
3832 
3833 static const struct bpf_func_proto *
3834 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3835 			     const struct bpf_prog *prog)
3836 {
3837 	switch (func_id) {
3838 	case BPF_FUNC_setsockopt:
3839 		return &bpf_sk_setsockopt_proto;
3840 	case BPF_FUNC_getsockopt:
3841 		return &bpf_sk_getsockopt_proto;
3842 	default:
3843 		return NULL;
3844 	}
3845 }
3846 
3847 static struct bpf_iter_reg unix_reg_info = {
3848 	.target			= "unix",
3849 	.ctx_arg_info_size	= 1,
3850 	.ctx_arg_info		= {
3851 		{ offsetof(struct bpf_iter__unix, unix_sk),
3852 		  PTR_TO_BTF_ID_OR_NULL },
3853 	},
3854 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3855 	.seq_info		= &unix_seq_info,
3856 };
3857 
3858 static void __init bpf_iter_register(void)
3859 {
3860 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3861 	if (bpf_iter_reg_target(&unix_reg_info))
3862 		pr_warn("Warning: could not register bpf iterator unix\n");
3863 }
3864 #endif
3865 
3866 static int __init af_unix_init(void)
3867 {
3868 	int i, rc = -1;
3869 
3870 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3871 
3872 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3873 		spin_lock_init(&bsd_socket_locks[i]);
3874 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3875 	}
3876 
3877 	rc = proto_register(&unix_dgram_proto, 1);
3878 	if (rc != 0) {
3879 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3880 		goto out;
3881 	}
3882 
3883 	rc = proto_register(&unix_stream_proto, 1);
3884 	if (rc != 0) {
3885 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3886 		proto_unregister(&unix_dgram_proto);
3887 		goto out;
3888 	}
3889 
3890 	sock_register(&unix_family_ops);
3891 	register_pernet_subsys(&unix_net_ops);
3892 	unix_bpf_build_proto();
3893 
3894 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3895 	bpf_iter_register();
3896 #endif
3897 
3898 out:
3899 	return rc;
3900 }
3901 
3902 /* Later than subsys_initcall() because we depend on stuff initialised there */
3903 fs_initcall(af_unix_init);
3904