xref: /linux/net/unix/af_unix.c (revision e35ba5811714b7e5a73f98100ab8112a8176f84c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with a zero byte, so that this name space does not intersect
75  *		  with BSD names.
76  */
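/* A minimal userspace sketch of the two bind() flavours described above
 * (illustrative only; the path and the abstract name are hypothetical, the
 * two binds are independent examples, and <sys/socket.h>, <sys/un.h>,
 * <stddef.h> and <string.h> are assumed):
 *
 *	struct sockaddr_un un = { .sun_family = AF_UNIX };
 *
 *	// Filesystem binding: NUL-terminated path, creates a socket inode.
 *	int fs_fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *	strcpy(un.sun_path, "/tmp/example.sock");
 *	bind(fs_fd, (struct sockaddr *)&un, sizeof(un));
 *
 *	// Abstract binding: sun_path starts with a zero byte, no inode is
 *	// created and every byte up to the passed length is significant.
 *	int ab_fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *	memcpy(un.sun_path, "\0example", 8);
 *	bind(ab_fd, (struct sockaddr *)&un,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */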
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    the hash table is protected with a spinlock.
127  *    each socket's state is protected by a separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
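/* Layout implied by the three helpers above: unbound sockets hash into
 * buckets [0, UNIX_HASH_MOD] of the per-net table and abstract sockets into
 * [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1], so the two kinds never share a
 * bucket.  Pathname (BSD) sockets reuse the [0, UNIX_HASH_MOD] range via
 * their inode number and are additionally linked into the global
 * bsd_socket_buckets[] above for lookup by inode.
 */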
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	if (hash1 == hash2) {
163 		spin_lock(&net->unx.table.locks[hash1]);
164 		return;
165 	}
166 
167 	if (hash1 > hash2)
168 		swap(hash1, hash2);
169 
170 	spin_lock(&net->unx.table.locks[hash1]);
171 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173 
174 static void unix_table_double_unlock(struct net *net,
175 				     unsigned int hash1, unsigned int hash2)
176 {
177 	if (hash1 == hash2) {
178 		spin_unlock(&net->unx.table.locks[hash1]);
179 		return;
180 	}
181 
182 	spin_unlock(&net->unx.table.locks[hash1]);
183 	spin_unlock(&net->unx.table.locks[hash2]);
184 }
185 
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189 	UNIXCB(skb).secid = scm->secid;
190 }
191 
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194 	scm->secid = UNIXCB(skb).secid;
195 }
196 
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199 	return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204 
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207 
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210 	return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213 
214 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215 {
216 	return unix_peer(osk) == sk;
217 }
218 
219 static inline int unix_may_send(struct sock *sk, struct sock *osk)
220 {
221 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222 }
223 
224 static inline int unix_recvq_full(const struct sock *sk)
225 {
226 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228 
229 static inline int unix_recvq_full_lockless(const struct sock *sk)
230 {
231 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
232 		READ_ONCE(sk->sk_max_ack_backlog);
233 }
234 
235 struct sock *unix_peer_get(struct sock *s)
236 {
237 	struct sock *peer;
238 
239 	unix_state_lock(s);
240 	peer = unix_peer(s);
241 	if (peer)
242 		sock_hold(peer);
243 	unix_state_unlock(s);
244 	return peer;
245 }
246 EXPORT_SYMBOL_GPL(unix_peer_get);
247 
248 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
249 					     int addr_len)
250 {
251 	struct unix_address *addr;
252 
253 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
254 	if (!addr)
255 		return NULL;
256 
257 	refcount_set(&addr->refcnt, 1);
258 	addr->len = addr_len;
259 	memcpy(addr->name, sunaddr, addr_len);
260 
261 	return addr;
262 }
263 
264 static inline void unix_release_addr(struct unix_address *addr)
265 {
266 	if (refcount_dec_and_test(&addr->refcnt))
267 		kfree(addr);
268 }
269 
270 /*
271  *	Check unix socket name:
272  *		- should not be zero length.
273  *		- if it does not start with a zero byte, it should be NUL terminated (FS object)
274  *		- if it starts with a zero byte, it is an abstract name.
275  */
276 
277 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
278 {
279 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
280 	    addr_len > sizeof(*sunaddr))
281 		return -EINVAL;
282 
283 	if (sunaddr->sun_family != AF_UNIX)
284 		return -EINVAL;
285 
286 	return 0;
287 }
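/* For illustration (the names here are hypothetical): a pathname address
 * "/tmp/x" may be passed with any addr_len from
 * offsetof(struct sockaddr_un, sun_path) + 6 up to sizeof(struct sockaddr_un),
 * with or without the trailing NUL, since unix_mkname_bsd() below recomputes
 * the length; an abstract address "\0x" must be passed with
 * addr_len == offsetof(struct sockaddr_un, sun_path) + 2, because every byte
 * of the name, including the leading zero, is significant.
 */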
288 
289 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
290 {
291 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
292 	short offset = offsetof(struct sockaddr_storage, __data);
293 
294 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
295 
296 	/* This may look like an off by one error but it is a bit more
297 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
298 	 * sun_path[108] doesn't as such exist.  However in kernel space
299 	 * we are guaranteed that it is a valid memory location in our
300 	 * kernel address buffer because syscall functions always pass
301 	 * a pointer of struct sockaddr_storage which has a bigger buffer
302 	 * than 108.  Also, we must terminate sun_path for strlen() in
303 	 * getname_kernel().
304 	 */
305 	addr->__data[addr_len - offset] = 0;
306 
307 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
308 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
309 	 * know the actual buffer.
310 	 */
311 	return strlen(addr->__data) + offset + 1;
312 }
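/* Worked example of the termination above: for the longest possible binding,
 * addr_len == sizeof(struct sockaddr_un), sun_path corresponds to
 * __data[0..107], and the 0 is written at __data[addr_len - offset], i.e.
 * __data[108], one byte past sun_path but still inside the sockaddr_storage
 * buffer the syscall layer copied the address into.
 */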
313 
314 static void __unix_remove_socket(struct sock *sk)
315 {
316 	sk_del_node_init(sk);
317 }
318 
319 static void __unix_insert_socket(struct net *net, struct sock *sk)
320 {
321 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
322 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
323 }
324 
325 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
326 				 struct unix_address *addr, unsigned int hash)
327 {
328 	__unix_remove_socket(sk);
329 	smp_store_release(&unix_sk(sk)->addr, addr);
330 
331 	sk->sk_hash = hash;
332 	__unix_insert_socket(net, sk);
333 }
334 
335 static void unix_remove_socket(struct net *net, struct sock *sk)
336 {
337 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
338 	__unix_remove_socket(sk);
339 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340 }
341 
342 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
343 {
344 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
345 	__unix_insert_socket(net, sk);
346 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
347 }
348 
349 static void unix_insert_bsd_socket(struct sock *sk)
350 {
351 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
352 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
353 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
354 }
355 
356 static void unix_remove_bsd_socket(struct sock *sk)
357 {
358 	if (!hlist_unhashed(&sk->sk_bind_node)) {
359 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
360 		__sk_del_bind_node(sk);
361 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
362 
363 		sk_node_init(&sk->sk_bind_node);
364 	}
365 }
366 
367 static struct sock *__unix_find_socket_byname(struct net *net,
368 					      struct sockaddr_un *sunname,
369 					      int len, unsigned int hash)
370 {
371 	struct sock *s;
372 
373 	sk_for_each(s, &net->unx.table.buckets[hash]) {
374 		struct unix_sock *u = unix_sk(s);
375 
376 		if (u->addr->len == len &&
377 		    !memcmp(u->addr->name, sunname, len))
378 			return s;
379 	}
380 	return NULL;
381 }
382 
383 static inline struct sock *unix_find_socket_byname(struct net *net,
384 						   struct sockaddr_un *sunname,
385 						   int len, unsigned int hash)
386 {
387 	struct sock *s;
388 
389 	spin_lock(&net->unx.table.locks[hash]);
390 	s = __unix_find_socket_byname(net, sunname, len, hash);
391 	if (s)
392 		sock_hold(s);
393 	spin_unlock(&net->unx.table.locks[hash]);
394 	return s;
395 }
396 
397 static struct sock *unix_find_socket_byinode(struct inode *i)
398 {
399 	unsigned int hash = unix_bsd_hash(i);
400 	struct sock *s;
401 
402 	spin_lock(&bsd_socket_locks[hash]);
403 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
404 		struct dentry *dentry = unix_sk(s)->path.dentry;
405 
406 		if (dentry && d_backing_inode(dentry) == i) {
407 			sock_hold(s);
408 			spin_unlock(&bsd_socket_locks[hash]);
409 			return s;
410 		}
411 	}
412 	spin_unlock(&bsd_socket_locks[hash]);
413 	return NULL;
414 }
415 
416 /* Support code for asymmetrically connected dgram sockets
417  *
418  * If a datagram socket is connected to a socket not itself connected
419  * to the first socket (e.g., /dev/log), clients may only enqueue more
420  * messages if the present receive queue of the server socket is not
421  * "too large". This means there's a second writeability condition
422  * poll and sendmsg need to test. The dgram recv code will do a wake
423  * up on the peer_wait wait queue of a socket upon reception of a
424  * datagram; this wake-up needs to be propagated to sleeping would-be writers
425  * since these might not have sent anything so far. This can't be
426  * accomplished via poll_wait because the lifetime of the server
427  * socket might be less than that of its clients if these break their
428  * association with it or if the server socket is closed while clients
429  * are still connected to it and there's no way to inform "a polling
430  * implementation" that it should let go of a certain wait queue
431  *
432  * In order to propagate a wake up, a wait_queue_entry_t of the client
433  * socket is enqueued on the peer_wait queue of the server socket
434  * whose wake function does a wake_up on the ordinary client socket
435  * wait queue. This connection is established whenever a write (or
436  * poll for write) hits the flow control condition and is broken when the
437  * association to the server socket is dissolved or after a wake up
438  * was relayed.
439  */
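/* A minimal userspace sketch of the situation described above, assuming a
 * hypothetical datagram server bound to "/run/example-log" that is slow to
 * drain its queue (<poll.h>, <sys/socket.h> and <sys/un.h> assumed):
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct sockaddr_un srv = { .sun_family = AF_UNIX,
 *				   .sun_path = "/run/example-log" };
 *	connect(fd, (struct sockaddr *)&srv, sizeof(srv));
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);	// sleeps while the server's queue is full;
 *				// woken via the peer_wait relay below once
 *				// the server reads and frees queue space
 *	send(fd, "msg", 3, 0);
 */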
440 
441 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
442 				      void *key)
443 {
444 	struct unix_sock *u;
445 	wait_queue_head_t *u_sleep;
446 
447 	u = container_of(q, struct unix_sock, peer_wake);
448 
449 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
450 			    q);
451 	u->peer_wake.private = NULL;
452 
453 	/* relaying can only happen while the wq still exists */
454 	u_sleep = sk_sleep(&u->sk);
455 	if (u_sleep)
456 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
457 
458 	return 0;
459 }
460 
461 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
462 {
463 	struct unix_sock *u, *u_other;
464 	int rc;
465 
466 	u = unix_sk(sk);
467 	u_other = unix_sk(other);
468 	rc = 0;
469 	spin_lock(&u_other->peer_wait.lock);
470 
471 	if (!u->peer_wake.private) {
472 		u->peer_wake.private = other;
473 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
474 
475 		rc = 1;
476 	}
477 
478 	spin_unlock(&u_other->peer_wait.lock);
479 	return rc;
480 }
481 
482 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
483 					    struct sock *other)
484 {
485 	struct unix_sock *u, *u_other;
486 
487 	u = unix_sk(sk);
488 	u_other = unix_sk(other);
489 	spin_lock(&u_other->peer_wait.lock);
490 
491 	if (u->peer_wake.private == other) {
492 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
493 		u->peer_wake.private = NULL;
494 	}
495 
496 	spin_unlock(&u_other->peer_wait.lock);
497 }
498 
499 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
500 						   struct sock *other)
501 {
502 	unix_dgram_peer_wake_disconnect(sk, other);
503 	wake_up_interruptible_poll(sk_sleep(sk),
504 				   EPOLLOUT |
505 				   EPOLLWRNORM |
506 				   EPOLLWRBAND);
507 }
508 
509 /* preconditions:
510  *	- unix_peer(sk) == other
511  *	- association is stable
512  */
513 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
514 {
515 	int connected;
516 
517 	connected = unix_dgram_peer_wake_connect(sk, other);
518 
519 	/* If other is SOCK_DEAD, we want to make sure we signal
520 	 * POLLOUT, such that a subsequent write() can get a
521 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
522 	 * to other and it's full, we will hang waiting for POLLOUT.
523 	 */
524 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
525 		return 1;
526 
527 	if (connected)
528 		unix_dgram_peer_wake_disconnect(sk, other);
529 
530 	return 0;
531 }
532 
533 static int unix_writable(const struct sock *sk)
534 {
535 	return sk->sk_state != TCP_LISTEN &&
536 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
537 }
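/* In other words (the shift by two is a multiply by four): the socket counts
 * as writable only while sk_wmem_alloc stays at or below a quarter of
 * sk_sndbuf.  For example, with a typical 212992-byte default send buffer,
 * roughly 53 kB of queued skb truesize is already enough for poll() to stop
 * reporting EPOLLOUT.
 */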
538 
539 static void unix_write_space(struct sock *sk)
540 {
541 	struct socket_wq *wq;
542 
543 	rcu_read_lock();
544 	if (unix_writable(sk)) {
545 		wq = rcu_dereference(sk->sk_wq);
546 		if (skwq_has_sleeper(wq))
547 			wake_up_interruptible_sync_poll(&wq->wait,
548 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
549 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
550 	}
551 	rcu_read_unlock();
552 }
553 
554 /* When a dgram socket disconnects (or changes its peer), we clear its receive
555  * queue of packets that arrived from the previous peer. First, this allows us
556  * to do flow control based only on wmem_alloc; second, an sk connected to a
557  * peer may receive messages only from that peer. */
558 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
559 {
560 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
561 		skb_queue_purge(&sk->sk_receive_queue);
562 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
563 
564 		/* If one link of a bidirectional dgram pipe is disconnected,
565 		 * we signal an error. Messages are lost. Do not do this
566 		 * when the peer was not connected to us.
567 		 */
568 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
569 			WRITE_ONCE(other->sk_err, ECONNRESET);
570 			sk_error_report(other);
571 		}
572 	}
573 	other->sk_state = TCP_CLOSE;
574 }
575 
576 static void unix_sock_destructor(struct sock *sk)
577 {
578 	struct unix_sock *u = unix_sk(sk);
579 
580 	skb_queue_purge(&sk->sk_receive_queue);
581 
582 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
583 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
584 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
585 	if (!sock_flag(sk, SOCK_DEAD)) {
586 		pr_info("Attempt to release alive unix socket: %p\n", sk);
587 		return;
588 	}
589 
590 	if (u->addr)
591 		unix_release_addr(u->addr);
592 
593 	atomic_long_dec(&unix_nr_socks);
594 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
595 #ifdef UNIX_REFCNT_DEBUG
596 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
597 		atomic_long_read(&unix_nr_socks));
598 #endif
599 }
600 
601 static void unix_release_sock(struct sock *sk, int embrion)
602 {
603 	struct unix_sock *u = unix_sk(sk);
604 	struct sock *skpair;
605 	struct sk_buff *skb;
606 	struct path path;
607 	int state;
608 
609 	unix_remove_socket(sock_net(sk), sk);
610 	unix_remove_bsd_socket(sk);
611 
612 	/* Clear state */
613 	unix_state_lock(sk);
614 	sock_orphan(sk);
615 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
616 	path	     = u->path;
617 	u->path.dentry = NULL;
618 	u->path.mnt = NULL;
619 	state = sk->sk_state;
620 	sk->sk_state = TCP_CLOSE;
621 
622 	skpair = unix_peer(sk);
623 	unix_peer(sk) = NULL;
624 
625 	unix_state_unlock(sk);
626 
627 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
628 	if (u->oob_skb) {
629 		kfree_skb(u->oob_skb);
630 		u->oob_skb = NULL;
631 	}
632 #endif
633 
634 	wake_up_interruptible_all(&u->peer_wait);
635 
636 	if (skpair != NULL) {
637 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
638 			unix_state_lock(skpair);
639 			/* No more writes */
640 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
641 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
642 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
643 			unix_state_unlock(skpair);
644 			skpair->sk_state_change(skpair);
645 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
646 		}
647 
648 		unix_dgram_peer_wake_disconnect(sk, skpair);
649 		sock_put(skpair); /* It may now die */
650 	}
651 
652 	/* Try to flush out this socket. Throw out buffers at least */
653 
654 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
655 		if (state == TCP_LISTEN)
656 			unix_release_sock(skb->sk, 1);
657 		/* passed fds are erased in the kfree_skb hook	      */
658 		UNIXCB(skb).consumed = skb->len;
659 		kfree_skb(skb);
660 	}
661 
662 	if (path.dentry)
663 		path_put(&path);
664 
665 	sock_put(sk);
666 
667 	/* ---- Socket is dead now and most probably destroyed ---- */
668 
669 	/*
670 	 * Fixme: BSD difference: In BSD all sockets connected to us get
671 	 *	  ECONNRESET and we die on the spot. In Linux we behave
672 	 *	  like files and pipes do and wait for the last
673 	 *	  dereference.
674 	 *
675 	 * Can't we simply set sock->err?
676 	 *
677 	 *	  What is the above comment talking about? --ANK(980817)
678 	 */
679 
680 	if (READ_ONCE(unix_tot_inflight))
681 		unix_gc();		/* Garbage collect fds */
682 }
683 
684 static void init_peercred(struct sock *sk)
685 {
686 	const struct cred *old_cred;
687 	struct pid *old_pid;
688 
689 	spin_lock(&sk->sk_peer_lock);
690 	old_pid = sk->sk_peer_pid;
691 	old_cred = sk->sk_peer_cred;
692 	sk->sk_peer_pid  = get_pid(task_tgid(current));
693 	sk->sk_peer_cred = get_current_cred();
694 	spin_unlock(&sk->sk_peer_lock);
695 
696 	put_pid(old_pid);
697 	put_cred(old_cred);
698 }
699 
700 static void copy_peercred(struct sock *sk, struct sock *peersk)
701 {
702 	const struct cred *old_cred;
703 	struct pid *old_pid;
704 
705 	if (sk < peersk) {
706 		spin_lock(&sk->sk_peer_lock);
707 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
708 	} else {
709 		spin_lock(&peersk->sk_peer_lock);
710 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
711 	}
712 	old_pid = sk->sk_peer_pid;
713 	old_cred = sk->sk_peer_cred;
714 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
715 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
716 
717 	spin_unlock(&sk->sk_peer_lock);
718 	spin_unlock(&peersk->sk_peer_lock);
719 
720 	put_pid(old_pid);
721 	put_cred(old_cred);
722 }
723 
724 static int unix_listen(struct socket *sock, int backlog)
725 {
726 	int err;
727 	struct sock *sk = sock->sk;
728 	struct unix_sock *u = unix_sk(sk);
729 
730 	err = -EOPNOTSUPP;
731 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
732 		goto out;	/* Only stream/seqpacket sockets accept */
733 	err = -EINVAL;
734 	if (!u->addr)
735 		goto out;	/* No listens on an unbound socket */
736 	unix_state_lock(sk);
737 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
738 		goto out_unlock;
739 	if (backlog > sk->sk_max_ack_backlog)
740 		wake_up_interruptible_all(&u->peer_wait);
741 	sk->sk_max_ack_backlog	= backlog;
742 	sk->sk_state		= TCP_LISTEN;
743 	/* set credentials so connect can copy them */
744 	init_peercred(sk);
745 	err = 0;
746 
747 out_unlock:
748 	unix_state_unlock(sk);
749 out:
750 	return err;
751 }
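/* Userspace view of the init_peercred() call above (sketch, error handling
 * omitted; connected_fd is a hypothetical, already connected client socket
 * and struct ucred needs _GNU_SOURCE plus <sys/socket.h>): once connect()
 * succeeds, the client can read the credentials the server captured when it
 * called listen(), copied over by copy_peercred() in unix_stream_connect()
 * further down:
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	getsockopt(connected_fd, SOL_SOCKET, SO_PEERCRED, &peer, &len);
 *	// peer.pid, peer.uid and peer.gid describe the listening task.
 */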
752 
753 static int unix_release(struct socket *);
754 static int unix_bind(struct socket *, struct sockaddr *, int);
755 static int unix_stream_connect(struct socket *, struct sockaddr *,
756 			       int addr_len, int flags);
757 static int unix_socketpair(struct socket *, struct socket *);
758 static int unix_accept(struct socket *, struct socket *, int, bool);
759 static int unix_getname(struct socket *, struct sockaddr *, int);
760 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
761 static __poll_t unix_dgram_poll(struct file *, struct socket *,
762 				    poll_table *);
763 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
764 #ifdef CONFIG_COMPAT
765 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
766 #endif
767 static int unix_shutdown(struct socket *, int);
768 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
771 				       struct pipe_inode_info *, size_t size,
772 				       unsigned int flags);
773 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
774 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
775 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
776 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
777 static int unix_dgram_connect(struct socket *, struct sockaddr *,
778 			      int, int);
779 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
780 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
781 				  int);
782 
783 static int unix_set_peek_off(struct sock *sk, int val)
784 {
785 	struct unix_sock *u = unix_sk(sk);
786 
787 	if (mutex_lock_interruptible(&u->iolock))
788 		return -EINTR;
789 
790 	WRITE_ONCE(sk->sk_peek_off, val);
791 	mutex_unlock(&u->iolock);
792 
793 	return 0;
794 }
795 
796 #ifdef CONFIG_PROC_FS
797 static int unix_count_nr_fds(struct sock *sk)
798 {
799 	struct sk_buff *skb;
800 	struct unix_sock *u;
801 	int nr_fds = 0;
802 
803 	spin_lock(&sk->sk_receive_queue.lock);
804 	skb = skb_peek(&sk->sk_receive_queue);
805 	while (skb) {
806 		u = unix_sk(skb->sk);
807 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
808 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
809 	}
810 	spin_unlock(&sk->sk_receive_queue.lock);
811 
812 	return nr_fds;
813 }
814 
815 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
816 {
817 	struct sock *sk = sock->sk;
818 	unsigned char s_state;
819 	struct unix_sock *u;
820 	int nr_fds = 0;
821 
822 	if (sk) {
823 		s_state = READ_ONCE(sk->sk_state);
824 		u = unix_sk(sk);
825 
826 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
827 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
828 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
829 		 */
830 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
831 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
832 		else if (s_state == TCP_LISTEN)
833 			nr_fds = unix_count_nr_fds(sk);
834 
835 		seq_printf(m, "scm_fds: %u\n", nr_fds);
836 	}
837 }
838 #else
839 #define unix_show_fdinfo NULL
840 #endif
841 
842 static const struct proto_ops unix_stream_ops = {
843 	.family =	PF_UNIX,
844 	.owner =	THIS_MODULE,
845 	.release =	unix_release,
846 	.bind =		unix_bind,
847 	.connect =	unix_stream_connect,
848 	.socketpair =	unix_socketpair,
849 	.accept =	unix_accept,
850 	.getname =	unix_getname,
851 	.poll =		unix_poll,
852 	.ioctl =	unix_ioctl,
853 #ifdef CONFIG_COMPAT
854 	.compat_ioctl =	unix_compat_ioctl,
855 #endif
856 	.listen =	unix_listen,
857 	.shutdown =	unix_shutdown,
858 	.sendmsg =	unix_stream_sendmsg,
859 	.recvmsg =	unix_stream_recvmsg,
860 	.read_skb =	unix_stream_read_skb,
861 	.mmap =		sock_no_mmap,
862 	.splice_read =	unix_stream_splice_read,
863 	.set_peek_off =	unix_set_peek_off,
864 	.show_fdinfo =	unix_show_fdinfo,
865 };
866 
867 static const struct proto_ops unix_dgram_ops = {
868 	.family =	PF_UNIX,
869 	.owner =	THIS_MODULE,
870 	.release =	unix_release,
871 	.bind =		unix_bind,
872 	.connect =	unix_dgram_connect,
873 	.socketpair =	unix_socketpair,
874 	.accept =	sock_no_accept,
875 	.getname =	unix_getname,
876 	.poll =		unix_dgram_poll,
877 	.ioctl =	unix_ioctl,
878 #ifdef CONFIG_COMPAT
879 	.compat_ioctl =	unix_compat_ioctl,
880 #endif
881 	.listen =	sock_no_listen,
882 	.shutdown =	unix_shutdown,
883 	.sendmsg =	unix_dgram_sendmsg,
884 	.read_skb =	unix_read_skb,
885 	.recvmsg =	unix_dgram_recvmsg,
886 	.mmap =		sock_no_mmap,
887 	.set_peek_off =	unix_set_peek_off,
888 	.show_fdinfo =	unix_show_fdinfo,
889 };
890 
891 static const struct proto_ops unix_seqpacket_ops = {
892 	.family =	PF_UNIX,
893 	.owner =	THIS_MODULE,
894 	.release =	unix_release,
895 	.bind =		unix_bind,
896 	.connect =	unix_stream_connect,
897 	.socketpair =	unix_socketpair,
898 	.accept =	unix_accept,
899 	.getname =	unix_getname,
900 	.poll =		unix_dgram_poll,
901 	.ioctl =	unix_ioctl,
902 #ifdef CONFIG_COMPAT
903 	.compat_ioctl =	unix_compat_ioctl,
904 #endif
905 	.listen =	unix_listen,
906 	.shutdown =	unix_shutdown,
907 	.sendmsg =	unix_seqpacket_sendmsg,
908 	.recvmsg =	unix_seqpacket_recvmsg,
909 	.mmap =		sock_no_mmap,
910 	.set_peek_off =	unix_set_peek_off,
911 	.show_fdinfo =	unix_show_fdinfo,
912 };
913 
914 static void unix_close(struct sock *sk, long timeout)
915 {
916 	/* Nothing to do here, unix socket does not need a ->close().
917 	 * This is merely for sockmap.
918 	 */
919 }
920 
921 static void unix_unhash(struct sock *sk)
922 {
923 	/* Nothing to do here, unix socket does not need a ->unhash().
924 	 * This is merely for sockmap.
925 	 */
926 }
927 
928 static bool unix_bpf_bypass_getsockopt(int level, int optname)
929 {
930 	if (level == SOL_SOCKET) {
931 		switch (optname) {
932 		case SO_PEERPIDFD:
933 			return true;
934 		default:
935 			return false;
936 		}
937 	}
938 
939 	return false;
940 }
941 
942 struct proto unix_dgram_proto = {
943 	.name			= "UNIX",
944 	.owner			= THIS_MODULE,
945 	.obj_size		= sizeof(struct unix_sock),
946 	.close			= unix_close,
947 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
948 #ifdef CONFIG_BPF_SYSCALL
949 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
950 #endif
951 };
952 
953 struct proto unix_stream_proto = {
954 	.name			= "UNIX-STREAM",
955 	.owner			= THIS_MODULE,
956 	.obj_size		= sizeof(struct unix_sock),
957 	.close			= unix_close,
958 	.unhash			= unix_unhash,
959 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
960 #ifdef CONFIG_BPF_SYSCALL
961 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
962 #endif
963 };
964 
965 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
966 {
967 	struct unix_sock *u;
968 	struct sock *sk;
969 	int err;
970 
971 	atomic_long_inc(&unix_nr_socks);
972 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
973 		err = -ENFILE;
974 		goto err;
975 	}
976 
977 	if (type == SOCK_STREAM)
978 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
979 	else /* dgram and seqpacket */
980 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
981 
982 	if (!sk) {
983 		err = -ENOMEM;
984 		goto err;
985 	}
986 
987 	sock_init_data(sock, sk);
988 
989 	sk->sk_hash		= unix_unbound_hash(sk);
990 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
991 	sk->sk_write_space	= unix_write_space;
992 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
993 	sk->sk_destruct		= unix_sock_destructor;
994 	u = unix_sk(sk);
995 	u->inflight = 0;
996 	u->path.dentry = NULL;
997 	u->path.mnt = NULL;
998 	spin_lock_init(&u->lock);
999 	INIT_LIST_HEAD(&u->link);
1000 	mutex_init(&u->iolock); /* single task reading lock */
1001 	mutex_init(&u->bindlock); /* single task binding lock */
1002 	init_waitqueue_head(&u->peer_wait);
1003 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1004 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1005 	unix_insert_unbound_socket(net, sk);
1006 
1007 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1008 
1009 	return sk;
1010 
1011 err:
1012 	atomic_long_dec(&unix_nr_socks);
1013 	return ERR_PTR(err);
1014 }
1015 
1016 static int unix_create(struct net *net, struct socket *sock, int protocol,
1017 		       int kern)
1018 {
1019 	struct sock *sk;
1020 
1021 	if (protocol && protocol != PF_UNIX)
1022 		return -EPROTONOSUPPORT;
1023 
1024 	sock->state = SS_UNCONNECTED;
1025 
1026 	switch (sock->type) {
1027 	case SOCK_STREAM:
1028 		sock->ops = &unix_stream_ops;
1029 		break;
1030 		/*
1031 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1032 		 *	nothing uses it.
1033 		 */
1034 	case SOCK_RAW:
1035 		sock->type = SOCK_DGRAM;
1036 		fallthrough;
1037 	case SOCK_DGRAM:
1038 		sock->ops = &unix_dgram_ops;
1039 		break;
1040 	case SOCK_SEQPACKET:
1041 		sock->ops = &unix_seqpacket_ops;
1042 		break;
1043 	default:
1044 		return -ESOCKTNOSUPPORT;
1045 	}
1046 
1047 	sk = unix_create1(net, sock, kern, sock->type);
1048 	if (IS_ERR(sk))
1049 		return PTR_ERR(sk);
1050 
1051 	return 0;
1052 }
1053 
1054 static int unix_release(struct socket *sock)
1055 {
1056 	struct sock *sk = sock->sk;
1057 
1058 	if (!sk)
1059 		return 0;
1060 
1061 	sk->sk_prot->close(sk, 0);
1062 	unix_release_sock(sk, 0);
1063 	sock->sk = NULL;
1064 
1065 	return 0;
1066 }
1067 
1068 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1069 				  int type)
1070 {
1071 	struct inode *inode;
1072 	struct path path;
1073 	struct sock *sk;
1074 	int err;
1075 
1076 	unix_mkname_bsd(sunaddr, addr_len);
1077 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1078 	if (err)
1079 		goto fail;
1080 
1081 	err = path_permission(&path, MAY_WRITE);
1082 	if (err)
1083 		goto path_put;
1084 
1085 	err = -ECONNREFUSED;
1086 	inode = d_backing_inode(path.dentry);
1087 	if (!S_ISSOCK(inode->i_mode))
1088 		goto path_put;
1089 
1090 	sk = unix_find_socket_byinode(inode);
1091 	if (!sk)
1092 		goto path_put;
1093 
1094 	err = -EPROTOTYPE;
1095 	if (sk->sk_type == type)
1096 		touch_atime(&path);
1097 	else
1098 		goto sock_put;
1099 
1100 	path_put(&path);
1101 
1102 	return sk;
1103 
1104 sock_put:
1105 	sock_put(sk);
1106 path_put:
1107 	path_put(&path);
1108 fail:
1109 	return ERR_PTR(err);
1110 }
1111 
1112 static struct sock *unix_find_abstract(struct net *net,
1113 				       struct sockaddr_un *sunaddr,
1114 				       int addr_len, int type)
1115 {
1116 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1117 	struct dentry *dentry;
1118 	struct sock *sk;
1119 
1120 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1121 	if (!sk)
1122 		return ERR_PTR(-ECONNREFUSED);
1123 
1124 	dentry = unix_sk(sk)->path.dentry;
1125 	if (dentry)
1126 		touch_atime(&unix_sk(sk)->path);
1127 
1128 	return sk;
1129 }
1130 
1131 static struct sock *unix_find_other(struct net *net,
1132 				    struct sockaddr_un *sunaddr,
1133 				    int addr_len, int type)
1134 {
1135 	struct sock *sk;
1136 
1137 	if (sunaddr->sun_path[0])
1138 		sk = unix_find_bsd(sunaddr, addr_len, type);
1139 	else
1140 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1141 
1142 	return sk;
1143 }
1144 
1145 static int unix_autobind(struct sock *sk)
1146 {
1147 	unsigned int new_hash, old_hash = sk->sk_hash;
1148 	struct unix_sock *u = unix_sk(sk);
1149 	struct net *net = sock_net(sk);
1150 	struct unix_address *addr;
1151 	u32 lastnum, ordernum;
1152 	int err;
1153 
1154 	err = mutex_lock_interruptible(&u->bindlock);
1155 	if (err)
1156 		return err;
1157 
1158 	if (u->addr)
1159 		goto out;
1160 
1161 	err = -ENOMEM;
1162 	addr = kzalloc(sizeof(*addr) +
1163 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1164 	if (!addr)
1165 		goto out;
1166 
1167 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1168 	addr->name->sun_family = AF_UNIX;
1169 	refcount_set(&addr->refcnt, 1);
1170 
1171 	ordernum = get_random_u32();
1172 	lastnum = ordernum & 0xFFFFF;
1173 retry:
1174 	ordernum = (ordernum + 1) & 0xFFFFF;
1175 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1176 
1177 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1178 	unix_table_double_lock(net, old_hash, new_hash);
1179 
1180 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1181 		unix_table_double_unlock(net, old_hash, new_hash);
1182 
1183 		/* __unix_find_socket_byname() may take a long time if many names
1184 		 * are already in use.
1185 		 */
1186 		cond_resched();
1187 
1188 		if (ordernum == lastnum) {
1189 			/* Give up if all names seem to be in use. */
1190 			err = -ENOSPC;
1191 			unix_release_addr(addr);
1192 			goto out;
1193 		}
1194 
1195 		goto retry;
1196 	}
1197 
1198 	__unix_set_addr_hash(net, sk, addr, new_hash);
1199 	unix_table_double_unlock(net, old_hash, new_hash);
1200 	err = 0;
1201 
1202 out:	mutex_unlock(&u->bindlock);
1203 	return err;
1204 }
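/* Autobind as seen from userspace (sketch; the hex digits are whatever value
 * the loop above settled on, shown only as a pattern):
 *
 *	struct sockaddr_un un = { .sun_family = AF_UNIX };
 *	socklen_t len = sizeof(un);
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *
 *	bind(fd, (struct sockaddr *)&un,
 *	     offsetof(struct sockaddr_un, sun_path));	// no name supplied
 *	getsockname(fd, (struct sockaddr *)&un, &len);
 *	// len == offsetof(struct sockaddr_un, sun_path) + 6, un.sun_path[0]
 *	// is '\0' and sun_path[1..5] hold five lowercase hex digits.
 */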
1205 
1206 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1207 			 int addr_len)
1208 {
1209 	umode_t mode = S_IFSOCK |
1210 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1211 	unsigned int new_hash, old_hash = sk->sk_hash;
1212 	struct unix_sock *u = unix_sk(sk);
1213 	struct net *net = sock_net(sk);
1214 	struct mnt_idmap *idmap;
1215 	struct unix_address *addr;
1216 	struct dentry *dentry;
1217 	struct path parent;
1218 	int err;
1219 
1220 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1221 	addr = unix_create_addr(sunaddr, addr_len);
1222 	if (!addr)
1223 		return -ENOMEM;
1224 
1225 	/*
1226 	 * Get the parent directory, calculate the hash for last
1227 	 * component.
1228 	 */
1229 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1230 	if (IS_ERR(dentry)) {
1231 		err = PTR_ERR(dentry);
1232 		goto out;
1233 	}
1234 
1235 	/*
1236 	 * All right, let's create it.
1237 	 */
1238 	idmap = mnt_idmap(parent.mnt);
1239 	err = security_path_mknod(&parent, dentry, mode, 0);
1240 	if (!err)
1241 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1242 	if (err)
1243 		goto out_path;
1244 	err = mutex_lock_interruptible(&u->bindlock);
1245 	if (err)
1246 		goto out_unlink;
1247 	if (u->addr)
1248 		goto out_unlock;
1249 
1250 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1251 	unix_table_double_lock(net, old_hash, new_hash);
1252 	u->path.mnt = mntget(parent.mnt);
1253 	u->path.dentry = dget(dentry);
1254 	__unix_set_addr_hash(net, sk, addr, new_hash);
1255 	unix_table_double_unlock(net, old_hash, new_hash);
1256 	unix_insert_bsd_socket(sk);
1257 	mutex_unlock(&u->bindlock);
1258 	done_path_create(&parent, dentry);
1259 	return 0;
1260 
1261 out_unlock:
1262 	mutex_unlock(&u->bindlock);
1263 	err = -EINVAL;
1264 out_unlink:
1265 	/* failed after successful mknod?  unlink what we'd created... */
1266 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1267 out_path:
1268 	done_path_create(&parent, dentry);
1269 out:
1270 	unix_release_addr(addr);
1271 	return err == -EEXIST ? -EADDRINUSE : err;
1272 }
1273 
1274 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1275 			      int addr_len)
1276 {
1277 	unsigned int new_hash, old_hash = sk->sk_hash;
1278 	struct unix_sock *u = unix_sk(sk);
1279 	struct net *net = sock_net(sk);
1280 	struct unix_address *addr;
1281 	int err;
1282 
1283 	addr = unix_create_addr(sunaddr, addr_len);
1284 	if (!addr)
1285 		return -ENOMEM;
1286 
1287 	err = mutex_lock_interruptible(&u->bindlock);
1288 	if (err)
1289 		goto out;
1290 
1291 	if (u->addr) {
1292 		err = -EINVAL;
1293 		goto out_mutex;
1294 	}
1295 
1296 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1297 	unix_table_double_lock(net, old_hash, new_hash);
1298 
1299 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1300 		goto out_spin;
1301 
1302 	__unix_set_addr_hash(net, sk, addr, new_hash);
1303 	unix_table_double_unlock(net, old_hash, new_hash);
1304 	mutex_unlock(&u->bindlock);
1305 	return 0;
1306 
1307 out_spin:
1308 	unix_table_double_unlock(net, old_hash, new_hash);
1309 	err = -EADDRINUSE;
1310 out_mutex:
1311 	mutex_unlock(&u->bindlock);
1312 out:
1313 	unix_release_addr(addr);
1314 	return err;
1315 }
1316 
1317 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1318 {
1319 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1320 	struct sock *sk = sock->sk;
1321 	int err;
1322 
1323 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1324 	    sunaddr->sun_family == AF_UNIX)
1325 		return unix_autobind(sk);
1326 
1327 	err = unix_validate_addr(sunaddr, addr_len);
1328 	if (err)
1329 		return err;
1330 
1331 	if (sunaddr->sun_path[0])
1332 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1333 	else
1334 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1335 
1336 	return err;
1337 }
1338 
1339 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1340 {
1341 	if (unlikely(sk1 == sk2) || !sk2) {
1342 		unix_state_lock(sk1);
1343 		return;
1344 	}
1345 	if (sk1 > sk2)
1346 		swap(sk1, sk2);
1347 
1348 	unix_state_lock(sk1);
1349 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1350 }
1351 
1352 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1353 {
1354 	if (unlikely(sk1 == sk2) || !sk2) {
1355 		unix_state_unlock(sk1);
1356 		return;
1357 	}
1358 	unix_state_unlock(sk1);
1359 	unix_state_unlock(sk2);
1360 }
1361 
1362 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1363 			      int alen, int flags)
1364 {
1365 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1366 	struct sock *sk = sock->sk;
1367 	struct sock *other;
1368 	int err;
1369 
1370 	err = -EINVAL;
1371 	if (alen < offsetofend(struct sockaddr, sa_family))
1372 		goto out;
1373 
1374 	if (addr->sa_family != AF_UNSPEC) {
1375 		err = unix_validate_addr(sunaddr, alen);
1376 		if (err)
1377 			goto out;
1378 
1379 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1380 		if (err)
1381 			goto out;
1382 
1383 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1384 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1385 		    !unix_sk(sk)->addr) {
1386 			err = unix_autobind(sk);
1387 			if (err)
1388 				goto out;
1389 		}
1390 
1391 restart:
1392 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1393 		if (IS_ERR(other)) {
1394 			err = PTR_ERR(other);
1395 			goto out;
1396 		}
1397 
1398 		unix_state_double_lock(sk, other);
1399 
1400 		/* Apparently VFS overslept socket death. Retry. */
1401 		if (sock_flag(other, SOCK_DEAD)) {
1402 			unix_state_double_unlock(sk, other);
1403 			sock_put(other);
1404 			goto restart;
1405 		}
1406 
1407 		err = -EPERM;
1408 		if (!unix_may_send(sk, other))
1409 			goto out_unlock;
1410 
1411 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1412 		if (err)
1413 			goto out_unlock;
1414 
1415 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1416 	} else {
1417 		/*
1418 		 *	1003.1g breaking connected state with AF_UNSPEC
1419 		 */
1420 		other = NULL;
1421 		unix_state_double_lock(sk, other);
1422 	}
1423 
1424 	/*
1425 	 * If it was connected, reconnect.
1426 	 */
1427 	if (unix_peer(sk)) {
1428 		struct sock *old_peer = unix_peer(sk);
1429 
1430 		unix_peer(sk) = other;
1431 		if (!other)
1432 			sk->sk_state = TCP_CLOSE;
1433 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1434 
1435 		unix_state_double_unlock(sk, other);
1436 
1437 		if (other != old_peer)
1438 			unix_dgram_disconnected(sk, old_peer);
1439 		sock_put(old_peer);
1440 	} else {
1441 		unix_peer(sk) = other;
1442 		unix_state_double_unlock(sk, other);
1443 	}
1444 
1445 	return 0;
1446 
1447 out_unlock:
1448 	unix_state_double_unlock(sk, other);
1449 	sock_put(other);
1450 out:
1451 	return err;
1452 }
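/* The AF_UNSPEC branch above, as used from userspace (sketch): a connected
 * datagram socket can dissolve its association, so that later sends must
 * name a destination again:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(dgram_fd, &sa, sizeof(sa));	// dgram_fd: hypothetical
 *						// connected SOCK_DGRAM socket
 */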
1453 
1454 static long unix_wait_for_peer(struct sock *other, long timeo)
1455 	__releases(&unix_sk(other)->lock)
1456 {
1457 	struct unix_sock *u = unix_sk(other);
1458 	int sched;
1459 	DEFINE_WAIT(wait);
1460 
1461 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1462 
1463 	sched = !sock_flag(other, SOCK_DEAD) &&
1464 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1465 		unix_recvq_full_lockless(other);
1466 
1467 	unix_state_unlock(other);
1468 
1469 	if (sched)
1470 		timeo = schedule_timeout(timeo);
1471 
1472 	finish_wait(&u->peer_wait, &wait);
1473 	return timeo;
1474 }
1475 
1476 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1477 			       int addr_len, int flags)
1478 {
1479 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1480 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1481 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1482 	struct net *net = sock_net(sk);
1483 	struct sk_buff *skb = NULL;
1484 	long timeo;
1485 	int err;
1486 	int st;
1487 
1488 	err = unix_validate_addr(sunaddr, addr_len);
1489 	if (err)
1490 		goto out;
1491 
1492 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1493 	if (err)
1494 		goto out;
1495 
1496 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1497 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1498 		err = unix_autobind(sk);
1499 		if (err)
1500 			goto out;
1501 	}
1502 
1503 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1504 
1505 	/* First of all allocate resources.
1506 	   If we do this after the state is locked,
1507 	   we will have to recheck everything again in any case.
1508 	 */
1509 
1510 	/* create new sock for complete connection */
1511 	newsk = unix_create1(net, NULL, 0, sock->type);
1512 	if (IS_ERR(newsk)) {
1513 		err = PTR_ERR(newsk);
1514 		newsk = NULL;
1515 		goto out;
1516 	}
1517 
1518 	err = -ENOMEM;
1519 
1520 	/* Allocate skb for sending to listening sock */
1521 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1522 	if (skb == NULL)
1523 		goto out;
1524 
1525 restart:
1526 	/*  Find listening sock. */
1527 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1528 	if (IS_ERR(other)) {
1529 		err = PTR_ERR(other);
1530 		other = NULL;
1531 		goto out;
1532 	}
1533 
1534 	/* Latch state of peer */
1535 	unix_state_lock(other);
1536 
1537 	/* Apparently VFS overslept socket death. Retry. */
1538 	if (sock_flag(other, SOCK_DEAD)) {
1539 		unix_state_unlock(other);
1540 		sock_put(other);
1541 		goto restart;
1542 	}
1543 
1544 	err = -ECONNREFUSED;
1545 	if (other->sk_state != TCP_LISTEN)
1546 		goto out_unlock;
1547 	if (other->sk_shutdown & RCV_SHUTDOWN)
1548 		goto out_unlock;
1549 
1550 	if (unix_recvq_full(other)) {
1551 		err = -EAGAIN;
1552 		if (!timeo)
1553 			goto out_unlock;
1554 
1555 		timeo = unix_wait_for_peer(other, timeo);
1556 
1557 		err = sock_intr_errno(timeo);
1558 		if (signal_pending(current))
1559 			goto out;
1560 		sock_put(other);
1561 		goto restart;
1562 	}
1563 
1564 	/* Latch our state.
1565 
1566 	   It is a tricky place. We need to grab our state lock and cannot
1567 	   drop the lock on the peer. It is dangerous because a deadlock is
1568 	   possible. The connect-to-self case and simultaneous
1569 	   attempts to connect are eliminated by checking the socket
1570 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN, we
1571 	   check this before attempting to grab the lock.
1572 
1573 	   Well, and we have to recheck the state after the socket is locked.
1574 	 */
1575 	st = sk->sk_state;
1576 
1577 	switch (st) {
1578 	case TCP_CLOSE:
1579 		/* This is ok... continue with connect */
1580 		break;
1581 	case TCP_ESTABLISHED:
1582 		/* Socket is already connected */
1583 		err = -EISCONN;
1584 		goto out_unlock;
1585 	default:
1586 		err = -EINVAL;
1587 		goto out_unlock;
1588 	}
1589 
1590 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1591 
1592 	if (sk->sk_state != st) {
1593 		unix_state_unlock(sk);
1594 		unix_state_unlock(other);
1595 		sock_put(other);
1596 		goto restart;
1597 	}
1598 
1599 	err = security_unix_stream_connect(sk, other, newsk);
1600 	if (err) {
1601 		unix_state_unlock(sk);
1602 		goto out_unlock;
1603 	}
1604 
1605 	/* The way is open! Quickly set all the necessary fields... */
1606 
1607 	sock_hold(sk);
1608 	unix_peer(newsk)	= sk;
1609 	newsk->sk_state		= TCP_ESTABLISHED;
1610 	newsk->sk_type		= sk->sk_type;
1611 	init_peercred(newsk);
1612 	newu = unix_sk(newsk);
1613 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1614 	otheru = unix_sk(other);
1615 
1616 	/* copy address information from listening to new sock
1617 	 *
1618 	 * The contents of *(otheru->addr) and otheru->path
1619 	 * are seen fully set up here, since we have found
1620 	 * otheru in hash under its lock.  Insertion into the
1621 	 * hash chain we'd found it in had been done in an
1622 	 * earlier critical area protected by the chain's lock,
1623 	 * the same one where we'd set *(otheru->addr) contents,
1624 	 * as well as otheru->path and otheru->addr itself.
1625 	 *
1626 	 * Using smp_store_release() here to set newu->addr
1627 	 * is enough to make those stores, as well as stores
1628 	 * to newu->path visible to anyone who gets newu->addr
1629 	 * by smp_load_acquire().  IOW, the same guarantees
1630 	 * as for unix_sock instances bound in unix_bind() or
1631 	 * in unix_autobind().
1632 	 */
1633 	if (otheru->path.dentry) {
1634 		path_get(&otheru->path);
1635 		newu->path = otheru->path;
1636 	}
1637 	refcount_inc(&otheru->addr->refcnt);
1638 	smp_store_release(&newu->addr, otheru->addr);
1639 
1640 	/* Set credentials */
1641 	copy_peercred(sk, other);
1642 
1643 	sock->state	= SS_CONNECTED;
1644 	sk->sk_state	= TCP_ESTABLISHED;
1645 	sock_hold(newsk);
1646 
1647 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1648 	unix_peer(sk)	= newsk;
1649 
1650 	unix_state_unlock(sk);
1651 
1652 	/* take ten and send info to listening sock */
1653 	spin_lock(&other->sk_receive_queue.lock);
1654 	__skb_queue_tail(&other->sk_receive_queue, skb);
1655 	spin_unlock(&other->sk_receive_queue.lock);
1656 	unix_state_unlock(other);
1657 	other->sk_data_ready(other);
1658 	sock_put(other);
1659 	return 0;
1660 
1661 out_unlock:
1662 	if (other)
1663 		unix_state_unlock(other);
1664 
1665 out:
1666 	kfree_skb(skb);
1667 	if (newsk)
1668 		unix_release_sock(newsk, 0);
1669 	if (other)
1670 		sock_put(other);
1671 	return err;
1672 }
1673 
1674 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1675 {
1676 	struct sock *ska = socka->sk, *skb = sockb->sk;
1677 
1678 	/* Join our sockets back to back */
1679 	sock_hold(ska);
1680 	sock_hold(skb);
1681 	unix_peer(ska) = skb;
1682 	unix_peer(skb) = ska;
1683 	init_peercred(ska);
1684 	init_peercred(skb);
1685 
1686 	ska->sk_state = TCP_ESTABLISHED;
1687 	skb->sk_state = TCP_ESTABLISHED;
1688 	socka->state  = SS_CONNECTED;
1689 	sockb->state  = SS_CONNECTED;
1690 	return 0;
1691 }
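/* The matching userspace call (sketch): socketpair() yields a connected pair
 * without any address, with both ends already established and carrying the
 * creating task's credentials via init_peercred() above:
 *
 *	int sv[2];
 *
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0)
 *		write(sv[0], "ping", 4);	// readable on sv[1]
 */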
1692 
1693 static void unix_sock_inherit_flags(const struct socket *old,
1694 				    struct socket *new)
1695 {
1696 	if (test_bit(SOCK_PASSCRED, &old->flags))
1697 		set_bit(SOCK_PASSCRED, &new->flags);
1698 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1699 		set_bit(SOCK_PASSPIDFD, &new->flags);
1700 	if (test_bit(SOCK_PASSSEC, &old->flags))
1701 		set_bit(SOCK_PASSSEC, &new->flags);
1702 }
1703 
1704 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1705 		       bool kern)
1706 {
1707 	struct sock *sk = sock->sk;
1708 	struct sock *tsk;
1709 	struct sk_buff *skb;
1710 	int err;
1711 
1712 	err = -EOPNOTSUPP;
1713 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1714 		goto out;
1715 
1716 	err = -EINVAL;
1717 	if (sk->sk_state != TCP_LISTEN)
1718 		goto out;
1719 
1720 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1721 	 * so that no locks are necessary.
1722 	 */
1723 
1724 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1725 				&err);
1726 	if (!skb) {
1727 		/* This means receive shutdown. */
1728 		if (err == 0)
1729 			err = -EINVAL;
1730 		goto out;
1731 	}
1732 
1733 	tsk = skb->sk;
1734 	skb_free_datagram(sk, skb);
1735 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1736 
1737 	/* attach accepted sock to socket */
1738 	unix_state_lock(tsk);
1739 	newsock->state = SS_CONNECTED;
1740 	unix_sock_inherit_flags(sock, newsock);
1741 	sock_graft(tsk, newsock);
1742 	unix_state_unlock(tsk);
1743 	return 0;
1744 
1745 out:
1746 	return err;
1747 }
1748 
1749 
1750 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1751 {
1752 	struct sock *sk = sock->sk;
1753 	struct unix_address *addr;
1754 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1755 	int err = 0;
1756 
1757 	if (peer) {
1758 		sk = unix_peer_get(sk);
1759 
1760 		err = -ENOTCONN;
1761 		if (!sk)
1762 			goto out;
1763 		err = 0;
1764 	} else {
1765 		sock_hold(sk);
1766 	}
1767 
1768 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1769 	if (!addr) {
1770 		sunaddr->sun_family = AF_UNIX;
1771 		sunaddr->sun_path[0] = 0;
1772 		err = offsetof(struct sockaddr_un, sun_path);
1773 	} else {
1774 		err = addr->len;
1775 		memcpy(sunaddr, addr->name, addr->len);
1776 
1777 		if (peer)
1778 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1779 					       CGROUP_UNIX_GETPEERNAME);
1780 		else
1781 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1782 					       CGROUP_UNIX_GETSOCKNAME);
1783 	}
1784 	sock_put(sk);
1785 out:
1786 	return err;
1787 }
1788 
1789 /* The "user->unix_inflight" variable is protected by the garbage
1790  * collection lock, and we just read it locklessly here. If you go
1791  * over the limit, there might be a tiny race in actually noticing
1792  * it across threads. Tough.
1793  */
1794 static inline bool too_many_unix_fds(struct task_struct *p)
1795 {
1796 	struct user_struct *user = current_user();
1797 
1798 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1799 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1800 	return false;
1801 }
1802 
1803 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1804 {
1805 	int i;
1806 
1807 	if (too_many_unix_fds(current))
1808 		return -ETOOMANYREFS;
1809 
1810 	/* Need to duplicate file references for the sake of garbage
1811 	 * collection.  Otherwise a socket in the fps might become a
1812 	 * candidate for GC while the skb is not yet queued.
1813 	 */
1814 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1815 	if (!UNIXCB(skb).fp)
1816 		return -ENOMEM;
1817 
1818 	for (i = scm->fp->count - 1; i >= 0; i--)
1819 		unix_inflight(scm->fp->user, scm->fp->fp[i]);
1820 
1821 	return 0;
1822 }
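/* How descriptors reach unix_attach_fds() from userspace (sketch; sock_fd is
 * a hypothetical connected AF_UNIX socket, fd_to_pass is any open descriptor,
 * and the cmsg(3) helpers from <sys/socket.h> are assumed): an SCM_RIGHTS
 * control message carries the fds that the code above then pins for the
 * garbage collector:
 *
 *	union { char buf[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
 *	char dummy = 'x';
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = u.buf,
 *			      .msg_controllen = sizeof(u.buf) };
 *	struct cmsghdr *cmsg;
 *
 *	memset(&u, 0, sizeof(u));
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *	sendmsg(sock_fd, &msg, 0);
 */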
1823 
1824 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1825 {
1826 	int i;
1827 
1828 	scm->fp = UNIXCB(skb).fp;
1829 	UNIXCB(skb).fp = NULL;
1830 
1831 	for (i = scm->fp->count - 1; i >= 0; i--)
1832 		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
1833 }
1834 
1835 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1836 {
1837 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1838 
1839 	/*
1840 	 * Garbage collection of unix sockets starts by selecting a set of
1841 	 * candidate sockets which have reference only from being in flight
1842 	 * (total_refs == inflight_refs).  This condition is checked once during
1843 	 * the candidate collection phase, and candidates are marked as such, so
1844 	 * that non-candidates can later be ignored.  While inflight_refs is
1845 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1846 	 * is an instantaneous decision.
1847 	 *
1848 	 * Once a candidate, however, the socket must not be reinstalled into a
1849 	 * file descriptor while the garbage collection is in progress.
1850 	 *
1851 	 * If the above conditions are met, then the directed graph of
1852 	 * candidates (*) does not change while unix_gc_lock is held.
1853 	 *
1854 	 * Any operation that changes the file count through file descriptors
1855 	 * (dup, close, sendmsg) does not change the graph since candidates are
1856 	 * not installed in fds.
1857 	 *
1858 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1859 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1860 	 * serialized with garbage collection.
1861 	 *
1862 	 * MSG_PEEK is special in that it does not change the inflight count,
1863 	 * yet does install the socket into an fd.  The following lock/unlock
1864 	 * pair is to ensure serialization with garbage collection.  It must be
1865 	 * done between incrementing the file count and installing the file into
1866 	 * an fd.
1867 	 *
1868 	 * If garbage collection starts after the barrier provided by the
1869 	 * lock/unlock, then it will see the elevated refcount and not mark this
1870 	 * as a candidate.  If a garbage collection is already in progress
1871 	 * before the file count was incremented, then the lock/unlock pair will
1872 	 * ensure that garbage collection is finished before progressing to
1873 	 * installing the fd.
1874 	 *
1875 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1876 	 * which is on the queue of listening socket A.
1877 	 */
1878 	spin_lock(&unix_gc_lock);
1879 	spin_unlock(&unix_gc_lock);
1880 }
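
/* Illustrative sketch (not kernel code): the MSG_PEEK case handled above, as
 * seen from user space.  A peeked recvmsg() of a message carrying SCM_RIGHTS
 * still installs duplicated descriptors into the caller even though the skb
 * stays queued, e.g. (reusing the cmsg layout from the send_fd() sketch
 * above):
 *
 *	nr = recvmsg(sock, &msg, MSG_PEEK);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	if (nr >= 0 && cmsg && cmsg->cmsg_level == SOL_SOCKET &&
 *	    cmsg->cmsg_type == SCM_RIGHTS)
 *		memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
 *
 * That fd installation without an inflight-count change is precisely what
 * the lock/unlock pair above serializes against the garbage collector.
 */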
1881 
1882 static void unix_destruct_scm(struct sk_buff *skb)
1883 {
1884 	struct scm_cookie scm;
1885 
1886 	memset(&scm, 0, sizeof(scm));
1887 	scm.pid  = UNIXCB(skb).pid;
1888 	if (UNIXCB(skb).fp)
1889 		unix_detach_fds(&scm, skb);
1890 
1891 	/* Alas, it calls VFS */
1892 	/* So fscking what? fput() has been SMP-safe since last summer */
1893 	scm_destroy(&scm);
1894 	sock_wfree(skb);
1895 }
1896 
1897 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1898 {
1899 	int err = 0;
1900 
1901 	UNIXCB(skb).pid  = get_pid(scm->pid);
1902 	UNIXCB(skb).uid = scm->creds.uid;
1903 	UNIXCB(skb).gid = scm->creds.gid;
1904 	UNIXCB(skb).fp = NULL;
1905 	unix_get_secdata(scm, skb);
1906 	if (scm->fp && send_fds)
1907 		err = unix_attach_fds(scm, skb);
1908 
1909 	skb->destructor = unix_destruct_scm;
1910 	return err;
1911 }
1912 
1913 static bool unix_passcred_enabled(const struct socket *sock,
1914 				  const struct sock *other)
1915 {
1916 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1917 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1918 	       !other->sk_socket ||
1919 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1920 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1921 }
1922 
1923 /*
1924  * Some apps rely on write() giving SCM_CREDENTIALS.
1925  * We include credentials if the source or destination socket
1926  * asserted SOCK_PASSCRED.
1927  */
1928 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1929 			    const struct sock *other)
1930 {
1931 	if (UNIXCB(skb).pid)
1932 		return;
1933 	if (unix_passcred_enabled(sock, other)) {
1934 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1935 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1936 	}
1937 }
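
/* Illustrative sketch (not kernel code): how a receiver typically asks for
 * the credentials that maybe_add_creds() attaches.  Variable names are made
 * up for the example, and msg must provide a control buffer of at least
 * CMSG_SPACE(sizeof(struct ucred)) bytes.
 *
 *	int one = 1;
 *	struct ucred *cred = NULL;
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	recvmsg(sock, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_CREDENTIALS)
 *			cred = (struct ucred *)CMSG_DATA(cmsg);
 *
 * cred->pid, cred->uid and cred->gid then correspond to the pid/uid/gid
 * stored in UNIXCB() above.
 */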
1938 
1939 static bool unix_skb_scm_eq(struct sk_buff *skb,
1940 			    struct scm_cookie *scm)
1941 {
1942 	return UNIXCB(skb).pid == scm->pid &&
1943 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1944 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1945 	       unix_secdata_eq(scm, skb);
1946 }
1947 
1948 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1949 {
1950 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1951 	struct unix_sock *u = unix_sk(sk);
1952 
1953 	if (unlikely(fp && fp->count))
1954 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1955 }
1956 
1957 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1958 {
1959 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1960 	struct unix_sock *u = unix_sk(sk);
1961 
1962 	if (unlikely(fp && fp->count))
1963 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1964 }
1965 
1966 /*
1967  *	Send AF_UNIX data.
1968  */
1969 
1970 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1971 			      size_t len)
1972 {
1973 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1974 	struct sock *sk = sock->sk, *other = NULL;
1975 	struct unix_sock *u = unix_sk(sk);
1976 	struct scm_cookie scm;
1977 	struct sk_buff *skb;
1978 	int data_len = 0;
1979 	int sk_locked;
1980 	long timeo;
1981 	int err;
1982 
1983 	err = scm_send(sock, msg, &scm, false);
1984 	if (err < 0)
1985 		return err;
1986 
1987 	wait_for_unix_gc(scm.fp);
1988 
1989 	err = -EOPNOTSUPP;
1990 	if (msg->msg_flags&MSG_OOB)
1991 		goto out;
1992 
1993 	if (msg->msg_namelen) {
1994 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1995 		if (err)
1996 			goto out;
1997 
1998 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1999 							    msg->msg_name,
2000 							    &msg->msg_namelen,
2001 							    NULL);
2002 		if (err)
2003 			goto out;
2004 	} else {
2005 		sunaddr = NULL;
2006 		err = -ENOTCONN;
2007 		other = unix_peer_get(sk);
2008 		if (!other)
2009 			goto out;
2010 	}
2011 
2012 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2013 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
2014 		err = unix_autobind(sk);
2015 		if (err)
2016 			goto out;
2017 	}
2018 
2019 	err = -EMSGSIZE;
2020 	if (len > sk->sk_sndbuf - 32)
2021 		goto out;
2022 
2023 	if (len > SKB_MAX_ALLOC) {
2024 		data_len = min_t(size_t,
2025 				 len - SKB_MAX_ALLOC,
2026 				 MAX_SKB_FRAGS * PAGE_SIZE);
2027 		data_len = PAGE_ALIGN(data_len);
2028 
2029 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2030 	}
2031 
2032 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2033 				   msg->msg_flags & MSG_DONTWAIT, &err,
2034 				   PAGE_ALLOC_COSTLY_ORDER);
2035 	if (skb == NULL)
2036 		goto out;
2037 
2038 	err = unix_scm_to_skb(&scm, skb, true);
2039 	if (err < 0)
2040 		goto out_free;
2041 
2042 	skb_put(skb, len - data_len);
2043 	skb->data_len = data_len;
2044 	skb->len = len;
2045 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2046 	if (err)
2047 		goto out_free;
2048 
2049 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2050 
2051 restart:
2052 	if (!other) {
2053 		err = -ECONNRESET;
2054 		if (sunaddr == NULL)
2055 			goto out_free;
2056 
2057 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2058 					sk->sk_type);
2059 		if (IS_ERR(other)) {
2060 			err = PTR_ERR(other);
2061 			other = NULL;
2062 			goto out_free;
2063 		}
2064 	}
2065 
2066 	if (sk_filter(other, skb) < 0) {
2067 		/* Toss the packet but do not return any error to the sender */
2068 		err = len;
2069 		goto out_free;
2070 	}
2071 
2072 	sk_locked = 0;
2073 	unix_state_lock(other);
2074 restart_locked:
2075 	err = -EPERM;
2076 	if (!unix_may_send(sk, other))
2077 		goto out_unlock;
2078 
2079 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2080 		/*
2081 		 *	Check with 1003.1g - what should
2082 		 *	the datagram error be here?
2083 		 */
2084 		unix_state_unlock(other);
2085 		sock_put(other);
2086 
2087 		if (!sk_locked)
2088 			unix_state_lock(sk);
2089 
2090 		err = 0;
2091 		if (sk->sk_type == SOCK_SEQPACKET) {
2092 			/* We are here only when racing with unix_release_sock(),
2093 			 * which is clearing @other. Never change state to TCP_CLOSE,
2094 			 * unlike what SOCK_DGRAM wants.
2095 			 */
2096 			unix_state_unlock(sk);
2097 			err = -EPIPE;
2098 		} else if (unix_peer(sk) == other) {
2099 			unix_peer(sk) = NULL;
2100 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2101 
2102 			sk->sk_state = TCP_CLOSE;
2103 			unix_state_unlock(sk);
2104 
2105 			unix_dgram_disconnected(sk, other);
2106 			sock_put(other);
2107 			err = -ECONNREFUSED;
2108 		} else {
2109 			unix_state_unlock(sk);
2110 		}
2111 
2112 		other = NULL;
2113 		if (err)
2114 			goto out_free;
2115 		goto restart;
2116 	}
2117 
2118 	err = -EPIPE;
2119 	if (other->sk_shutdown & RCV_SHUTDOWN)
2120 		goto out_unlock;
2121 
2122 	if (sk->sk_type != SOCK_SEQPACKET) {
2123 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2124 		if (err)
2125 			goto out_unlock;
2126 	}
2127 
2128 	/* other == sk && unix_peer(other) != sk if
2129 	 * - unix_peer(sk) == NULL, destination address bound to sk
2130 	 * - unix_peer(sk) == sk at the time of the get, but disconnected before the lock
2131 	 */
2132 	if (other != sk &&
2133 	    unlikely(unix_peer(other) != sk &&
2134 	    unix_recvq_full_lockless(other))) {
2135 		if (timeo) {
2136 			timeo = unix_wait_for_peer(other, timeo);
2137 
2138 			err = sock_intr_errno(timeo);
2139 			if (signal_pending(current))
2140 				goto out_free;
2141 
2142 			goto restart;
2143 		}
2144 
2145 		if (!sk_locked) {
2146 			unix_state_unlock(other);
2147 			unix_state_double_lock(sk, other);
2148 		}
2149 
2150 		if (unix_peer(sk) != other ||
2151 		    unix_dgram_peer_wake_me(sk, other)) {
2152 			err = -EAGAIN;
2153 			sk_locked = 1;
2154 			goto out_unlock;
2155 		}
2156 
2157 		if (!sk_locked) {
2158 			sk_locked = 1;
2159 			goto restart_locked;
2160 		}
2161 	}
2162 
2163 	if (unlikely(sk_locked))
2164 		unix_state_unlock(sk);
2165 
2166 	if (sock_flag(other, SOCK_RCVTSTAMP))
2167 		__net_timestamp(skb);
2168 	maybe_add_creds(skb, sock, other);
2169 	scm_stat_add(other, skb);
2170 	skb_queue_tail(&other->sk_receive_queue, skb);
2171 	unix_state_unlock(other);
2172 	other->sk_data_ready(other);
2173 	sock_put(other);
2174 	scm_destroy(&scm);
2175 	return len;
2176 
2177 out_unlock:
2178 	if (sk_locked)
2179 		unix_state_unlock(sk);
2180 	unix_state_unlock(other);
2181 out_free:
2182 	kfree_skb(skb);
2183 out:
2184 	if (other)
2185 		sock_put(other);
2186 	scm_destroy(&scm);
2187 	return err;
2188 }
2189 
2190 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2191  * bytes, with a minimum of a full page.
2192  */
2193 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
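
/* Worked example, assuming 4 KiB pages: get_order(32768) == 3, so
 * UNIX_SKB_FRAGS_SZ == 4096 << 3 == 32768 bytes.  With 64 KiB pages,
 * get_order(32768) == 0 and the limit becomes one full 64 KiB page, which is
 * the "minimum of a full page" noted above.
 */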
2194 
2195 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2196 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2197 		     struct scm_cookie *scm, bool fds_sent)
2198 {
2199 	struct unix_sock *ousk = unix_sk(other);
2200 	struct sk_buff *skb;
2201 	int err = 0;
2202 
2203 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2204 
2205 	if (!skb)
2206 		return err;
2207 
2208 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2209 	if (err < 0) {
2210 		kfree_skb(skb);
2211 		return err;
2212 	}
2213 	skb_put(skb, 1);
2214 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2215 
2216 	if (err) {
2217 		kfree_skb(skb);
2218 		return err;
2219 	}
2220 
2221 	unix_state_lock(other);
2222 
2223 	if (sock_flag(other, SOCK_DEAD) ||
2224 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2225 		unix_state_unlock(other);
2226 		kfree_skb(skb);
2227 		return -EPIPE;
2228 	}
2229 
2230 	maybe_add_creds(skb, sock, other);
2231 	skb_get(skb);
2232 
2233 	if (ousk->oob_skb)
2234 		consume_skb(ousk->oob_skb);
2235 
2236 	WRITE_ONCE(ousk->oob_skb, skb);
2237 
2238 	scm_stat_add(other, skb);
2239 	skb_queue_tail(&other->sk_receive_queue, skb);
2240 	sk_send_sigurg(other);
2241 	unix_state_unlock(other);
2242 	other->sk_data_ready(other);
2243 
2244 	return err;
2245 }
2246 #endif
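
/* Illustrative sketch (not kernel code): the out-of-band byte as exercised
 * from user space when CONFIG_AF_UNIX_OOB is enabled.  A newer urgent byte
 * replaces the previous one as ousk->oob_skb above.
 *
 *	send(peer_a, "hello", 5, 0);
 *	send(peer_a, "!", 1, MSG_OOB);		// urgent byte, may raise SIGURG
 *
 *	recv(peer_b, buf, sizeof(buf), 0);	// normal data, stops at the mark
 *	recv(peer_b, &oob, 1, MSG_OOB);		// fetch the urgent byte itself
 */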
2247 
2248 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2249 			       size_t len)
2250 {
2251 	struct sock *sk = sock->sk;
2252 	struct sock *other = NULL;
2253 	int err, size;
2254 	struct sk_buff *skb;
2255 	int sent = 0;
2256 	struct scm_cookie scm;
2257 	bool fds_sent = false;
2258 	int data_len;
2259 
2260 	err = scm_send(sock, msg, &scm, false);
2261 	if (err < 0)
2262 		return err;
2263 
2264 	wait_for_unix_gc(scm.fp);
2265 
2266 	err = -EOPNOTSUPP;
2267 	if (msg->msg_flags & MSG_OOB) {
2268 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2269 		if (len)
2270 			len--;
2271 		else
2272 #endif
2273 			goto out_err;
2274 	}
2275 
2276 	if (msg->msg_namelen) {
2277 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2278 		goto out_err;
2279 	} else {
2280 		err = -ENOTCONN;
2281 		other = unix_peer(sk);
2282 		if (!other)
2283 			goto out_err;
2284 	}
2285 
2286 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2287 		goto pipe_err;
2288 
2289 	while (sent < len) {
2290 		size = len - sent;
2291 
2292 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2293 			skb = sock_alloc_send_pskb(sk, 0, 0,
2294 						   msg->msg_flags & MSG_DONTWAIT,
2295 						   &err, 0);
2296 		} else {
2297 			/* Keep two messages in the pipe so it schedules better */
2298 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2299 
2300 			/* allow fallback to order-0 allocations */
2301 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2302 
2303 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2304 
2305 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2306 
2307 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2308 						   msg->msg_flags & MSG_DONTWAIT, &err,
2309 						   get_order(UNIX_SKB_FRAGS_SZ));
2310 		}
2311 		if (!skb)
2312 			goto out_err;
2313 
2314 		/* Only send the fds in the first buffer */
2315 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2316 		if (err < 0) {
2317 			kfree_skb(skb);
2318 			goto out_err;
2319 		}
2320 		fds_sent = true;
2321 
2322 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2323 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2324 						   sk->sk_allocation);
2325 			if (err < 0) {
2326 				kfree_skb(skb);
2327 				goto out_err;
2328 			}
2329 			size = err;
2330 			refcount_add(size, &sk->sk_wmem_alloc);
2331 		} else {
2332 			skb_put(skb, size - data_len);
2333 			skb->data_len = data_len;
2334 			skb->len = size;
2335 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2336 			if (err) {
2337 				kfree_skb(skb);
2338 				goto out_err;
2339 			}
2340 		}
2341 
2342 		unix_state_lock(other);
2343 
2344 		if (sock_flag(other, SOCK_DEAD) ||
2345 		    (other->sk_shutdown & RCV_SHUTDOWN))
2346 			goto pipe_err_free;
2347 
2348 		maybe_add_creds(skb, sock, other);
2349 		scm_stat_add(other, skb);
2350 		skb_queue_tail(&other->sk_receive_queue, skb);
2351 		unix_state_unlock(other);
2352 		other->sk_data_ready(other);
2353 		sent += size;
2354 	}
2355 
2356 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2357 	if (msg->msg_flags & MSG_OOB) {
2358 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2359 		if (err)
2360 			goto out_err;
2361 		sent++;
2362 	}
2363 #endif
2364 
2365 	scm_destroy(&scm);
2366 
2367 	return sent;
2368 
2369 pipe_err_free:
2370 	unix_state_unlock(other);
2371 	kfree_skb(skb);
2372 pipe_err:
2373 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2374 		send_sig(SIGPIPE, current, 0);
2375 	err = -EPIPE;
2376 out_err:
2377 	scm_destroy(&scm);
2378 	return sent ? : err;
2379 }
2380 
2381 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2382 				  size_t len)
2383 {
2384 	int err;
2385 	struct sock *sk = sock->sk;
2386 
2387 	err = sock_error(sk);
2388 	if (err)
2389 		return err;
2390 
2391 	if (sk->sk_state != TCP_ESTABLISHED)
2392 		return -ENOTCONN;
2393 
2394 	if (msg->msg_namelen)
2395 		msg->msg_namelen = 0;
2396 
2397 	return unix_dgram_sendmsg(sock, msg, len);
2398 }
2399 
2400 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2401 				  size_t size, int flags)
2402 {
2403 	struct sock *sk = sock->sk;
2404 
2405 	if (sk->sk_state != TCP_ESTABLISHED)
2406 		return -ENOTCONN;
2407 
2408 	return unix_dgram_recvmsg(sock, msg, size, flags);
2409 }
2410 
2411 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2412 {
2413 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2414 
2415 	if (addr) {
2416 		msg->msg_namelen = addr->len;
2417 		memcpy(msg->msg_name, addr->name, addr->len);
2418 	}
2419 }
2420 
2421 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2422 			 int flags)
2423 {
2424 	struct scm_cookie scm;
2425 	struct socket *sock = sk->sk_socket;
2426 	struct unix_sock *u = unix_sk(sk);
2427 	struct sk_buff *skb, *last;
2428 	long timeo;
2429 	int skip;
2430 	int err;
2431 
2432 	err = -EOPNOTSUPP;
2433 	if (flags&MSG_OOB)
2434 		goto out;
2435 
2436 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2437 
2438 	do {
2439 		mutex_lock(&u->iolock);
2440 
2441 		skip = sk_peek_offset(sk, flags);
2442 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2443 					      &skip, &err, &last);
2444 		if (skb) {
2445 			if (!(flags & MSG_PEEK))
2446 				scm_stat_del(sk, skb);
2447 			break;
2448 		}
2449 
2450 		mutex_unlock(&u->iolock);
2451 
2452 		if (err != -EAGAIN)
2453 			break;
2454 	} while (timeo &&
2455 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2456 					      &err, &timeo, last));
2457 
2458 	if (!skb) { /* implies iolock unlocked */
2459 		unix_state_lock(sk);
2460 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2461 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2462 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2463 			err = 0;
2464 		unix_state_unlock(sk);
2465 		goto out;
2466 	}
2467 
2468 	if (wq_has_sleeper(&u->peer_wait))
2469 		wake_up_interruptible_sync_poll(&u->peer_wait,
2470 						EPOLLOUT | EPOLLWRNORM |
2471 						EPOLLWRBAND);
2472 
2473 	if (msg->msg_name) {
2474 		unix_copy_addr(msg, skb->sk);
2475 
2476 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2477 						      msg->msg_name,
2478 						      &msg->msg_namelen);
2479 	}
2480 
2481 	if (size > skb->len - skip)
2482 		size = skb->len - skip;
2483 	else if (size < skb->len - skip)
2484 		msg->msg_flags |= MSG_TRUNC;
2485 
2486 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2487 	if (err)
2488 		goto out_free;
2489 
2490 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2491 		__sock_recv_timestamp(msg, sk, skb);
2492 
2493 	memset(&scm, 0, sizeof(scm));
2494 
2495 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2496 	unix_set_secdata(&scm, skb);
2497 
2498 	if (!(flags & MSG_PEEK)) {
2499 		if (UNIXCB(skb).fp)
2500 			unix_detach_fds(&scm, skb);
2501 
2502 		sk_peek_offset_bwd(sk, skb->len);
2503 	} else {
2504 		/* It is questionable: on PEEK we could:
2505 		   - not return fds - good, but too simple 8)
2506 		   - return fds, and not return them on read (old strategy,
2507 		     apparently wrong)
2508 		   - clone fds (I chose it for now, it is the most universal
2509 		     solution)
2510 
2511 		   POSIX 1003.1g does not actually define this clearly
2512 		   at all. POSIX 1003.1g doesn't define a lot of things
2513 		   clearly however!
2514 
2515 		*/
2516 
2517 		sk_peek_offset_fwd(sk, size);
2518 
2519 		if (UNIXCB(skb).fp)
2520 			unix_peek_fds(&scm, skb);
2521 	}
2522 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2523 
2524 	scm_recv_unix(sock, msg, &scm, flags);
2525 
2526 out_free:
2527 	skb_free_datagram(sk, skb);
2528 	mutex_unlock(&u->iolock);
2529 out:
2530 	return err;
2531 }
2532 
2533 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2534 			      int flags)
2535 {
2536 	struct sock *sk = sock->sk;
2537 
2538 #ifdef CONFIG_BPF_SYSCALL
2539 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2540 
2541 	if (prot != &unix_dgram_proto)
2542 		return prot->recvmsg(sk, msg, size, flags, NULL);
2543 #endif
2544 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2545 }
2546 
2547 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2548 {
2549 	struct unix_sock *u = unix_sk(sk);
2550 	struct sk_buff *skb;
2551 	int err;
2552 
2553 	mutex_lock(&u->iolock);
2554 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2555 	mutex_unlock(&u->iolock);
2556 	if (!skb)
2557 		return err;
2558 
2559 	return recv_actor(sk, skb);
2560 }
2561 
2562 /*
2563  *	Sleep until more data has arrived. But check for races..
2564  */
2565 static long unix_stream_data_wait(struct sock *sk, long timeo,
2566 				  struct sk_buff *last, unsigned int last_len,
2567 				  bool freezable)
2568 {
2569 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2570 	struct sk_buff *tail;
2571 	DEFINE_WAIT(wait);
2572 
2573 	unix_state_lock(sk);
2574 
2575 	for (;;) {
2576 		prepare_to_wait(sk_sleep(sk), &wait, state);
2577 
2578 		tail = skb_peek_tail(&sk->sk_receive_queue);
2579 		if (tail != last ||
2580 		    (tail && tail->len != last_len) ||
2581 		    sk->sk_err ||
2582 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2583 		    signal_pending(current) ||
2584 		    !timeo)
2585 			break;
2586 
2587 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2588 		unix_state_unlock(sk);
2589 		timeo = schedule_timeout(timeo);
2590 		unix_state_lock(sk);
2591 
2592 		if (sock_flag(sk, SOCK_DEAD))
2593 			break;
2594 
2595 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2596 	}
2597 
2598 	finish_wait(sk_sleep(sk), &wait);
2599 	unix_state_unlock(sk);
2600 	return timeo;
2601 }
2602 
2603 static unsigned int unix_skb_len(const struct sk_buff *skb)
2604 {
2605 	return skb->len - UNIXCB(skb).consumed;
2606 }
2607 
2608 struct unix_stream_read_state {
2609 	int (*recv_actor)(struct sk_buff *, int, int,
2610 			  struct unix_stream_read_state *);
2611 	struct socket *socket;
2612 	struct msghdr *msg;
2613 	struct pipe_inode_info *pipe;
2614 	size_t size;
2615 	int flags;
2616 	unsigned int splice_flags;
2617 };
2618 
2619 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2620 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2621 {
2622 	struct socket *sock = state->socket;
2623 	struct sock *sk = sock->sk;
2624 	struct unix_sock *u = unix_sk(sk);
2625 	int chunk = 1;
2626 	struct sk_buff *oob_skb;
2627 
2628 	mutex_lock(&u->iolock);
2629 	unix_state_lock(sk);
2630 
2631 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2632 		unix_state_unlock(sk);
2633 		mutex_unlock(&u->iolock);
2634 		return -EINVAL;
2635 	}
2636 
2637 	oob_skb = u->oob_skb;
2638 
2639 	if (!(state->flags & MSG_PEEK))
2640 		WRITE_ONCE(u->oob_skb, NULL);
2641 	else
2642 		skb_get(oob_skb);
2643 	unix_state_unlock(sk);
2644 
2645 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2646 
2647 	if (!(state->flags & MSG_PEEK))
2648 		UNIXCB(oob_skb).consumed += 1;
2649 
2650 	consume_skb(oob_skb);
2651 
2652 	mutex_unlock(&u->iolock);
2653 
2654 	if (chunk < 0)
2655 		return -EFAULT;
2656 
2657 	state->msg->msg_flags |= MSG_OOB;
2658 	return 1;
2659 }
2660 
2661 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2662 				  int flags, int copied)
2663 {
2664 	struct unix_sock *u = unix_sk(sk);
2665 
2666 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2667 		skb_unlink(skb, &sk->sk_receive_queue);
2668 		consume_skb(skb);
2669 		skb = NULL;
2670 	} else {
2671 		if (skb == u->oob_skb) {
2672 			if (copied) {
2673 				skb = NULL;
2674 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2675 				if (!(flags & MSG_PEEK)) {
2676 					WRITE_ONCE(u->oob_skb, NULL);
2677 					consume_skb(skb);
2678 				}
2679 			} else if (!(flags & MSG_PEEK)) {
2680 				skb_unlink(skb, &sk->sk_receive_queue);
2681 				consume_skb(skb);
2682 				skb = skb_peek(&sk->sk_receive_queue);
2683 			}
2684 		}
2685 	}
2686 	return skb;
2687 }
2688 #endif
2689 
2690 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2691 {
2692 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2693 		return -ENOTCONN;
2694 
2695 	return unix_read_skb(sk, recv_actor);
2696 }
2697 
2698 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2699 				    bool freezable)
2700 {
2701 	struct scm_cookie scm;
2702 	struct socket *sock = state->socket;
2703 	struct sock *sk = sock->sk;
2704 	struct unix_sock *u = unix_sk(sk);
2705 	int copied = 0;
2706 	int flags = state->flags;
2707 	int noblock = flags & MSG_DONTWAIT;
2708 	bool check_creds = false;
2709 	int target;
2710 	int err = 0;
2711 	long timeo;
2712 	int skip;
2713 	size_t size = state->size;
2714 	unsigned int last_len;
2715 
2716 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2717 		err = -EINVAL;
2718 		goto out;
2719 	}
2720 
2721 	if (unlikely(flags & MSG_OOB)) {
2722 		err = -EOPNOTSUPP;
2723 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2724 		err = unix_stream_recv_urg(state);
2725 #endif
2726 		goto out;
2727 	}
2728 
2729 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2730 	timeo = sock_rcvtimeo(sk, noblock);
2731 
2732 	memset(&scm, 0, sizeof(scm));
2733 
2734 	/* Lock the socket to prevent queue disordering
2735 	 * while we sleep in memcpy_tomsg
2736 	 */
2737 	mutex_lock(&u->iolock);
2738 
2739 	skip = max(sk_peek_offset(sk, flags), 0);
2740 
2741 	do {
2742 		int chunk;
2743 		bool drop_skb;
2744 		struct sk_buff *skb, *last;
2745 
2746 redo:
2747 		unix_state_lock(sk);
2748 		if (sock_flag(sk, SOCK_DEAD)) {
2749 			err = -ECONNRESET;
2750 			goto unlock;
2751 		}
2752 		last = skb = skb_peek(&sk->sk_receive_queue);
2753 		last_len = last ? last->len : 0;
2754 
2755 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2756 		if (skb) {
2757 			skb = manage_oob(skb, sk, flags, copied);
2758 			if (!skb) {
2759 				unix_state_unlock(sk);
2760 				if (copied)
2761 					break;
2762 				goto redo;
2763 			}
2764 		}
2765 #endif
2766 again:
2767 		if (skb == NULL) {
2768 			if (copied >= target)
2769 				goto unlock;
2770 
2771 			/*
2772 			 *	POSIX 1003.1g mandates this order.
2773 			 */
2774 
2775 			err = sock_error(sk);
2776 			if (err)
2777 				goto unlock;
2778 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2779 				goto unlock;
2780 
2781 			unix_state_unlock(sk);
2782 			if (!timeo) {
2783 				err = -EAGAIN;
2784 				break;
2785 			}
2786 
2787 			mutex_unlock(&u->iolock);
2788 
2789 			timeo = unix_stream_data_wait(sk, timeo, last,
2790 						      last_len, freezable);
2791 
2792 			if (signal_pending(current)) {
2793 				err = sock_intr_errno(timeo);
2794 				scm_destroy(&scm);
2795 				goto out;
2796 			}
2797 
2798 			mutex_lock(&u->iolock);
2799 			goto redo;
2800 unlock:
2801 			unix_state_unlock(sk);
2802 			break;
2803 		}
2804 
2805 		while (skip >= unix_skb_len(skb)) {
2806 			skip -= unix_skb_len(skb);
2807 			last = skb;
2808 			last_len = skb->len;
2809 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2810 			if (!skb)
2811 				goto again;
2812 		}
2813 
2814 		unix_state_unlock(sk);
2815 
2816 		if (check_creds) {
2817 			/* Never glue messages from different writers */
2818 			if (!unix_skb_scm_eq(skb, &scm))
2819 				break;
2820 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2821 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2822 			/* Copy credentials */
2823 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2824 			unix_set_secdata(&scm, skb);
2825 			check_creds = true;
2826 		}
2827 
2828 		/* Copy address just once */
2829 		if (state->msg && state->msg->msg_name) {
2830 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2831 					 state->msg->msg_name);
2832 			unix_copy_addr(state->msg, skb->sk);
2833 
2834 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2835 							      state->msg->msg_name,
2836 							      &state->msg->msg_namelen);
2837 
2838 			sunaddr = NULL;
2839 		}
2840 
2841 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2842 		skb_get(skb);
2843 		chunk = state->recv_actor(skb, skip, chunk, state);
2844 		drop_skb = !unix_skb_len(skb);
2845 		/* skb is only safe to use if !drop_skb */
2846 		consume_skb(skb);
2847 		if (chunk < 0) {
2848 			if (copied == 0)
2849 				copied = -EFAULT;
2850 			break;
2851 		}
2852 		copied += chunk;
2853 		size -= chunk;
2854 
2855 		if (drop_skb) {
2856 			/* The skb was touched by a concurrent reader;
2857 			 * we should not expect anything from this skb
2858 			 * anymore and must assume it is invalid - we can be
2859 			 * sure it was dropped from the socket queue
2860 			 *
2861 			 * let's report a short read
2862 			 */
2863 			err = 0;
2864 			break;
2865 		}
2866 
2867 		/* Mark read part of skb as used */
2868 		if (!(flags & MSG_PEEK)) {
2869 			UNIXCB(skb).consumed += chunk;
2870 
2871 			sk_peek_offset_bwd(sk, chunk);
2872 
2873 			if (UNIXCB(skb).fp) {
2874 				scm_stat_del(sk, skb);
2875 				unix_detach_fds(&scm, skb);
2876 			}
2877 
2878 			if (unix_skb_len(skb))
2879 				break;
2880 
2881 			skb_unlink(skb, &sk->sk_receive_queue);
2882 			consume_skb(skb);
2883 
2884 			if (scm.fp)
2885 				break;
2886 		} else {
2887 			/* It is questionable, see note in unix_dgram_recvmsg.
2888 			 */
2889 			if (UNIXCB(skb).fp)
2890 				unix_peek_fds(&scm, skb);
2891 
2892 			sk_peek_offset_fwd(sk, chunk);
2893 
2894 			if (UNIXCB(skb).fp)
2895 				break;
2896 
2897 			skip = 0;
2898 			last = skb;
2899 			last_len = skb->len;
2900 			unix_state_lock(sk);
2901 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2902 			if (skb)
2903 				goto again;
2904 			unix_state_unlock(sk);
2905 			break;
2906 		}
2907 	} while (size);
2908 
2909 	mutex_unlock(&u->iolock);
2910 	if (state->msg)
2911 		scm_recv_unix(sock, state->msg, &scm, flags);
2912 	else
2913 		scm_destroy(&scm);
2914 out:
2915 	return copied ? : err;
2916 }
2917 
2918 static int unix_stream_read_actor(struct sk_buff *skb,
2919 				  int skip, int chunk,
2920 				  struct unix_stream_read_state *state)
2921 {
2922 	int ret;
2923 
2924 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2925 				    state->msg, chunk);
2926 	return ret ?: chunk;
2927 }
2928 
2929 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2930 			  size_t size, int flags)
2931 {
2932 	struct unix_stream_read_state state = {
2933 		.recv_actor = unix_stream_read_actor,
2934 		.socket = sk->sk_socket,
2935 		.msg = msg,
2936 		.size = size,
2937 		.flags = flags
2938 	};
2939 
2940 	return unix_stream_read_generic(&state, true);
2941 }
2942 
2943 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2944 			       size_t size, int flags)
2945 {
2946 	struct unix_stream_read_state state = {
2947 		.recv_actor = unix_stream_read_actor,
2948 		.socket = sock,
2949 		.msg = msg,
2950 		.size = size,
2951 		.flags = flags
2952 	};
2953 
2954 #ifdef CONFIG_BPF_SYSCALL
2955 	struct sock *sk = sock->sk;
2956 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2957 
2958 	if (prot != &unix_stream_proto)
2959 		return prot->recvmsg(sk, msg, size, flags, NULL);
2960 #endif
2961 	return unix_stream_read_generic(&state, true);
2962 }
2963 
2964 static int unix_stream_splice_actor(struct sk_buff *skb,
2965 				    int skip, int chunk,
2966 				    struct unix_stream_read_state *state)
2967 {
2968 	return skb_splice_bits(skb, state->socket->sk,
2969 			       UNIXCB(skb).consumed + skip,
2970 			       state->pipe, chunk, state->splice_flags);
2971 }
2972 
2973 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2974 				       struct pipe_inode_info *pipe,
2975 				       size_t size, unsigned int flags)
2976 {
2977 	struct unix_stream_read_state state = {
2978 		.recv_actor = unix_stream_splice_actor,
2979 		.socket = sock,
2980 		.pipe = pipe,
2981 		.size = size,
2982 		.splice_flags = flags,
2983 	};
2984 
2985 	if (unlikely(*ppos))
2986 		return -ESPIPE;
2987 
2988 	if (sock->file->f_flags & O_NONBLOCK ||
2989 	    flags & SPLICE_F_NONBLOCK)
2990 		state.flags = MSG_DONTWAIT;
2991 
2992 	return unix_stream_read_generic(&state, false);
2993 }
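
/* Illustrative sketch (not kernel code): this path is reached via splice(2)
 * with the socket as the input side, e.g.
 *
 *	splice(stream_sock, NULL, pipefd[1], NULL, 4096, SPLICE_F_NONBLOCK);
 *
 * which moves queued socket data into the pipe without copying it through
 * user space; SPLICE_F_NONBLOCK maps to MSG_DONTWAIT above.
 */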
2994 
2995 static int unix_shutdown(struct socket *sock, int mode)
2996 {
2997 	struct sock *sk = sock->sk;
2998 	struct sock *other;
2999 
3000 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3001 		return -EINVAL;
3002 	/* This maps:
3003 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3004 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3005 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3006 	 */
3007 	++mode;
3008 
3009 	unix_state_lock(sk);
3010 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3011 	other = unix_peer(sk);
3012 	if (other)
3013 		sock_hold(other);
3014 	unix_state_unlock(sk);
3015 	sk->sk_state_change(sk);
3016 
3017 	if (other &&
3018 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3019 
3020 		int peer_mode = 0;
3021 		const struct proto *prot = READ_ONCE(other->sk_prot);
3022 
3023 		if (prot->unhash)
3024 			prot->unhash(other);
3025 		if (mode&RCV_SHUTDOWN)
3026 			peer_mode |= SEND_SHUTDOWN;
3027 		if (mode&SEND_SHUTDOWN)
3028 			peer_mode |= RCV_SHUTDOWN;
3029 		unix_state_lock(other);
3030 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3031 		unix_state_unlock(other);
3032 		other->sk_state_change(other);
3033 		if (peer_mode == SHUTDOWN_MASK)
3034 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3035 		else if (peer_mode & RCV_SHUTDOWN)
3036 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3037 	}
3038 	if (other)
3039 		sock_put(other);
3040 
3041 	return 0;
3042 }
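
/* Worked example of the mapping above: shutdown(fd, SHUT_WR) arrives here
 * with mode == 1, ++mode makes it SEND_SHUTDOWN (2), and for a connected
 * SOCK_STREAM/SOCK_SEQPACKET peer that is mirrored as RCV_SHUTDOWN, so the
 * peer's reads return 0 (EOF) once its queue drains, e.g.:
 *
 *	shutdown(a, SHUT_WR);			// we will not write any more
 *	n = read(b, buf, sizeof(buf));		// drains queued data, then 0
 */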
3043 
3044 long unix_inq_len(struct sock *sk)
3045 {
3046 	struct sk_buff *skb;
3047 	long amount = 0;
3048 
3049 	if (sk->sk_state == TCP_LISTEN)
3050 		return -EINVAL;
3051 
3052 	spin_lock(&sk->sk_receive_queue.lock);
3053 	if (sk->sk_type == SOCK_STREAM ||
3054 	    sk->sk_type == SOCK_SEQPACKET) {
3055 		skb_queue_walk(&sk->sk_receive_queue, skb)
3056 			amount += unix_skb_len(skb);
3057 	} else {
3058 		skb = skb_peek(&sk->sk_receive_queue);
3059 		if (skb)
3060 			amount = skb->len;
3061 	}
3062 	spin_unlock(&sk->sk_receive_queue.lock);
3063 
3064 	return amount;
3065 }
3066 EXPORT_SYMBOL_GPL(unix_inq_len);
3067 
3068 long unix_outq_len(struct sock *sk)
3069 {
3070 	return sk_wmem_alloc_get(sk);
3071 }
3072 EXPORT_SYMBOL_GPL(unix_outq_len);
3073 
3074 static int unix_open_file(struct sock *sk)
3075 {
3076 	struct path path;
3077 	struct file *f;
3078 	int fd;
3079 
3080 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3081 		return -EPERM;
3082 
3083 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3084 		return -ENOENT;
3085 
3086 	path = unix_sk(sk)->path;
3087 	if (!path.dentry)
3088 		return -ENOENT;
3089 
3090 	path_get(&path);
3091 
3092 	fd = get_unused_fd_flags(O_CLOEXEC);
3093 	if (fd < 0)
3094 		goto out;
3095 
3096 	f = dentry_open(&path, O_PATH, current_cred());
3097 	if (IS_ERR(f)) {
3098 		put_unused_fd(fd);
3099 		fd = PTR_ERR(f);
3100 		goto out;
3101 	}
3102 
3103 	fd_install(fd, f);
3104 out:
3105 	path_put(&path);
3106 
3107 	return fd;
3108 }
3109 
3110 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3111 {
3112 	struct sock *sk = sock->sk;
3113 	long amount = 0;
3114 	int err;
3115 
3116 	switch (cmd) {
3117 	case SIOCOUTQ:
3118 		amount = unix_outq_len(sk);
3119 		err = put_user(amount, (int __user *)arg);
3120 		break;
3121 	case SIOCINQ:
3122 		amount = unix_inq_len(sk);
3123 		if (amount < 0)
3124 			err = amount;
3125 		else
3126 			err = put_user(amount, (int __user *)arg);
3127 		break;
3128 	case SIOCUNIXFILE:
3129 		err = unix_open_file(sk);
3130 		break;
3131 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3132 	case SIOCATMARK:
3133 		{
3134 			struct sk_buff *skb;
3135 			int answ = 0;
3136 
3137 			skb = skb_peek(&sk->sk_receive_queue);
3138 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3139 				answ = 1;
3140 			err = put_user(answ, (int __user *)arg);
3141 		}
3142 		break;
3143 #endif
3144 	default:
3145 		err = -ENOIOCTLCMD;
3146 		break;
3147 	}
3148 	return err;
3149 }
3150 
3151 #ifdef CONFIG_COMPAT
3152 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3153 {
3154 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3155 }
3156 #endif
3157 
3158 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3159 {
3160 	struct sock *sk = sock->sk;
3161 	__poll_t mask;
3162 	u8 shutdown;
3163 
3164 	sock_poll_wait(file, sock, wait);
3165 	mask = 0;
3166 	shutdown = READ_ONCE(sk->sk_shutdown);
3167 
3168 	/* exceptional events? */
3169 	if (READ_ONCE(sk->sk_err))
3170 		mask |= EPOLLERR;
3171 	if (shutdown == SHUTDOWN_MASK)
3172 		mask |= EPOLLHUP;
3173 	if (shutdown & RCV_SHUTDOWN)
3174 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3175 
3176 	/* readable? */
3177 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3178 		mask |= EPOLLIN | EPOLLRDNORM;
3179 	if (sk_is_readable(sk))
3180 		mask |= EPOLLIN | EPOLLRDNORM;
3181 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3182 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3183 		mask |= EPOLLPRI;
3184 #endif
3185 
3186 	/* Connection-based need to check for termination and startup */
3187 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3188 	    sk->sk_state == TCP_CLOSE)
3189 		mask |= EPOLLHUP;
3190 
3191 	/*
3192 	 * We also report the socket as writable when the other side has shut
3193 	 * down the connection. This prevents stuck sockets.
3194 	 */
3195 	if (unix_writable(sk))
3196 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3197 
3198 	return mask;
3199 }
3200 
3201 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3202 				    poll_table *wait)
3203 {
3204 	struct sock *sk = sock->sk, *other;
3205 	unsigned int writable;
3206 	__poll_t mask;
3207 	u8 shutdown;
3208 
3209 	sock_poll_wait(file, sock, wait);
3210 	mask = 0;
3211 	shutdown = READ_ONCE(sk->sk_shutdown);
3212 
3213 	/* exceptional events? */
3214 	if (READ_ONCE(sk->sk_err) ||
3215 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3216 		mask |= EPOLLERR |
3217 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3218 
3219 	if (shutdown & RCV_SHUTDOWN)
3220 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3221 	if (shutdown == SHUTDOWN_MASK)
3222 		mask |= EPOLLHUP;
3223 
3224 	/* readable? */
3225 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3226 		mask |= EPOLLIN | EPOLLRDNORM;
3227 	if (sk_is_readable(sk))
3228 		mask |= EPOLLIN | EPOLLRDNORM;
3229 
3230 	/* Connection-based need to check for termination and startup */
3231 	if (sk->sk_type == SOCK_SEQPACKET) {
3232 		if (sk->sk_state == TCP_CLOSE)
3233 			mask |= EPOLLHUP;
3234 		/* connection hasn't started yet? */
3235 		if (sk->sk_state == TCP_SYN_SENT)
3236 			return mask;
3237 	}
3238 
3239 	/* No write status requested, avoid expensive OUT tests. */
3240 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3241 		return mask;
3242 
3243 	writable = unix_writable(sk);
3244 	if (writable) {
3245 		unix_state_lock(sk);
3246 
3247 		other = unix_peer(sk);
3248 		if (other && unix_peer(other) != sk &&
3249 		    unix_recvq_full_lockless(other) &&
3250 		    unix_dgram_peer_wake_me(sk, other))
3251 			writable = 0;
3252 
3253 		unix_state_unlock(sk);
3254 	}
3255 
3256 	if (writable)
3257 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3258 	else
3259 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3260 
3261 	return mask;
3262 }
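
/* Illustrative sketch (not kernel code): the connected-datagram writability
 * logic above from the caller's side - EPOLLOUT/POLLOUT is withheld while
 * the peer's receive queue is full, and the poller is woken once the peer
 * drains it.
 *
 *	struct pollfd pfd = { .fd = dgram_sock, .events = POLLOUT };
 *
 *	poll(&pfd, 1, -1);		// sleeps while the peer's queue is full
 *	if (pfd.revents & POLLOUT)
 *		send(dgram_sock, buf, len, 0);
 */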
3263 
3264 #ifdef CONFIG_PROC_FS
3265 
3266 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3267 
3268 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3269 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3270 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3271 
3272 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3273 {
3274 	unsigned long offset = get_offset(*pos);
3275 	unsigned long bucket = get_bucket(*pos);
3276 	unsigned long count = 0;
3277 	struct sock *sk;
3278 
3279 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3280 	     sk; sk = sk_next(sk)) {
3281 		if (++count == offset)
3282 			break;
3283 	}
3284 
3285 	return sk;
3286 }
3287 
3288 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3289 {
3290 	unsigned long bucket = get_bucket(*pos);
3291 	struct net *net = seq_file_net(seq);
3292 	struct sock *sk;
3293 
3294 	while (bucket < UNIX_HASH_SIZE) {
3295 		spin_lock(&net->unx.table.locks[bucket]);
3296 
3297 		sk = unix_from_bucket(seq, pos);
3298 		if (sk)
3299 			return sk;
3300 
3301 		spin_unlock(&net->unx.table.locks[bucket]);
3302 
3303 		*pos = set_bucket_offset(++bucket, 1);
3304 	}
3305 
3306 	return NULL;
3307 }
3308 
3309 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3310 				  loff_t *pos)
3311 {
3312 	unsigned long bucket = get_bucket(*pos);
3313 
3314 	sk = sk_next(sk);
3315 	if (sk)
3316 		return sk;
3317 
3318 
3319 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3320 
3321 	*pos = set_bucket_offset(++bucket, 1);
3322 
3323 	return unix_get_first(seq, pos);
3324 }
3325 
3326 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3327 {
3328 	if (!*pos)
3329 		return SEQ_START_TOKEN;
3330 
3331 	return unix_get_first(seq, pos);
3332 }
3333 
3334 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3335 {
3336 	++*pos;
3337 
3338 	if (v == SEQ_START_TOKEN)
3339 		return unix_get_first(seq, pos);
3340 
3341 	return unix_get_next(seq, v, pos);
3342 }
3343 
3344 static void unix_seq_stop(struct seq_file *seq, void *v)
3345 {
3346 	struct sock *sk = v;
3347 
3348 	if (sk)
3349 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3350 }
3351 
3352 static int unix_seq_show(struct seq_file *seq, void *v)
3353 {
3354 
3355 	if (v == SEQ_START_TOKEN)
3356 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3357 			 "Inode Path\n");
3358 	else {
3359 		struct sock *s = v;
3360 		struct unix_sock *u = unix_sk(s);
3361 		unix_state_lock(s);
3362 
3363 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3364 			s,
3365 			refcount_read(&s->sk_refcnt),
3366 			0,
3367 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3368 			s->sk_type,
3369 			s->sk_socket ?
3370 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3371 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3372 			sock_i_ino(s));
3373 
3374 		if (u->addr) {	/* under a hash table lock here */
3375 			int i, len;
3376 			seq_putc(seq, ' ');
3377 
3378 			i = 0;
3379 			len = u->addr->len -
3380 				offsetof(struct sockaddr_un, sun_path);
3381 			if (u->addr->name->sun_path[0]) {
3382 				len--;
3383 			} else {
3384 				seq_putc(seq, '@');
3385 				i++;
3386 			}
3387 			for ( ; i < len; i++)
3388 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3389 					 '@');
3390 		}
3391 		unix_state_unlock(s);
3392 		seq_putc(seq, '\n');
3393 	}
3394 
3395 	return 0;
3396 }
3397 
3398 static const struct seq_operations unix_seq_ops = {
3399 	.start  = unix_seq_start,
3400 	.next   = unix_seq_next,
3401 	.stop   = unix_seq_stop,
3402 	.show   = unix_seq_show,
3403 };
3404 
3405 #ifdef CONFIG_BPF_SYSCALL
3406 struct bpf_unix_iter_state {
3407 	struct seq_net_private p;
3408 	unsigned int cur_sk;
3409 	unsigned int end_sk;
3410 	unsigned int max_sk;
3411 	struct sock **batch;
3412 	bool st_bucket_done;
3413 };
3414 
3415 struct bpf_iter__unix {
3416 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3417 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3418 	uid_t uid __aligned(8);
3419 };
3420 
3421 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3422 			      struct unix_sock *unix_sk, uid_t uid)
3423 {
3424 	struct bpf_iter__unix ctx;
3425 
3426 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3427 	ctx.meta = meta;
3428 	ctx.unix_sk = unix_sk;
3429 	ctx.uid = uid;
3430 	return bpf_iter_run_prog(prog, &ctx);
3431 }
3432 
3433 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3434 
3435 {
3436 	struct bpf_unix_iter_state *iter = seq->private;
3437 	unsigned int expected = 1;
3438 	struct sock *sk;
3439 
3440 	sock_hold(start_sk);
3441 	iter->batch[iter->end_sk++] = start_sk;
3442 
3443 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3444 		if (iter->end_sk < iter->max_sk) {
3445 			sock_hold(sk);
3446 			iter->batch[iter->end_sk++] = sk;
3447 		}
3448 
3449 		expected++;
3450 	}
3451 
3452 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3453 
3454 	return expected;
3455 }
3456 
3457 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3458 {
3459 	while (iter->cur_sk < iter->end_sk)
3460 		sock_put(iter->batch[iter->cur_sk++]);
3461 }
3462 
3463 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3464 				       unsigned int new_batch_sz)
3465 {
3466 	struct sock **new_batch;
3467 
3468 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3469 			     GFP_USER | __GFP_NOWARN);
3470 	if (!new_batch)
3471 		return -ENOMEM;
3472 
3473 	bpf_iter_unix_put_batch(iter);
3474 	kvfree(iter->batch);
3475 	iter->batch = new_batch;
3476 	iter->max_sk = new_batch_sz;
3477 
3478 	return 0;
3479 }
3480 
3481 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3482 					loff_t *pos)
3483 {
3484 	struct bpf_unix_iter_state *iter = seq->private;
3485 	unsigned int expected;
3486 	bool resized = false;
3487 	struct sock *sk;
3488 
3489 	if (iter->st_bucket_done)
3490 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3491 
3492 again:
3493 	/* Get a new batch */
3494 	iter->cur_sk = 0;
3495 	iter->end_sk = 0;
3496 
3497 	sk = unix_get_first(seq, pos);
3498 	if (!sk)
3499 		return NULL; /* Done */
3500 
3501 	expected = bpf_iter_unix_hold_batch(seq, sk);
3502 
3503 	if (iter->end_sk == expected) {
3504 		iter->st_bucket_done = true;
3505 		return sk;
3506 	}
3507 
3508 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3509 		resized = true;
3510 		goto again;
3511 	}
3512 
3513 	return sk;
3514 }
3515 
3516 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3517 {
3518 	if (!*pos)
3519 		return SEQ_START_TOKEN;
3520 
3521 	/* bpf iter does not support lseek, so it always
3522 	 * continues from where it was stop()-ped.
3523 	 */
3524 	return bpf_iter_unix_batch(seq, pos);
3525 }
3526 
3527 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3528 {
3529 	struct bpf_unix_iter_state *iter = seq->private;
3530 	struct sock *sk;
3531 
3532 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
3533 	 * done with seq_show(), so advance to the next sk in
3534 	 * the batch.
3535 	 */
3536 	if (iter->cur_sk < iter->end_sk)
3537 		sock_put(iter->batch[iter->cur_sk++]);
3538 
3539 	++*pos;
3540 
3541 	if (iter->cur_sk < iter->end_sk)
3542 		sk = iter->batch[iter->cur_sk];
3543 	else
3544 		sk = bpf_iter_unix_batch(seq, pos);
3545 
3546 	return sk;
3547 }
3548 
3549 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3550 {
3551 	struct bpf_iter_meta meta;
3552 	struct bpf_prog *prog;
3553 	struct sock *sk = v;
3554 	uid_t uid;
3555 	bool slow;
3556 	int ret;
3557 
3558 	if (v == SEQ_START_TOKEN)
3559 		return 0;
3560 
3561 	slow = lock_sock_fast(sk);
3562 
3563 	if (unlikely(sk_unhashed(sk))) {
3564 		ret = SEQ_SKIP;
3565 		goto unlock;
3566 	}
3567 
3568 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3569 	meta.seq = seq;
3570 	prog = bpf_iter_get_info(&meta, false);
3571 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3572 unlock:
3573 	unlock_sock_fast(sk, slow);
3574 	return ret;
3575 }
3576 
3577 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3578 {
3579 	struct bpf_unix_iter_state *iter = seq->private;
3580 	struct bpf_iter_meta meta;
3581 	struct bpf_prog *prog;
3582 
3583 	if (!v) {
3584 		meta.seq = seq;
3585 		prog = bpf_iter_get_info(&meta, true);
3586 		if (prog)
3587 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3588 	}
3589 
3590 	if (iter->cur_sk < iter->end_sk)
3591 		bpf_iter_unix_put_batch(iter);
3592 }
3593 
3594 static const struct seq_operations bpf_iter_unix_seq_ops = {
3595 	.start	= bpf_iter_unix_seq_start,
3596 	.next	= bpf_iter_unix_seq_next,
3597 	.stop	= bpf_iter_unix_seq_stop,
3598 	.show	= bpf_iter_unix_seq_show,
3599 };
3600 #endif
3601 #endif
3602 
3603 static const struct net_proto_family unix_family_ops = {
3604 	.family = PF_UNIX,
3605 	.create = unix_create,
3606 	.owner	= THIS_MODULE,
3607 };
3608 
3609 
3610 static int __net_init unix_net_init(struct net *net)
3611 {
3612 	int i;
3613 
3614 	net->unx.sysctl_max_dgram_qlen = 10;
3615 	if (unix_sysctl_register(net))
3616 		goto out;
3617 
3618 #ifdef CONFIG_PROC_FS
3619 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3620 			     sizeof(struct seq_net_private)))
3621 		goto err_sysctl;
3622 #endif
3623 
3624 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3625 					      sizeof(spinlock_t), GFP_KERNEL);
3626 	if (!net->unx.table.locks)
3627 		goto err_proc;
3628 
3629 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3630 						sizeof(struct hlist_head),
3631 						GFP_KERNEL);
3632 	if (!net->unx.table.buckets)
3633 		goto free_locks;
3634 
3635 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3636 		spin_lock_init(&net->unx.table.locks[i]);
3637 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3638 	}
3639 
3640 	return 0;
3641 
3642 free_locks:
3643 	kvfree(net->unx.table.locks);
3644 err_proc:
3645 #ifdef CONFIG_PROC_FS
3646 	remove_proc_entry("unix", net->proc_net);
3647 err_sysctl:
3648 #endif
3649 	unix_sysctl_unregister(net);
3650 out:
3651 	return -ENOMEM;
3652 }
3653 
3654 static void __net_exit unix_net_exit(struct net *net)
3655 {
3656 	kvfree(net->unx.table.buckets);
3657 	kvfree(net->unx.table.locks);
3658 	unix_sysctl_unregister(net);
3659 	remove_proc_entry("unix", net->proc_net);
3660 }
3661 
3662 static struct pernet_operations unix_net_ops = {
3663 	.init = unix_net_init,
3664 	.exit = unix_net_exit,
3665 };
3666 
3667 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3668 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3669 		     struct unix_sock *unix_sk, uid_t uid)
3670 
3671 #define INIT_BATCH_SZ 16
3672 
3673 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3674 {
3675 	struct bpf_unix_iter_state *iter = priv_data;
3676 	int err;
3677 
3678 	err = bpf_iter_init_seq_net(priv_data, aux);
3679 	if (err)
3680 		return err;
3681 
3682 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3683 	if (err) {
3684 		bpf_iter_fini_seq_net(priv_data);
3685 		return err;
3686 	}
3687 
3688 	return 0;
3689 }
3690 
3691 static void bpf_iter_fini_unix(void *priv_data)
3692 {
3693 	struct bpf_unix_iter_state *iter = priv_data;
3694 
3695 	bpf_iter_fini_seq_net(priv_data);
3696 	kvfree(iter->batch);
3697 }
3698 
3699 static const struct bpf_iter_seq_info unix_seq_info = {
3700 	.seq_ops		= &bpf_iter_unix_seq_ops,
3701 	.init_seq_private	= bpf_iter_init_unix,
3702 	.fini_seq_private	= bpf_iter_fini_unix,
3703 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3704 };
3705 
3706 static const struct bpf_func_proto *
3707 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3708 			     const struct bpf_prog *prog)
3709 {
3710 	switch (func_id) {
3711 	case BPF_FUNC_setsockopt:
3712 		return &bpf_sk_setsockopt_proto;
3713 	case BPF_FUNC_getsockopt:
3714 		return &bpf_sk_getsockopt_proto;
3715 	default:
3716 		return NULL;
3717 	}
3718 }
3719 
3720 static struct bpf_iter_reg unix_reg_info = {
3721 	.target			= "unix",
3722 	.ctx_arg_info_size	= 1,
3723 	.ctx_arg_info		= {
3724 		{ offsetof(struct bpf_iter__unix, unix_sk),
3725 		  PTR_TO_BTF_ID_OR_NULL },
3726 	},
3727 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3728 	.seq_info		= &unix_seq_info,
3729 };
3730 
3731 static void __init bpf_iter_register(void)
3732 {
3733 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3734 	if (bpf_iter_reg_target(&unix_reg_info))
3735 		pr_warn("Warning: could not register bpf iterator unix\n");
3736 }
3737 #endif
3738 
3739 static int __init af_unix_init(void)
3740 {
3741 	int i, rc = -1;
3742 
3743 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3744 
3745 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3746 		spin_lock_init(&bsd_socket_locks[i]);
3747 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3748 	}
3749 
3750 	rc = proto_register(&unix_dgram_proto, 1);
3751 	if (rc != 0) {
3752 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3753 		goto out;
3754 	}
3755 
3756 	rc = proto_register(&unix_stream_proto, 1);
3757 	if (rc != 0) {
3758 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3759 		proto_unregister(&unix_dgram_proto);
3760 		goto out;
3761 	}
3762 
3763 	sock_register(&unix_family_ops);
3764 	register_pernet_subsys(&unix_net_ops);
3765 	unix_bpf_build_proto();
3766 
3767 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3768 	bpf_iter_register();
3769 #endif
3770 
3771 out:
3772 	return rc;
3773 }
3774 
3775 /* Later than subsys_initcall() because we depend on stuff initialised there */
3776 fs_initcall(af_unix_init);
3777