xref: /linux/net/unix/af_unix.c (revision 6500780cffa7f221431fa4a2ec1c2f6bc51dcb6b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					is been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 #include "scm.h"
122 
123 static atomic_long_t unix_nr_socks;
124 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
125 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
126 
127 /* SMP locking strategy:
128  *    hash table is protected with spinlock.
129  *    each socket state is protected by separate spinlock.
130  */
131 
132 static unsigned int unix_unbound_hash(struct sock *sk)
133 {
134 	unsigned long hash = (unsigned long)sk;
135 
136 	hash ^= hash >> 16;
137 	hash ^= hash >> 8;
138 	hash ^= sk->sk_type;
139 
140 	return hash & UNIX_HASH_MOD;
141 }
142 
143 static unsigned int unix_bsd_hash(struct inode *i)
144 {
145 	return i->i_ino & UNIX_HASH_MOD;
146 }
147 
148 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
149 				       int addr_len, int type)
150 {
151 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
152 	unsigned int hash;
153 
154 	hash = (__force unsigned int)csum_fold(csum);
155 	hash ^= hash >> 8;
156 	hash ^= type;
157 
158 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
159 }
160 
161 static void unix_table_double_lock(struct net *net,
162 				   unsigned int hash1, unsigned int hash2)
163 {
164 	if (hash1 == hash2) {
165 		spin_lock(&net->unx.table.locks[hash1]);
166 		return;
167 	}
168 
169 	if (hash1 > hash2)
170 		swap(hash1, hash2);
171 
172 	spin_lock(&net->unx.table.locks[hash1]);
173 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
174 }
175 
176 static void unix_table_double_unlock(struct net *net,
177 				     unsigned int hash1, unsigned int hash2)
178 {
179 	if (hash1 == hash2) {
180 		spin_unlock(&net->unx.table.locks[hash1]);
181 		return;
182 	}
183 
184 	spin_unlock(&net->unx.table.locks[hash1]);
185 	spin_unlock(&net->unx.table.locks[hash2]);
186 }
187 
188 #ifdef CONFIG_SECURITY_NETWORK
189 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
190 {
191 	UNIXCB(skb).secid = scm->secid;
192 }
193 
194 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
195 {
196 	scm->secid = UNIXCB(skb).secid;
197 }
198 
199 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
200 {
201 	return (scm->secid == UNIXCB(skb).secid);
202 }
203 #else
204 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
205 { }
206 
207 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
208 { }
209 
210 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
211 {
212 	return true;
213 }
214 #endif /* CONFIG_SECURITY_NETWORK */
215 
216 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
217 {
218 	return unix_peer(osk) == sk;
219 }
220 
221 static inline int unix_may_send(struct sock *sk, struct sock *osk)
222 {
223 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
224 }
225 
226 static inline int unix_recvq_full(const struct sock *sk)
227 {
228 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
229 }
230 
231 static inline int unix_recvq_full_lockless(const struct sock *sk)
232 {
233 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
234 		READ_ONCE(sk->sk_max_ack_backlog);
235 }
236 
237 struct sock *unix_peer_get(struct sock *s)
238 {
239 	struct sock *peer;
240 
241 	unix_state_lock(s);
242 	peer = unix_peer(s);
243 	if (peer)
244 		sock_hold(peer);
245 	unix_state_unlock(s);
246 	return peer;
247 }
248 EXPORT_SYMBOL_GPL(unix_peer_get);
249 
250 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
251 					     int addr_len)
252 {
253 	struct unix_address *addr;
254 
255 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
256 	if (!addr)
257 		return NULL;
258 
259 	refcount_set(&addr->refcnt, 1);
260 	addr->len = addr_len;
261 	memcpy(addr->name, sunaddr, addr_len);
262 
263 	return addr;
264 }
265 
266 static inline void unix_release_addr(struct unix_address *addr)
267 {
268 	if (refcount_dec_and_test(&addr->refcnt))
269 		kfree(addr);
270 }
271 
272 /*
273  *	Check unix socket name:
274  *		- should be not zero length.
275  *	        - if started by not zero, should be NULL terminated (FS object)
276  *		- if started by zero, it is abstract name.
277  */
278 
279 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
280 {
281 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
282 	    addr_len > sizeof(*sunaddr))
283 		return -EINVAL;
284 
285 	if (sunaddr->sun_family != AF_UNIX)
286 		return -EINVAL;
287 
288 	return 0;
289 }
290 
291 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
292 {
293 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
294 	short offset = offsetof(struct sockaddr_storage, __data);
295 
296 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
297 
298 	/* This may look like an off by one error but it is a bit more
299 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
300 	 * sun_path[108] doesn't as such exist.  However in kernel space
301 	 * we are guaranteed that it is a valid memory location in our
302 	 * kernel address buffer because syscall functions always pass
303 	 * a pointer of struct sockaddr_storage which has a bigger buffer
304 	 * than 108.  Also, we must terminate sun_path for strlen() in
305 	 * getname_kernel().
306 	 */
307 	addr->__data[addr_len - offset] = 0;
308 
309 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
310 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
311 	 * know the actual buffer.
312 	 */
313 	return strlen(addr->__data) + offset + 1;
314 }
315 
316 static void __unix_remove_socket(struct sock *sk)
317 {
318 	sk_del_node_init(sk);
319 }
320 
321 static void __unix_insert_socket(struct net *net, struct sock *sk)
322 {
323 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
324 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
325 }
326 
327 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
328 				 struct unix_address *addr, unsigned int hash)
329 {
330 	__unix_remove_socket(sk);
331 	smp_store_release(&unix_sk(sk)->addr, addr);
332 
333 	sk->sk_hash = hash;
334 	__unix_insert_socket(net, sk);
335 }
336 
337 static void unix_remove_socket(struct net *net, struct sock *sk)
338 {
339 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
340 	__unix_remove_socket(sk);
341 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
342 }
343 
344 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
345 {
346 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
347 	__unix_insert_socket(net, sk);
348 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
349 }
350 
351 static void unix_insert_bsd_socket(struct sock *sk)
352 {
353 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
354 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
355 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
356 }
357 
358 static void unix_remove_bsd_socket(struct sock *sk)
359 {
360 	if (!hlist_unhashed(&sk->sk_bind_node)) {
361 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
362 		__sk_del_bind_node(sk);
363 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
364 
365 		sk_node_init(&sk->sk_bind_node);
366 	}
367 }
368 
369 static struct sock *__unix_find_socket_byname(struct net *net,
370 					      struct sockaddr_un *sunname,
371 					      int len, unsigned int hash)
372 {
373 	struct sock *s;
374 
375 	sk_for_each(s, &net->unx.table.buckets[hash]) {
376 		struct unix_sock *u = unix_sk(s);
377 
378 		if (u->addr->len == len &&
379 		    !memcmp(u->addr->name, sunname, len))
380 			return s;
381 	}
382 	return NULL;
383 }
384 
385 static inline struct sock *unix_find_socket_byname(struct net *net,
386 						   struct sockaddr_un *sunname,
387 						   int len, unsigned int hash)
388 {
389 	struct sock *s;
390 
391 	spin_lock(&net->unx.table.locks[hash]);
392 	s = __unix_find_socket_byname(net, sunname, len, hash);
393 	if (s)
394 		sock_hold(s);
395 	spin_unlock(&net->unx.table.locks[hash]);
396 	return s;
397 }
398 
399 static struct sock *unix_find_socket_byinode(struct inode *i)
400 {
401 	unsigned int hash = unix_bsd_hash(i);
402 	struct sock *s;
403 
404 	spin_lock(&bsd_socket_locks[hash]);
405 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
406 		struct dentry *dentry = unix_sk(s)->path.dentry;
407 
408 		if (dentry && d_backing_inode(dentry) == i) {
409 			sock_hold(s);
410 			spin_unlock(&bsd_socket_locks[hash]);
411 			return s;
412 		}
413 	}
414 	spin_unlock(&bsd_socket_locks[hash]);
415 	return NULL;
416 }
417 
418 /* Support code for asymmetrically connected dgram sockets
419  *
420  * If a datagram socket is connected to a socket not itself connected
421  * to the first socket (eg, /dev/log), clients may only enqueue more
422  * messages if the present receive queue of the server socket is not
423  * "too large". This means there's a second writeability condition
424  * poll and sendmsg need to test. The dgram recv code will do a wake
425  * up on the peer_wait wait queue of a socket upon reception of a
426  * datagram which needs to be propagated to sleeping would-be writers
427  * since these might not have sent anything so far. This can't be
428  * accomplished via poll_wait because the lifetime of the server
429  * socket might be less than that of its clients if these break their
430  * association with it or if the server socket is closed while clients
431  * are still connected to it and there's no way to inform "a polling
432  * implementation" that it should let go of a certain wait queue
433  *
434  * In order to propagate a wake up, a wait_queue_entry_t of the client
435  * socket is enqueued on the peer_wait queue of the server socket
436  * whose wake function does a wake_up on the ordinary client socket
437  * wait queue. This connection is established whenever a write (or
438  * poll for write) hit the flow control condition and broken when the
439  * association to the server socket is dissolved or after a wake up
440  * was relayed.
441  */
442 
443 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
444 				      void *key)
445 {
446 	struct unix_sock *u;
447 	wait_queue_head_t *u_sleep;
448 
449 	u = container_of(q, struct unix_sock, peer_wake);
450 
451 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
452 			    q);
453 	u->peer_wake.private = NULL;
454 
455 	/* relaying can only happen while the wq still exists */
456 	u_sleep = sk_sleep(&u->sk);
457 	if (u_sleep)
458 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
459 
460 	return 0;
461 }
462 
463 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
464 {
465 	struct unix_sock *u, *u_other;
466 	int rc;
467 
468 	u = unix_sk(sk);
469 	u_other = unix_sk(other);
470 	rc = 0;
471 	spin_lock(&u_other->peer_wait.lock);
472 
473 	if (!u->peer_wake.private) {
474 		u->peer_wake.private = other;
475 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
476 
477 		rc = 1;
478 	}
479 
480 	spin_unlock(&u_other->peer_wait.lock);
481 	return rc;
482 }
483 
484 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
485 					    struct sock *other)
486 {
487 	struct unix_sock *u, *u_other;
488 
489 	u = unix_sk(sk);
490 	u_other = unix_sk(other);
491 	spin_lock(&u_other->peer_wait.lock);
492 
493 	if (u->peer_wake.private == other) {
494 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
495 		u->peer_wake.private = NULL;
496 	}
497 
498 	spin_unlock(&u_other->peer_wait.lock);
499 }
500 
501 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
502 						   struct sock *other)
503 {
504 	unix_dgram_peer_wake_disconnect(sk, other);
505 	wake_up_interruptible_poll(sk_sleep(sk),
506 				   EPOLLOUT |
507 				   EPOLLWRNORM |
508 				   EPOLLWRBAND);
509 }
510 
511 /* preconditions:
512  *	- unix_peer(sk) == other
513  *	- association is stable
514  */
515 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
516 {
517 	int connected;
518 
519 	connected = unix_dgram_peer_wake_connect(sk, other);
520 
521 	/* If other is SOCK_DEAD, we want to make sure we signal
522 	 * POLLOUT, such that a subsequent write() can get a
523 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
524 	 * to other and its full, we will hang waiting for POLLOUT.
525 	 */
526 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
527 		return 1;
528 
529 	if (connected)
530 		unix_dgram_peer_wake_disconnect(sk, other);
531 
532 	return 0;
533 }
534 
535 static int unix_writable(const struct sock *sk)
536 {
537 	return sk->sk_state != TCP_LISTEN &&
538 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
539 }
540 
541 static void unix_write_space(struct sock *sk)
542 {
543 	struct socket_wq *wq;
544 
545 	rcu_read_lock();
546 	if (unix_writable(sk)) {
547 		wq = rcu_dereference(sk->sk_wq);
548 		if (skwq_has_sleeper(wq))
549 			wake_up_interruptible_sync_poll(&wq->wait,
550 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
551 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
552 	}
553 	rcu_read_unlock();
554 }
555 
556 /* When dgram socket disconnects (or changes its peer), we clear its receive
557  * queue of packets arrived from previous peer. First, it allows to do
558  * flow control based only on wmem_alloc; second, sk connected to peer
559  * may receive messages only from that peer. */
560 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
561 {
562 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
563 		skb_queue_purge(&sk->sk_receive_queue);
564 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
565 
566 		/* If one link of bidirectional dgram pipe is disconnected,
567 		 * we signal error. Messages are lost. Do not make this,
568 		 * when peer was not connected to us.
569 		 */
570 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
571 			WRITE_ONCE(other->sk_err, ECONNRESET);
572 			sk_error_report(other);
573 		}
574 	}
575 	other->sk_state = TCP_CLOSE;
576 }
577 
578 static void unix_sock_destructor(struct sock *sk)
579 {
580 	struct unix_sock *u = unix_sk(sk);
581 
582 	skb_queue_purge(&sk->sk_receive_queue);
583 
584 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
585 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
586 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
587 	if (!sock_flag(sk, SOCK_DEAD)) {
588 		pr_info("Attempt to release alive unix socket: %p\n", sk);
589 		return;
590 	}
591 
592 	if (u->addr)
593 		unix_release_addr(u->addr);
594 
595 	atomic_long_dec(&unix_nr_socks);
596 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
597 #ifdef UNIX_REFCNT_DEBUG
598 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
599 		atomic_long_read(&unix_nr_socks));
600 #endif
601 }
602 
603 static void unix_release_sock(struct sock *sk, int embrion)
604 {
605 	struct unix_sock *u = unix_sk(sk);
606 	struct sock *skpair;
607 	struct sk_buff *skb;
608 	struct path path;
609 	int state;
610 
611 	unix_remove_socket(sock_net(sk), sk);
612 	unix_remove_bsd_socket(sk);
613 
614 	/* Clear state */
615 	unix_state_lock(sk);
616 	sock_orphan(sk);
617 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
618 	path	     = u->path;
619 	u->path.dentry = NULL;
620 	u->path.mnt = NULL;
621 	state = sk->sk_state;
622 	sk->sk_state = TCP_CLOSE;
623 
624 	skpair = unix_peer(sk);
625 	unix_peer(sk) = NULL;
626 
627 	unix_state_unlock(sk);
628 
629 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
630 	if (u->oob_skb) {
631 		kfree_skb(u->oob_skb);
632 		u->oob_skb = NULL;
633 	}
634 #endif
635 
636 	wake_up_interruptible_all(&u->peer_wait);
637 
638 	if (skpair != NULL) {
639 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
640 			unix_state_lock(skpair);
641 			/* No more writes */
642 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
643 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
644 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
645 			unix_state_unlock(skpair);
646 			skpair->sk_state_change(skpair);
647 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
648 		}
649 
650 		unix_dgram_peer_wake_disconnect(sk, skpair);
651 		sock_put(skpair); /* It may now die */
652 	}
653 
654 	/* Try to flush out this socket. Throw out buffers at least */
655 
656 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
657 		if (state == TCP_LISTEN)
658 			unix_release_sock(skb->sk, 1);
659 		/* passed fds are erased in the kfree_skb hook	      */
660 		UNIXCB(skb).consumed = skb->len;
661 		kfree_skb(skb);
662 	}
663 
664 	if (path.dentry)
665 		path_put(&path);
666 
667 	sock_put(sk);
668 
669 	/* ---- Socket is dead now and most probably destroyed ---- */
670 
671 	/*
672 	 * Fixme: BSD difference: In BSD all sockets connected to us get
673 	 *	  ECONNRESET and we die on the spot. In Linux we behave
674 	 *	  like files and pipes do and wait for the last
675 	 *	  dereference.
676 	 *
677 	 * Can't we simply set sock->err?
678 	 *
679 	 *	  What the above comment does talk about? --ANK(980817)
680 	 */
681 
682 	if (READ_ONCE(unix_tot_inflight))
683 		unix_gc();		/* Garbage collect fds */
684 }
685 
686 static void init_peercred(struct sock *sk)
687 {
688 	const struct cred *old_cred;
689 	struct pid *old_pid;
690 
691 	spin_lock(&sk->sk_peer_lock);
692 	old_pid = sk->sk_peer_pid;
693 	old_cred = sk->sk_peer_cred;
694 	sk->sk_peer_pid  = get_pid(task_tgid(current));
695 	sk->sk_peer_cred = get_current_cred();
696 	spin_unlock(&sk->sk_peer_lock);
697 
698 	put_pid(old_pid);
699 	put_cred(old_cred);
700 }
701 
702 static void copy_peercred(struct sock *sk, struct sock *peersk)
703 {
704 	const struct cred *old_cred;
705 	struct pid *old_pid;
706 
707 	if (sk < peersk) {
708 		spin_lock(&sk->sk_peer_lock);
709 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
710 	} else {
711 		spin_lock(&peersk->sk_peer_lock);
712 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
713 	}
714 	old_pid = sk->sk_peer_pid;
715 	old_cred = sk->sk_peer_cred;
716 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
717 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
718 
719 	spin_unlock(&sk->sk_peer_lock);
720 	spin_unlock(&peersk->sk_peer_lock);
721 
722 	put_pid(old_pid);
723 	put_cred(old_cred);
724 }
725 
726 static int unix_listen(struct socket *sock, int backlog)
727 {
728 	int err;
729 	struct sock *sk = sock->sk;
730 	struct unix_sock *u = unix_sk(sk);
731 
732 	err = -EOPNOTSUPP;
733 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
734 		goto out;	/* Only stream/seqpacket sockets accept */
735 	err = -EINVAL;
736 	if (!u->addr)
737 		goto out;	/* No listens on an unbound socket */
738 	unix_state_lock(sk);
739 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
740 		goto out_unlock;
741 	if (backlog > sk->sk_max_ack_backlog)
742 		wake_up_interruptible_all(&u->peer_wait);
743 	sk->sk_max_ack_backlog	= backlog;
744 	sk->sk_state		= TCP_LISTEN;
745 	/* set credentials so connect can copy them */
746 	init_peercred(sk);
747 	err = 0;
748 
749 out_unlock:
750 	unix_state_unlock(sk);
751 out:
752 	return err;
753 }
754 
755 static int unix_release(struct socket *);
756 static int unix_bind(struct socket *, struct sockaddr *, int);
757 static int unix_stream_connect(struct socket *, struct sockaddr *,
758 			       int addr_len, int flags);
759 static int unix_socketpair(struct socket *, struct socket *);
760 static int unix_accept(struct socket *, struct socket *, int, bool);
761 static int unix_getname(struct socket *, struct sockaddr *, int);
762 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
763 static __poll_t unix_dgram_poll(struct file *, struct socket *,
764 				    poll_table *);
765 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
766 #ifdef CONFIG_COMPAT
767 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
768 #endif
769 static int unix_shutdown(struct socket *, int);
770 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
771 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
772 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
773 				       struct pipe_inode_info *, size_t size,
774 				       unsigned int flags);
775 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
776 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
777 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
778 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
779 static int unix_dgram_connect(struct socket *, struct sockaddr *,
780 			      int, int);
781 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
782 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
783 				  int);
784 
785 static int unix_set_peek_off(struct sock *sk, int val)
786 {
787 	struct unix_sock *u = unix_sk(sk);
788 
789 	if (mutex_lock_interruptible(&u->iolock))
790 		return -EINTR;
791 
792 	WRITE_ONCE(sk->sk_peek_off, val);
793 	mutex_unlock(&u->iolock);
794 
795 	return 0;
796 }
797 
798 #ifdef CONFIG_PROC_FS
799 static int unix_count_nr_fds(struct sock *sk)
800 {
801 	struct sk_buff *skb;
802 	struct unix_sock *u;
803 	int nr_fds = 0;
804 
805 	spin_lock(&sk->sk_receive_queue.lock);
806 	skb = skb_peek(&sk->sk_receive_queue);
807 	while (skb) {
808 		u = unix_sk(skb->sk);
809 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
810 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
811 	}
812 	spin_unlock(&sk->sk_receive_queue.lock);
813 
814 	return nr_fds;
815 }
816 
817 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
818 {
819 	struct sock *sk = sock->sk;
820 	unsigned char s_state;
821 	struct unix_sock *u;
822 	int nr_fds = 0;
823 
824 	if (sk) {
825 		s_state = READ_ONCE(sk->sk_state);
826 		u = unix_sk(sk);
827 
828 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
829 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
830 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
831 		 */
832 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
833 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
834 		else if (s_state == TCP_LISTEN)
835 			nr_fds = unix_count_nr_fds(sk);
836 
837 		seq_printf(m, "scm_fds: %u\n", nr_fds);
838 	}
839 }
840 #else
841 #define unix_show_fdinfo NULL
842 #endif
843 
844 static const struct proto_ops unix_stream_ops = {
845 	.family =	PF_UNIX,
846 	.owner =	THIS_MODULE,
847 	.release =	unix_release,
848 	.bind =		unix_bind,
849 	.connect =	unix_stream_connect,
850 	.socketpair =	unix_socketpair,
851 	.accept =	unix_accept,
852 	.getname =	unix_getname,
853 	.poll =		unix_poll,
854 	.ioctl =	unix_ioctl,
855 #ifdef CONFIG_COMPAT
856 	.compat_ioctl =	unix_compat_ioctl,
857 #endif
858 	.listen =	unix_listen,
859 	.shutdown =	unix_shutdown,
860 	.sendmsg =	unix_stream_sendmsg,
861 	.recvmsg =	unix_stream_recvmsg,
862 	.read_skb =	unix_stream_read_skb,
863 	.mmap =		sock_no_mmap,
864 	.splice_read =	unix_stream_splice_read,
865 	.set_peek_off =	unix_set_peek_off,
866 	.show_fdinfo =	unix_show_fdinfo,
867 };
868 
869 static const struct proto_ops unix_dgram_ops = {
870 	.family =	PF_UNIX,
871 	.owner =	THIS_MODULE,
872 	.release =	unix_release,
873 	.bind =		unix_bind,
874 	.connect =	unix_dgram_connect,
875 	.socketpair =	unix_socketpair,
876 	.accept =	sock_no_accept,
877 	.getname =	unix_getname,
878 	.poll =		unix_dgram_poll,
879 	.ioctl =	unix_ioctl,
880 #ifdef CONFIG_COMPAT
881 	.compat_ioctl =	unix_compat_ioctl,
882 #endif
883 	.listen =	sock_no_listen,
884 	.shutdown =	unix_shutdown,
885 	.sendmsg =	unix_dgram_sendmsg,
886 	.read_skb =	unix_read_skb,
887 	.recvmsg =	unix_dgram_recvmsg,
888 	.mmap =		sock_no_mmap,
889 	.set_peek_off =	unix_set_peek_off,
890 	.show_fdinfo =	unix_show_fdinfo,
891 };
892 
893 static const struct proto_ops unix_seqpacket_ops = {
894 	.family =	PF_UNIX,
895 	.owner =	THIS_MODULE,
896 	.release =	unix_release,
897 	.bind =		unix_bind,
898 	.connect =	unix_stream_connect,
899 	.socketpair =	unix_socketpair,
900 	.accept =	unix_accept,
901 	.getname =	unix_getname,
902 	.poll =		unix_dgram_poll,
903 	.ioctl =	unix_ioctl,
904 #ifdef CONFIG_COMPAT
905 	.compat_ioctl =	unix_compat_ioctl,
906 #endif
907 	.listen =	unix_listen,
908 	.shutdown =	unix_shutdown,
909 	.sendmsg =	unix_seqpacket_sendmsg,
910 	.recvmsg =	unix_seqpacket_recvmsg,
911 	.mmap =		sock_no_mmap,
912 	.set_peek_off =	unix_set_peek_off,
913 	.show_fdinfo =	unix_show_fdinfo,
914 };
915 
916 static void unix_close(struct sock *sk, long timeout)
917 {
918 	/* Nothing to do here, unix socket does not need a ->close().
919 	 * This is merely for sockmap.
920 	 */
921 }
922 
923 static void unix_unhash(struct sock *sk)
924 {
925 	/* Nothing to do here, unix socket does not need a ->unhash().
926 	 * This is merely for sockmap.
927 	 */
928 }
929 
930 static bool unix_bpf_bypass_getsockopt(int level, int optname)
931 {
932 	if (level == SOL_SOCKET) {
933 		switch (optname) {
934 		case SO_PEERPIDFD:
935 			return true;
936 		default:
937 			return false;
938 		}
939 	}
940 
941 	return false;
942 }
943 
944 struct proto unix_dgram_proto = {
945 	.name			= "UNIX",
946 	.owner			= THIS_MODULE,
947 	.obj_size		= sizeof(struct unix_sock),
948 	.close			= unix_close,
949 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
950 #ifdef CONFIG_BPF_SYSCALL
951 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
952 #endif
953 };
954 
955 struct proto unix_stream_proto = {
956 	.name			= "UNIX-STREAM",
957 	.owner			= THIS_MODULE,
958 	.obj_size		= sizeof(struct unix_sock),
959 	.close			= unix_close,
960 	.unhash			= unix_unhash,
961 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
962 #ifdef CONFIG_BPF_SYSCALL
963 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
964 #endif
965 };
966 
967 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
968 {
969 	struct unix_sock *u;
970 	struct sock *sk;
971 	int err;
972 
973 	atomic_long_inc(&unix_nr_socks);
974 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
975 		err = -ENFILE;
976 		goto err;
977 	}
978 
979 	if (type == SOCK_STREAM)
980 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
981 	else /*dgram and  seqpacket */
982 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
983 
984 	if (!sk) {
985 		err = -ENOMEM;
986 		goto err;
987 	}
988 
989 	sock_init_data(sock, sk);
990 
991 	sk->sk_hash		= unix_unbound_hash(sk);
992 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
993 	sk->sk_write_space	= unix_write_space;
994 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
995 	sk->sk_destruct		= unix_sock_destructor;
996 	u = unix_sk(sk);
997 	u->inflight = 0;
998 	u->path.dentry = NULL;
999 	u->path.mnt = NULL;
1000 	spin_lock_init(&u->lock);
1001 	INIT_LIST_HEAD(&u->link);
1002 	mutex_init(&u->iolock); /* single task reading lock */
1003 	mutex_init(&u->bindlock); /* single task binding lock */
1004 	init_waitqueue_head(&u->peer_wait);
1005 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1006 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1007 	unix_insert_unbound_socket(net, sk);
1008 
1009 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1010 
1011 	return sk;
1012 
1013 err:
1014 	atomic_long_dec(&unix_nr_socks);
1015 	return ERR_PTR(err);
1016 }
1017 
1018 static int unix_create(struct net *net, struct socket *sock, int protocol,
1019 		       int kern)
1020 {
1021 	struct sock *sk;
1022 
1023 	if (protocol && protocol != PF_UNIX)
1024 		return -EPROTONOSUPPORT;
1025 
1026 	sock->state = SS_UNCONNECTED;
1027 
1028 	switch (sock->type) {
1029 	case SOCK_STREAM:
1030 		sock->ops = &unix_stream_ops;
1031 		break;
1032 		/*
1033 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1034 		 *	nothing uses it.
1035 		 */
1036 	case SOCK_RAW:
1037 		sock->type = SOCK_DGRAM;
1038 		fallthrough;
1039 	case SOCK_DGRAM:
1040 		sock->ops = &unix_dgram_ops;
1041 		break;
1042 	case SOCK_SEQPACKET:
1043 		sock->ops = &unix_seqpacket_ops;
1044 		break;
1045 	default:
1046 		return -ESOCKTNOSUPPORT;
1047 	}
1048 
1049 	sk = unix_create1(net, sock, kern, sock->type);
1050 	if (IS_ERR(sk))
1051 		return PTR_ERR(sk);
1052 
1053 	return 0;
1054 }
1055 
1056 static int unix_release(struct socket *sock)
1057 {
1058 	struct sock *sk = sock->sk;
1059 
1060 	if (!sk)
1061 		return 0;
1062 
1063 	sk->sk_prot->close(sk, 0);
1064 	unix_release_sock(sk, 0);
1065 	sock->sk = NULL;
1066 
1067 	return 0;
1068 }
1069 
1070 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1071 				  int type)
1072 {
1073 	struct inode *inode;
1074 	struct path path;
1075 	struct sock *sk;
1076 	int err;
1077 
1078 	unix_mkname_bsd(sunaddr, addr_len);
1079 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1080 	if (err)
1081 		goto fail;
1082 
1083 	err = path_permission(&path, MAY_WRITE);
1084 	if (err)
1085 		goto path_put;
1086 
1087 	err = -ECONNREFUSED;
1088 	inode = d_backing_inode(path.dentry);
1089 	if (!S_ISSOCK(inode->i_mode))
1090 		goto path_put;
1091 
1092 	sk = unix_find_socket_byinode(inode);
1093 	if (!sk)
1094 		goto path_put;
1095 
1096 	err = -EPROTOTYPE;
1097 	if (sk->sk_type == type)
1098 		touch_atime(&path);
1099 	else
1100 		goto sock_put;
1101 
1102 	path_put(&path);
1103 
1104 	return sk;
1105 
1106 sock_put:
1107 	sock_put(sk);
1108 path_put:
1109 	path_put(&path);
1110 fail:
1111 	return ERR_PTR(err);
1112 }
1113 
1114 static struct sock *unix_find_abstract(struct net *net,
1115 				       struct sockaddr_un *sunaddr,
1116 				       int addr_len, int type)
1117 {
1118 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1119 	struct dentry *dentry;
1120 	struct sock *sk;
1121 
1122 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1123 	if (!sk)
1124 		return ERR_PTR(-ECONNREFUSED);
1125 
1126 	dentry = unix_sk(sk)->path.dentry;
1127 	if (dentry)
1128 		touch_atime(&unix_sk(sk)->path);
1129 
1130 	return sk;
1131 }
1132 
1133 static struct sock *unix_find_other(struct net *net,
1134 				    struct sockaddr_un *sunaddr,
1135 				    int addr_len, int type)
1136 {
1137 	struct sock *sk;
1138 
1139 	if (sunaddr->sun_path[0])
1140 		sk = unix_find_bsd(sunaddr, addr_len, type);
1141 	else
1142 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1143 
1144 	return sk;
1145 }
1146 
1147 static int unix_autobind(struct sock *sk)
1148 {
1149 	unsigned int new_hash, old_hash = sk->sk_hash;
1150 	struct unix_sock *u = unix_sk(sk);
1151 	struct net *net = sock_net(sk);
1152 	struct unix_address *addr;
1153 	u32 lastnum, ordernum;
1154 	int err;
1155 
1156 	err = mutex_lock_interruptible(&u->bindlock);
1157 	if (err)
1158 		return err;
1159 
1160 	if (u->addr)
1161 		goto out;
1162 
1163 	err = -ENOMEM;
1164 	addr = kzalloc(sizeof(*addr) +
1165 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1166 	if (!addr)
1167 		goto out;
1168 
1169 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1170 	addr->name->sun_family = AF_UNIX;
1171 	refcount_set(&addr->refcnt, 1);
1172 
1173 	ordernum = get_random_u32();
1174 	lastnum = ordernum & 0xFFFFF;
1175 retry:
1176 	ordernum = (ordernum + 1) & 0xFFFFF;
1177 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1178 
1179 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1180 	unix_table_double_lock(net, old_hash, new_hash);
1181 
1182 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1183 		unix_table_double_unlock(net, old_hash, new_hash);
1184 
1185 		/* __unix_find_socket_byname() may take long time if many names
1186 		 * are already in use.
1187 		 */
1188 		cond_resched();
1189 
1190 		if (ordernum == lastnum) {
1191 			/* Give up if all names seems to be in use. */
1192 			err = -ENOSPC;
1193 			unix_release_addr(addr);
1194 			goto out;
1195 		}
1196 
1197 		goto retry;
1198 	}
1199 
1200 	__unix_set_addr_hash(net, sk, addr, new_hash);
1201 	unix_table_double_unlock(net, old_hash, new_hash);
1202 	err = 0;
1203 
1204 out:	mutex_unlock(&u->bindlock);
1205 	return err;
1206 }
1207 
1208 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1209 			 int addr_len)
1210 {
1211 	umode_t mode = S_IFSOCK |
1212 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1213 	unsigned int new_hash, old_hash = sk->sk_hash;
1214 	struct unix_sock *u = unix_sk(sk);
1215 	struct net *net = sock_net(sk);
1216 	struct mnt_idmap *idmap;
1217 	struct unix_address *addr;
1218 	struct dentry *dentry;
1219 	struct path parent;
1220 	int err;
1221 
1222 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1223 	addr = unix_create_addr(sunaddr, addr_len);
1224 	if (!addr)
1225 		return -ENOMEM;
1226 
1227 	/*
1228 	 * Get the parent directory, calculate the hash for last
1229 	 * component.
1230 	 */
1231 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1232 	if (IS_ERR(dentry)) {
1233 		err = PTR_ERR(dentry);
1234 		goto out;
1235 	}
1236 
1237 	/*
1238 	 * All right, let's create it.
1239 	 */
1240 	idmap = mnt_idmap(parent.mnt);
1241 	err = security_path_mknod(&parent, dentry, mode, 0);
1242 	if (!err)
1243 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1244 	if (err)
1245 		goto out_path;
1246 	err = mutex_lock_interruptible(&u->bindlock);
1247 	if (err)
1248 		goto out_unlink;
1249 	if (u->addr)
1250 		goto out_unlock;
1251 
1252 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1253 	unix_table_double_lock(net, old_hash, new_hash);
1254 	u->path.mnt = mntget(parent.mnt);
1255 	u->path.dentry = dget(dentry);
1256 	__unix_set_addr_hash(net, sk, addr, new_hash);
1257 	unix_table_double_unlock(net, old_hash, new_hash);
1258 	unix_insert_bsd_socket(sk);
1259 	mutex_unlock(&u->bindlock);
1260 	done_path_create(&parent, dentry);
1261 	return 0;
1262 
1263 out_unlock:
1264 	mutex_unlock(&u->bindlock);
1265 	err = -EINVAL;
1266 out_unlink:
1267 	/* failed after successful mknod?  unlink what we'd created... */
1268 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1269 out_path:
1270 	done_path_create(&parent, dentry);
1271 out:
1272 	unix_release_addr(addr);
1273 	return err == -EEXIST ? -EADDRINUSE : err;
1274 }
1275 
1276 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1277 			      int addr_len)
1278 {
1279 	unsigned int new_hash, old_hash = sk->sk_hash;
1280 	struct unix_sock *u = unix_sk(sk);
1281 	struct net *net = sock_net(sk);
1282 	struct unix_address *addr;
1283 	int err;
1284 
1285 	addr = unix_create_addr(sunaddr, addr_len);
1286 	if (!addr)
1287 		return -ENOMEM;
1288 
1289 	err = mutex_lock_interruptible(&u->bindlock);
1290 	if (err)
1291 		goto out;
1292 
1293 	if (u->addr) {
1294 		err = -EINVAL;
1295 		goto out_mutex;
1296 	}
1297 
1298 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1299 	unix_table_double_lock(net, old_hash, new_hash);
1300 
1301 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1302 		goto out_spin;
1303 
1304 	__unix_set_addr_hash(net, sk, addr, new_hash);
1305 	unix_table_double_unlock(net, old_hash, new_hash);
1306 	mutex_unlock(&u->bindlock);
1307 	return 0;
1308 
1309 out_spin:
1310 	unix_table_double_unlock(net, old_hash, new_hash);
1311 	err = -EADDRINUSE;
1312 out_mutex:
1313 	mutex_unlock(&u->bindlock);
1314 out:
1315 	unix_release_addr(addr);
1316 	return err;
1317 }
1318 
1319 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1320 {
1321 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1322 	struct sock *sk = sock->sk;
1323 	int err;
1324 
1325 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1326 	    sunaddr->sun_family == AF_UNIX)
1327 		return unix_autobind(sk);
1328 
1329 	err = unix_validate_addr(sunaddr, addr_len);
1330 	if (err)
1331 		return err;
1332 
1333 	if (sunaddr->sun_path[0])
1334 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1335 	else
1336 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1337 
1338 	return err;
1339 }
1340 
1341 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1342 {
1343 	if (unlikely(sk1 == sk2) || !sk2) {
1344 		unix_state_lock(sk1);
1345 		return;
1346 	}
1347 	if (sk1 < sk2) {
1348 		unix_state_lock(sk1);
1349 		unix_state_lock_nested(sk2);
1350 	} else {
1351 		unix_state_lock(sk2);
1352 		unix_state_lock_nested(sk1);
1353 	}
1354 }
1355 
1356 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1357 {
1358 	if (unlikely(sk1 == sk2) || !sk2) {
1359 		unix_state_unlock(sk1);
1360 		return;
1361 	}
1362 	unix_state_unlock(sk1);
1363 	unix_state_unlock(sk2);
1364 }
1365 
1366 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1367 			      int alen, int flags)
1368 {
1369 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1370 	struct sock *sk = sock->sk;
1371 	struct sock *other;
1372 	int err;
1373 
1374 	err = -EINVAL;
1375 	if (alen < offsetofend(struct sockaddr, sa_family))
1376 		goto out;
1377 
1378 	if (addr->sa_family != AF_UNSPEC) {
1379 		err = unix_validate_addr(sunaddr, alen);
1380 		if (err)
1381 			goto out;
1382 
1383 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1384 		if (err)
1385 			goto out;
1386 
1387 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1388 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1389 		    !unix_sk(sk)->addr) {
1390 			err = unix_autobind(sk);
1391 			if (err)
1392 				goto out;
1393 		}
1394 
1395 restart:
1396 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1397 		if (IS_ERR(other)) {
1398 			err = PTR_ERR(other);
1399 			goto out;
1400 		}
1401 
1402 		unix_state_double_lock(sk, other);
1403 
1404 		/* Apparently VFS overslept socket death. Retry. */
1405 		if (sock_flag(other, SOCK_DEAD)) {
1406 			unix_state_double_unlock(sk, other);
1407 			sock_put(other);
1408 			goto restart;
1409 		}
1410 
1411 		err = -EPERM;
1412 		if (!unix_may_send(sk, other))
1413 			goto out_unlock;
1414 
1415 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1416 		if (err)
1417 			goto out_unlock;
1418 
1419 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1420 	} else {
1421 		/*
1422 		 *	1003.1g breaking connected state with AF_UNSPEC
1423 		 */
1424 		other = NULL;
1425 		unix_state_double_lock(sk, other);
1426 	}
1427 
1428 	/*
1429 	 * If it was connected, reconnect.
1430 	 */
1431 	if (unix_peer(sk)) {
1432 		struct sock *old_peer = unix_peer(sk);
1433 
1434 		unix_peer(sk) = other;
1435 		if (!other)
1436 			sk->sk_state = TCP_CLOSE;
1437 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1438 
1439 		unix_state_double_unlock(sk, other);
1440 
1441 		if (other != old_peer)
1442 			unix_dgram_disconnected(sk, old_peer);
1443 		sock_put(old_peer);
1444 	} else {
1445 		unix_peer(sk) = other;
1446 		unix_state_double_unlock(sk, other);
1447 	}
1448 
1449 	return 0;
1450 
1451 out_unlock:
1452 	unix_state_double_unlock(sk, other);
1453 	sock_put(other);
1454 out:
1455 	return err;
1456 }
1457 
1458 static long unix_wait_for_peer(struct sock *other, long timeo)
1459 	__releases(&unix_sk(other)->lock)
1460 {
1461 	struct unix_sock *u = unix_sk(other);
1462 	int sched;
1463 	DEFINE_WAIT(wait);
1464 
1465 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1466 
1467 	sched = !sock_flag(other, SOCK_DEAD) &&
1468 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1469 		unix_recvq_full_lockless(other);
1470 
1471 	unix_state_unlock(other);
1472 
1473 	if (sched)
1474 		timeo = schedule_timeout(timeo);
1475 
1476 	finish_wait(&u->peer_wait, &wait);
1477 	return timeo;
1478 }
1479 
1480 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1481 			       int addr_len, int flags)
1482 {
1483 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1484 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1485 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1486 	struct net *net = sock_net(sk);
1487 	struct sk_buff *skb = NULL;
1488 	long timeo;
1489 	int err;
1490 	int st;
1491 
1492 	err = unix_validate_addr(sunaddr, addr_len);
1493 	if (err)
1494 		goto out;
1495 
1496 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1497 	if (err)
1498 		goto out;
1499 
1500 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1501 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1502 		err = unix_autobind(sk);
1503 		if (err)
1504 			goto out;
1505 	}
1506 
1507 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1508 
1509 	/* First of all allocate resources.
1510 	   If we will make it after state is locked,
1511 	   we will have to recheck all again in any case.
1512 	 */
1513 
1514 	/* create new sock for complete connection */
1515 	newsk = unix_create1(net, NULL, 0, sock->type);
1516 	if (IS_ERR(newsk)) {
1517 		err = PTR_ERR(newsk);
1518 		newsk = NULL;
1519 		goto out;
1520 	}
1521 
1522 	err = -ENOMEM;
1523 
1524 	/* Allocate skb for sending to listening sock */
1525 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1526 	if (skb == NULL)
1527 		goto out;
1528 
1529 restart:
1530 	/*  Find listening sock. */
1531 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1532 	if (IS_ERR(other)) {
1533 		err = PTR_ERR(other);
1534 		other = NULL;
1535 		goto out;
1536 	}
1537 
1538 	/* Latch state of peer */
1539 	unix_state_lock(other);
1540 
1541 	/* Apparently VFS overslept socket death. Retry. */
1542 	if (sock_flag(other, SOCK_DEAD)) {
1543 		unix_state_unlock(other);
1544 		sock_put(other);
1545 		goto restart;
1546 	}
1547 
1548 	err = -ECONNREFUSED;
1549 	if (other->sk_state != TCP_LISTEN)
1550 		goto out_unlock;
1551 	if (other->sk_shutdown & RCV_SHUTDOWN)
1552 		goto out_unlock;
1553 
1554 	if (unix_recvq_full(other)) {
1555 		err = -EAGAIN;
1556 		if (!timeo)
1557 			goto out_unlock;
1558 
1559 		timeo = unix_wait_for_peer(other, timeo);
1560 
1561 		err = sock_intr_errno(timeo);
1562 		if (signal_pending(current))
1563 			goto out;
1564 		sock_put(other);
1565 		goto restart;
1566 	}
1567 
1568 	/* Latch our state.
1569 
1570 	   It is tricky place. We need to grab our state lock and cannot
1571 	   drop lock on peer. It is dangerous because deadlock is
1572 	   possible. Connect to self case and simultaneous
1573 	   attempt to connect are eliminated by checking socket
1574 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1575 	   check this before attempt to grab lock.
1576 
1577 	   Well, and we have to recheck the state after socket locked.
1578 	 */
1579 	st = sk->sk_state;
1580 
1581 	switch (st) {
1582 	case TCP_CLOSE:
1583 		/* This is ok... continue with connect */
1584 		break;
1585 	case TCP_ESTABLISHED:
1586 		/* Socket is already connected */
1587 		err = -EISCONN;
1588 		goto out_unlock;
1589 	default:
1590 		err = -EINVAL;
1591 		goto out_unlock;
1592 	}
1593 
1594 	unix_state_lock_nested(sk);
1595 
1596 	if (sk->sk_state != st) {
1597 		unix_state_unlock(sk);
1598 		unix_state_unlock(other);
1599 		sock_put(other);
1600 		goto restart;
1601 	}
1602 
1603 	err = security_unix_stream_connect(sk, other, newsk);
1604 	if (err) {
1605 		unix_state_unlock(sk);
1606 		goto out_unlock;
1607 	}
1608 
1609 	/* The way is open! Fastly set all the necessary fields... */
1610 
1611 	sock_hold(sk);
1612 	unix_peer(newsk)	= sk;
1613 	newsk->sk_state		= TCP_ESTABLISHED;
1614 	newsk->sk_type		= sk->sk_type;
1615 	init_peercred(newsk);
1616 	newu = unix_sk(newsk);
1617 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1618 	otheru = unix_sk(other);
1619 
1620 	/* copy address information from listening to new sock
1621 	 *
1622 	 * The contents of *(otheru->addr) and otheru->path
1623 	 * are seen fully set up here, since we have found
1624 	 * otheru in hash under its lock.  Insertion into the
1625 	 * hash chain we'd found it in had been done in an
1626 	 * earlier critical area protected by the chain's lock,
1627 	 * the same one where we'd set *(otheru->addr) contents,
1628 	 * as well as otheru->path and otheru->addr itself.
1629 	 *
1630 	 * Using smp_store_release() here to set newu->addr
1631 	 * is enough to make those stores, as well as stores
1632 	 * to newu->path visible to anyone who gets newu->addr
1633 	 * by smp_load_acquire().  IOW, the same warranties
1634 	 * as for unix_sock instances bound in unix_bind() or
1635 	 * in unix_autobind().
1636 	 */
1637 	if (otheru->path.dentry) {
1638 		path_get(&otheru->path);
1639 		newu->path = otheru->path;
1640 	}
1641 	refcount_inc(&otheru->addr->refcnt);
1642 	smp_store_release(&newu->addr, otheru->addr);
1643 
1644 	/* Set credentials */
1645 	copy_peercred(sk, other);
1646 
1647 	sock->state	= SS_CONNECTED;
1648 	sk->sk_state	= TCP_ESTABLISHED;
1649 	sock_hold(newsk);
1650 
1651 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1652 	unix_peer(sk)	= newsk;
1653 
1654 	unix_state_unlock(sk);
1655 
1656 	/* take ten and send info to listening sock */
1657 	spin_lock(&other->sk_receive_queue.lock);
1658 	__skb_queue_tail(&other->sk_receive_queue, skb);
1659 	spin_unlock(&other->sk_receive_queue.lock);
1660 	unix_state_unlock(other);
1661 	other->sk_data_ready(other);
1662 	sock_put(other);
1663 	return 0;
1664 
1665 out_unlock:
1666 	if (other)
1667 		unix_state_unlock(other);
1668 
1669 out:
1670 	kfree_skb(skb);
1671 	if (newsk)
1672 		unix_release_sock(newsk, 0);
1673 	if (other)
1674 		sock_put(other);
1675 	return err;
1676 }
1677 
1678 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1679 {
1680 	struct sock *ska = socka->sk, *skb = sockb->sk;
1681 
1682 	/* Join our sockets back to back */
1683 	sock_hold(ska);
1684 	sock_hold(skb);
1685 	unix_peer(ska) = skb;
1686 	unix_peer(skb) = ska;
1687 	init_peercred(ska);
1688 	init_peercred(skb);
1689 
1690 	ska->sk_state = TCP_ESTABLISHED;
1691 	skb->sk_state = TCP_ESTABLISHED;
1692 	socka->state  = SS_CONNECTED;
1693 	sockb->state  = SS_CONNECTED;
1694 	return 0;
1695 }
1696 
1697 static void unix_sock_inherit_flags(const struct socket *old,
1698 				    struct socket *new)
1699 {
1700 	if (test_bit(SOCK_PASSCRED, &old->flags))
1701 		set_bit(SOCK_PASSCRED, &new->flags);
1702 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1703 		set_bit(SOCK_PASSPIDFD, &new->flags);
1704 	if (test_bit(SOCK_PASSSEC, &old->flags))
1705 		set_bit(SOCK_PASSSEC, &new->flags);
1706 }
1707 
1708 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1709 		       bool kern)
1710 {
1711 	struct sock *sk = sock->sk;
1712 	struct sock *tsk;
1713 	struct sk_buff *skb;
1714 	int err;
1715 
1716 	err = -EOPNOTSUPP;
1717 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1718 		goto out;
1719 
1720 	err = -EINVAL;
1721 	if (sk->sk_state != TCP_LISTEN)
1722 		goto out;
1723 
1724 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1725 	 * so that no locks are necessary.
1726 	 */
1727 
1728 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1729 				&err);
1730 	if (!skb) {
1731 		/* This means receive shutdown. */
1732 		if (err == 0)
1733 			err = -EINVAL;
1734 		goto out;
1735 	}
1736 
1737 	tsk = skb->sk;
1738 	skb_free_datagram(sk, skb);
1739 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1740 
1741 	/* attach accepted sock to socket */
1742 	unix_state_lock(tsk);
1743 	newsock->state = SS_CONNECTED;
1744 	unix_sock_inherit_flags(sock, newsock);
1745 	sock_graft(tsk, newsock);
1746 	unix_state_unlock(tsk);
1747 	return 0;
1748 
1749 out:
1750 	return err;
1751 }
1752 
1753 
1754 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1755 {
1756 	struct sock *sk = sock->sk;
1757 	struct unix_address *addr;
1758 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1759 	int err = 0;
1760 
1761 	if (peer) {
1762 		sk = unix_peer_get(sk);
1763 
1764 		err = -ENOTCONN;
1765 		if (!sk)
1766 			goto out;
1767 		err = 0;
1768 	} else {
1769 		sock_hold(sk);
1770 	}
1771 
1772 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1773 	if (!addr) {
1774 		sunaddr->sun_family = AF_UNIX;
1775 		sunaddr->sun_path[0] = 0;
1776 		err = offsetof(struct sockaddr_un, sun_path);
1777 	} else {
1778 		err = addr->len;
1779 		memcpy(sunaddr, addr->name, addr->len);
1780 
1781 		if (peer)
1782 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1783 					       CGROUP_UNIX_GETPEERNAME);
1784 		else
1785 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1786 					       CGROUP_UNIX_GETSOCKNAME);
1787 	}
1788 	sock_put(sk);
1789 out:
1790 	return err;
1791 }
1792 
1793 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1794 {
1795 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1796 
1797 	/*
1798 	 * Garbage collection of unix sockets starts by selecting a set of
1799 	 * candidate sockets which have reference only from being in flight
1800 	 * (total_refs == inflight_refs).  This condition is checked once during
1801 	 * the candidate collection phase, and candidates are marked as such, so
1802 	 * that non-candidates can later be ignored.  While inflight_refs is
1803 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1804 	 * is an instantaneous decision.
1805 	 *
1806 	 * Once a candidate, however, the socket must not be reinstalled into a
1807 	 * file descriptor while the garbage collection is in progress.
1808 	 *
1809 	 * If the above conditions are met, then the directed graph of
1810 	 * candidates (*) does not change while unix_gc_lock is held.
1811 	 *
1812 	 * Any operations that changes the file count through file descriptors
1813 	 * (dup, close, sendmsg) does not change the graph since candidates are
1814 	 * not installed in fds.
1815 	 *
1816 	 * Dequeing a candidate via recvmsg would install it into an fd, but
1817 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1818 	 * serialized with garbage collection.
1819 	 *
1820 	 * MSG_PEEK is special in that it does not change the inflight count,
1821 	 * yet does install the socket into an fd.  The following lock/unlock
1822 	 * pair is to ensure serialization with garbage collection.  It must be
1823 	 * done between incrementing the file count and installing the file into
1824 	 * an fd.
1825 	 *
1826 	 * If garbage collection starts after the barrier provided by the
1827 	 * lock/unlock, then it will see the elevated refcount and not mark this
1828 	 * as a candidate.  If a garbage collection is already in progress
1829 	 * before the file count was incremented, then the lock/unlock pair will
1830 	 * ensure that garbage collection is finished before progressing to
1831 	 * installing the fd.
1832 	 *
1833 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1834 	 * which is on the queue of listening socket A.
1835 	 */
1836 	spin_lock(&unix_gc_lock);
1837 	spin_unlock(&unix_gc_lock);
1838 }
1839 
1840 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1841 {
1842 	int err = 0;
1843 
1844 	UNIXCB(skb).pid  = get_pid(scm->pid);
1845 	UNIXCB(skb).uid = scm->creds.uid;
1846 	UNIXCB(skb).gid = scm->creds.gid;
1847 	UNIXCB(skb).fp = NULL;
1848 	unix_get_secdata(scm, skb);
1849 	if (scm->fp && send_fds)
1850 		err = unix_attach_fds(scm, skb);
1851 
1852 	skb->destructor = unix_destruct_scm;
1853 	return err;
1854 }
1855 
1856 static bool unix_passcred_enabled(const struct socket *sock,
1857 				  const struct sock *other)
1858 {
1859 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1860 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1861 	       !other->sk_socket ||
1862 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1863 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1864 }
1865 
1866 /*
1867  * Some apps rely on write() giving SCM_CREDENTIALS
1868  * We include credentials if source or destination socket
1869  * asserted SOCK_PASSCRED.
1870  */
1871 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1872 			    const struct sock *other)
1873 {
1874 	if (UNIXCB(skb).pid)
1875 		return;
1876 	if (unix_passcred_enabled(sock, other)) {
1877 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1878 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1879 	}
1880 }
1881 
1882 static bool unix_skb_scm_eq(struct sk_buff *skb,
1883 			    struct scm_cookie *scm)
1884 {
1885 	return UNIXCB(skb).pid == scm->pid &&
1886 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1887 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1888 	       unix_secdata_eq(scm, skb);
1889 }
1890 
1891 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1892 {
1893 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1894 	struct unix_sock *u = unix_sk(sk);
1895 
1896 	if (unlikely(fp && fp->count))
1897 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1898 }
1899 
1900 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1901 {
1902 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1903 	struct unix_sock *u = unix_sk(sk);
1904 
1905 	if (unlikely(fp && fp->count))
1906 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1907 }
1908 
1909 /*
1910  *	Send AF_UNIX data.
1911  */
1912 
1913 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1914 			      size_t len)
1915 {
1916 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1917 	struct sock *sk = sock->sk, *other = NULL;
1918 	struct unix_sock *u = unix_sk(sk);
1919 	struct scm_cookie scm;
1920 	struct sk_buff *skb;
1921 	int data_len = 0;
1922 	int sk_locked;
1923 	long timeo;
1924 	int err;
1925 
1926 	err = scm_send(sock, msg, &scm, false);
1927 	if (err < 0)
1928 		return err;
1929 
1930 	wait_for_unix_gc(scm.fp);
1931 
1932 	err = -EOPNOTSUPP;
1933 	if (msg->msg_flags&MSG_OOB)
1934 		goto out;
1935 
1936 	if (msg->msg_namelen) {
1937 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1938 		if (err)
1939 			goto out;
1940 
1941 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1942 							    msg->msg_name,
1943 							    &msg->msg_namelen,
1944 							    NULL);
1945 		if (err)
1946 			goto out;
1947 	} else {
1948 		sunaddr = NULL;
1949 		err = -ENOTCONN;
1950 		other = unix_peer_get(sk);
1951 		if (!other)
1952 			goto out;
1953 	}
1954 
1955 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1956 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1957 		err = unix_autobind(sk);
1958 		if (err)
1959 			goto out;
1960 	}
1961 
1962 	err = -EMSGSIZE;
1963 	if (len > sk->sk_sndbuf - 32)
1964 		goto out;
1965 
1966 	if (len > SKB_MAX_ALLOC) {
1967 		data_len = min_t(size_t,
1968 				 len - SKB_MAX_ALLOC,
1969 				 MAX_SKB_FRAGS * PAGE_SIZE);
1970 		data_len = PAGE_ALIGN(data_len);
1971 
1972 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1973 	}
1974 
1975 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1976 				   msg->msg_flags & MSG_DONTWAIT, &err,
1977 				   PAGE_ALLOC_COSTLY_ORDER);
1978 	if (skb == NULL)
1979 		goto out;
1980 
1981 	err = unix_scm_to_skb(&scm, skb, true);
1982 	if (err < 0)
1983 		goto out_free;
1984 
1985 	skb_put(skb, len - data_len);
1986 	skb->data_len = data_len;
1987 	skb->len = len;
1988 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1989 	if (err)
1990 		goto out_free;
1991 
1992 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1993 
1994 restart:
1995 	if (!other) {
1996 		err = -ECONNRESET;
1997 		if (sunaddr == NULL)
1998 			goto out_free;
1999 
2000 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2001 					sk->sk_type);
2002 		if (IS_ERR(other)) {
2003 			err = PTR_ERR(other);
2004 			other = NULL;
2005 			goto out_free;
2006 		}
2007 	}
2008 
2009 	if (sk_filter(other, skb) < 0) {
2010 		/* Toss the packet but do not return any error to the sender */
2011 		err = len;
2012 		goto out_free;
2013 	}
2014 
2015 	sk_locked = 0;
2016 	unix_state_lock(other);
2017 restart_locked:
2018 	err = -EPERM;
2019 	if (!unix_may_send(sk, other))
2020 		goto out_unlock;
2021 
2022 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2023 		/*
2024 		 *	Check with 1003.1g - what should
2025 		 *	datagram error
2026 		 */
2027 		unix_state_unlock(other);
2028 		sock_put(other);
2029 
2030 		if (!sk_locked)
2031 			unix_state_lock(sk);
2032 
2033 		err = 0;
2034 		if (sk->sk_type == SOCK_SEQPACKET) {
2035 			/* We are here only when racing with unix_release_sock()
2036 			 * is clearing @other. Never change state to TCP_CLOSE
2037 			 * unlike SOCK_DGRAM wants.
2038 			 */
2039 			unix_state_unlock(sk);
2040 			err = -EPIPE;
2041 		} else if (unix_peer(sk) == other) {
2042 			unix_peer(sk) = NULL;
2043 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2044 
2045 			sk->sk_state = TCP_CLOSE;
2046 			unix_state_unlock(sk);
2047 
2048 			unix_dgram_disconnected(sk, other);
2049 			sock_put(other);
2050 			err = -ECONNREFUSED;
2051 		} else {
2052 			unix_state_unlock(sk);
2053 		}
2054 
2055 		other = NULL;
2056 		if (err)
2057 			goto out_free;
2058 		goto restart;
2059 	}
2060 
2061 	err = -EPIPE;
2062 	if (other->sk_shutdown & RCV_SHUTDOWN)
2063 		goto out_unlock;
2064 
2065 	if (sk->sk_type != SOCK_SEQPACKET) {
2066 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2067 		if (err)
2068 			goto out_unlock;
2069 	}
2070 
2071 	/* other == sk && unix_peer(other) != sk if
2072 	 * - unix_peer(sk) == NULL, destination address bound to sk
2073 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2074 	 */
2075 	if (other != sk &&
2076 	    unlikely(unix_peer(other) != sk &&
2077 	    unix_recvq_full_lockless(other))) {
2078 		if (timeo) {
2079 			timeo = unix_wait_for_peer(other, timeo);
2080 
2081 			err = sock_intr_errno(timeo);
2082 			if (signal_pending(current))
2083 				goto out_free;
2084 
2085 			goto restart;
2086 		}
2087 
2088 		if (!sk_locked) {
2089 			unix_state_unlock(other);
2090 			unix_state_double_lock(sk, other);
2091 		}
2092 
2093 		if (unix_peer(sk) != other ||
2094 		    unix_dgram_peer_wake_me(sk, other)) {
2095 			err = -EAGAIN;
2096 			sk_locked = 1;
2097 			goto out_unlock;
2098 		}
2099 
2100 		if (!sk_locked) {
2101 			sk_locked = 1;
2102 			goto restart_locked;
2103 		}
2104 	}
2105 
2106 	if (unlikely(sk_locked))
2107 		unix_state_unlock(sk);
2108 
2109 	if (sock_flag(other, SOCK_RCVTSTAMP))
2110 		__net_timestamp(skb);
2111 	maybe_add_creds(skb, sock, other);
2112 	scm_stat_add(other, skb);
2113 	skb_queue_tail(&other->sk_receive_queue, skb);
2114 	unix_state_unlock(other);
2115 	other->sk_data_ready(other);
2116 	sock_put(other);
2117 	scm_destroy(&scm);
2118 	return len;
2119 
2120 out_unlock:
2121 	if (sk_locked)
2122 		unix_state_unlock(sk);
2123 	unix_state_unlock(other);
2124 out_free:
2125 	kfree_skb(skb);
2126 out:
2127 	if (other)
2128 		sock_put(other);
2129 	scm_destroy(&scm);
2130 	return err;
2131 }
2132 
2133 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2134  * bytes, and a minimum of a full page.
2135  */
2136 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2137 
2138 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2139 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2140 		     struct scm_cookie *scm, bool fds_sent)
2141 {
2142 	struct unix_sock *ousk = unix_sk(other);
2143 	struct sk_buff *skb;
2144 	int err = 0;
2145 
2146 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2147 
2148 	if (!skb)
2149 		return err;
2150 
2151 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2152 	if (err < 0) {
2153 		kfree_skb(skb);
2154 		return err;
2155 	}
2156 	skb_put(skb, 1);
2157 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2158 
2159 	if (err) {
2160 		kfree_skb(skb);
2161 		return err;
2162 	}
2163 
2164 	unix_state_lock(other);
2165 
2166 	if (sock_flag(other, SOCK_DEAD) ||
2167 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2168 		unix_state_unlock(other);
2169 		kfree_skb(skb);
2170 		return -EPIPE;
2171 	}
2172 
2173 	maybe_add_creds(skb, sock, other);
2174 	skb_get(skb);
2175 
2176 	if (ousk->oob_skb)
2177 		consume_skb(ousk->oob_skb);
2178 
2179 	WRITE_ONCE(ousk->oob_skb, skb);
2180 
2181 	scm_stat_add(other, skb);
2182 	skb_queue_tail(&other->sk_receive_queue, skb);
2183 	sk_send_sigurg(other);
2184 	unix_state_unlock(other);
2185 	other->sk_data_ready(other);
2186 
2187 	return err;
2188 }
2189 #endif
2190 
2191 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2192 			       size_t len)
2193 {
2194 	struct sock *sk = sock->sk;
2195 	struct sock *other = NULL;
2196 	int err, size;
2197 	struct sk_buff *skb;
2198 	int sent = 0;
2199 	struct scm_cookie scm;
2200 	bool fds_sent = false;
2201 	int data_len;
2202 
2203 	err = scm_send(sock, msg, &scm, false);
2204 	if (err < 0)
2205 		return err;
2206 
2207 	wait_for_unix_gc(scm.fp);
2208 
2209 	err = -EOPNOTSUPP;
2210 	if (msg->msg_flags & MSG_OOB) {
2211 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2212 		if (len)
2213 			len--;
2214 		else
2215 #endif
2216 			goto out_err;
2217 	}
2218 
2219 	if (msg->msg_namelen) {
2220 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2221 		goto out_err;
2222 	} else {
2223 		err = -ENOTCONN;
2224 		other = unix_peer(sk);
2225 		if (!other)
2226 			goto out_err;
2227 	}
2228 
2229 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2230 		goto pipe_err;
2231 
2232 	while (sent < len) {
2233 		size = len - sent;
2234 
2235 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2236 			skb = sock_alloc_send_pskb(sk, 0, 0,
2237 						   msg->msg_flags & MSG_DONTWAIT,
2238 						   &err, 0);
2239 		} else {
2240 			/* Keep two messages in the pipe so it schedules better */
2241 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2242 
2243 			/* allow fallback to order-0 allocations */
2244 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2245 
2246 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2247 
2248 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2249 
2250 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2251 						   msg->msg_flags & MSG_DONTWAIT, &err,
2252 						   get_order(UNIX_SKB_FRAGS_SZ));
2253 		}
2254 		if (!skb)
2255 			goto out_err;
2256 
2257 		/* Only send the fds in the first buffer */
2258 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2259 		if (err < 0) {
2260 			kfree_skb(skb);
2261 			goto out_err;
2262 		}
2263 		fds_sent = true;
2264 
2265 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2266 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2267 						   sk->sk_allocation);
2268 			if (err < 0) {
2269 				kfree_skb(skb);
2270 				goto out_err;
2271 			}
2272 			size = err;
2273 			refcount_add(size, &sk->sk_wmem_alloc);
2274 		} else {
2275 			skb_put(skb, size - data_len);
2276 			skb->data_len = data_len;
2277 			skb->len = size;
2278 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2279 			if (err) {
2280 				kfree_skb(skb);
2281 				goto out_err;
2282 			}
2283 		}
2284 
2285 		unix_state_lock(other);
2286 
2287 		if (sock_flag(other, SOCK_DEAD) ||
2288 		    (other->sk_shutdown & RCV_SHUTDOWN))
2289 			goto pipe_err_free;
2290 
2291 		maybe_add_creds(skb, sock, other);
2292 		scm_stat_add(other, skb);
2293 		skb_queue_tail(&other->sk_receive_queue, skb);
2294 		unix_state_unlock(other);
2295 		other->sk_data_ready(other);
2296 		sent += size;
2297 	}
2298 
2299 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2300 	if (msg->msg_flags & MSG_OOB) {
2301 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2302 		if (err)
2303 			goto out_err;
2304 		sent++;
2305 	}
2306 #endif
2307 
2308 	scm_destroy(&scm);
2309 
2310 	return sent;
2311 
2312 pipe_err_free:
2313 	unix_state_unlock(other);
2314 	kfree_skb(skb);
2315 pipe_err:
2316 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2317 		send_sig(SIGPIPE, current, 0);
2318 	err = -EPIPE;
2319 out_err:
2320 	scm_destroy(&scm);
2321 	return sent ? : err;
2322 }
2323 
2324 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2325 				  size_t len)
2326 {
2327 	int err;
2328 	struct sock *sk = sock->sk;
2329 
2330 	err = sock_error(sk);
2331 	if (err)
2332 		return err;
2333 
2334 	if (sk->sk_state != TCP_ESTABLISHED)
2335 		return -ENOTCONN;
2336 
2337 	if (msg->msg_namelen)
2338 		msg->msg_namelen = 0;
2339 
2340 	return unix_dgram_sendmsg(sock, msg, len);
2341 }
2342 
2343 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2344 				  size_t size, int flags)
2345 {
2346 	struct sock *sk = sock->sk;
2347 
2348 	if (sk->sk_state != TCP_ESTABLISHED)
2349 		return -ENOTCONN;
2350 
2351 	return unix_dgram_recvmsg(sock, msg, size, flags);
2352 }
2353 
2354 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2355 {
2356 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2357 
2358 	if (addr) {
2359 		msg->msg_namelen = addr->len;
2360 		memcpy(msg->msg_name, addr->name, addr->len);
2361 	}
2362 }
2363 
2364 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2365 			 int flags)
2366 {
2367 	struct scm_cookie scm;
2368 	struct socket *sock = sk->sk_socket;
2369 	struct unix_sock *u = unix_sk(sk);
2370 	struct sk_buff *skb, *last;
2371 	long timeo;
2372 	int skip;
2373 	int err;
2374 
2375 	err = -EOPNOTSUPP;
2376 	if (flags&MSG_OOB)
2377 		goto out;
2378 
2379 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2380 
2381 	do {
2382 		mutex_lock(&u->iolock);
2383 
2384 		skip = sk_peek_offset(sk, flags);
2385 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2386 					      &skip, &err, &last);
2387 		if (skb) {
2388 			if (!(flags & MSG_PEEK))
2389 				scm_stat_del(sk, skb);
2390 			break;
2391 		}
2392 
2393 		mutex_unlock(&u->iolock);
2394 
2395 		if (err != -EAGAIN)
2396 			break;
2397 	} while (timeo &&
2398 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2399 					      &err, &timeo, last));
2400 
2401 	if (!skb) { /* implies iolock unlocked */
2402 		unix_state_lock(sk);
2403 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2404 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2405 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2406 			err = 0;
2407 		unix_state_unlock(sk);
2408 		goto out;
2409 	}
2410 
2411 	if (wq_has_sleeper(&u->peer_wait))
2412 		wake_up_interruptible_sync_poll(&u->peer_wait,
2413 						EPOLLOUT | EPOLLWRNORM |
2414 						EPOLLWRBAND);
2415 
2416 	if (msg->msg_name) {
2417 		unix_copy_addr(msg, skb->sk);
2418 
2419 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2420 						      msg->msg_name,
2421 						      &msg->msg_namelen);
2422 	}
2423 
2424 	if (size > skb->len - skip)
2425 		size = skb->len - skip;
2426 	else if (size < skb->len - skip)
2427 		msg->msg_flags |= MSG_TRUNC;
2428 
2429 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2430 	if (err)
2431 		goto out_free;
2432 
2433 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2434 		__sock_recv_timestamp(msg, sk, skb);
2435 
2436 	memset(&scm, 0, sizeof(scm));
2437 
2438 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2439 	unix_set_secdata(&scm, skb);
2440 
2441 	if (!(flags & MSG_PEEK)) {
2442 		if (UNIXCB(skb).fp)
2443 			unix_detach_fds(&scm, skb);
2444 
2445 		sk_peek_offset_bwd(sk, skb->len);
2446 	} else {
2447 		/* It is questionable: on PEEK we could:
2448 		   - do not return fds - good, but too simple 8)
2449 		   - return fds, and do not return them on read (old strategy,
2450 		     apparently wrong)
2451 		   - clone fds (I chose it for now, it is the most universal
2452 		     solution)
2453 
2454 		   POSIX 1003.1g does not actually define this clearly
2455 		   at all. POSIX 1003.1g doesn't define a lot of things
2456 		   clearly however!
2457 
2458 		*/
2459 
2460 		sk_peek_offset_fwd(sk, size);
2461 
2462 		if (UNIXCB(skb).fp)
2463 			unix_peek_fds(&scm, skb);
2464 	}
2465 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2466 
2467 	scm_recv_unix(sock, msg, &scm, flags);
2468 
2469 out_free:
2470 	skb_free_datagram(sk, skb);
2471 	mutex_unlock(&u->iolock);
2472 out:
2473 	return err;
2474 }
2475 
2476 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2477 			      int flags)
2478 {
2479 	struct sock *sk = sock->sk;
2480 
2481 #ifdef CONFIG_BPF_SYSCALL
2482 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2483 
2484 	if (prot != &unix_dgram_proto)
2485 		return prot->recvmsg(sk, msg, size, flags, NULL);
2486 #endif
2487 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2488 }
2489 
2490 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2491 {
2492 	struct unix_sock *u = unix_sk(sk);
2493 	struct sk_buff *skb;
2494 	int err;
2495 
2496 	mutex_lock(&u->iolock);
2497 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2498 	mutex_unlock(&u->iolock);
2499 	if (!skb)
2500 		return err;
2501 
2502 	return recv_actor(sk, skb);
2503 }
2504 
2505 /*
2506  *	Sleep until more data has arrived. But check for races..
2507  */
2508 static long unix_stream_data_wait(struct sock *sk, long timeo,
2509 				  struct sk_buff *last, unsigned int last_len,
2510 				  bool freezable)
2511 {
2512 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2513 	struct sk_buff *tail;
2514 	DEFINE_WAIT(wait);
2515 
2516 	unix_state_lock(sk);
2517 
2518 	for (;;) {
2519 		prepare_to_wait(sk_sleep(sk), &wait, state);
2520 
2521 		tail = skb_peek_tail(&sk->sk_receive_queue);
2522 		if (tail != last ||
2523 		    (tail && tail->len != last_len) ||
2524 		    sk->sk_err ||
2525 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2526 		    signal_pending(current) ||
2527 		    !timeo)
2528 			break;
2529 
2530 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2531 		unix_state_unlock(sk);
2532 		timeo = schedule_timeout(timeo);
2533 		unix_state_lock(sk);
2534 
2535 		if (sock_flag(sk, SOCK_DEAD))
2536 			break;
2537 
2538 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2539 	}
2540 
2541 	finish_wait(sk_sleep(sk), &wait);
2542 	unix_state_unlock(sk);
2543 	return timeo;
2544 }
2545 
2546 static unsigned int unix_skb_len(const struct sk_buff *skb)
2547 {
2548 	return skb->len - UNIXCB(skb).consumed;
2549 }
2550 
2551 struct unix_stream_read_state {
2552 	int (*recv_actor)(struct sk_buff *, int, int,
2553 			  struct unix_stream_read_state *);
2554 	struct socket *socket;
2555 	struct msghdr *msg;
2556 	struct pipe_inode_info *pipe;
2557 	size_t size;
2558 	int flags;
2559 	unsigned int splice_flags;
2560 };
2561 
2562 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2563 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2564 {
2565 	struct socket *sock = state->socket;
2566 	struct sock *sk = sock->sk;
2567 	struct unix_sock *u = unix_sk(sk);
2568 	int chunk = 1;
2569 	struct sk_buff *oob_skb;
2570 
2571 	mutex_lock(&u->iolock);
2572 	unix_state_lock(sk);
2573 
2574 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2575 		unix_state_unlock(sk);
2576 		mutex_unlock(&u->iolock);
2577 		return -EINVAL;
2578 	}
2579 
2580 	oob_skb = u->oob_skb;
2581 
2582 	if (!(state->flags & MSG_PEEK))
2583 		WRITE_ONCE(u->oob_skb, NULL);
2584 	else
2585 		skb_get(oob_skb);
2586 	unix_state_unlock(sk);
2587 
2588 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2589 
2590 	if (!(state->flags & MSG_PEEK))
2591 		UNIXCB(oob_skb).consumed += 1;
2592 
2593 	consume_skb(oob_skb);
2594 
2595 	mutex_unlock(&u->iolock);
2596 
2597 	if (chunk < 0)
2598 		return -EFAULT;
2599 
2600 	state->msg->msg_flags |= MSG_OOB;
2601 	return 1;
2602 }
2603 
2604 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2605 				  int flags, int copied)
2606 {
2607 	struct unix_sock *u = unix_sk(sk);
2608 
2609 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2610 		skb_unlink(skb, &sk->sk_receive_queue);
2611 		consume_skb(skb);
2612 		skb = NULL;
2613 	} else {
2614 		if (skb == u->oob_skb) {
2615 			if (copied) {
2616 				skb = NULL;
2617 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2618 				if (!(flags & MSG_PEEK)) {
2619 					WRITE_ONCE(u->oob_skb, NULL);
2620 					consume_skb(skb);
2621 				}
2622 			} else if (!(flags & MSG_PEEK)) {
2623 				skb_unlink(skb, &sk->sk_receive_queue);
2624 				consume_skb(skb);
2625 				skb = skb_peek(&sk->sk_receive_queue);
2626 			}
2627 		}
2628 	}
2629 	return skb;
2630 }
2631 #endif
2632 
2633 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2634 {
2635 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2636 		return -ENOTCONN;
2637 
2638 	return unix_read_skb(sk, recv_actor);
2639 }
2640 
2641 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2642 				    bool freezable)
2643 {
2644 	struct scm_cookie scm;
2645 	struct socket *sock = state->socket;
2646 	struct sock *sk = sock->sk;
2647 	struct unix_sock *u = unix_sk(sk);
2648 	int copied = 0;
2649 	int flags = state->flags;
2650 	int noblock = flags & MSG_DONTWAIT;
2651 	bool check_creds = false;
2652 	int target;
2653 	int err = 0;
2654 	long timeo;
2655 	int skip;
2656 	size_t size = state->size;
2657 	unsigned int last_len;
2658 
2659 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2660 		err = -EINVAL;
2661 		goto out;
2662 	}
2663 
2664 	if (unlikely(flags & MSG_OOB)) {
2665 		err = -EOPNOTSUPP;
2666 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2667 		err = unix_stream_recv_urg(state);
2668 #endif
2669 		goto out;
2670 	}
2671 
2672 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2673 	timeo = sock_rcvtimeo(sk, noblock);
2674 
2675 	memset(&scm, 0, sizeof(scm));
2676 
2677 	/* Lock the socket to prevent queue disordering
2678 	 * while sleeps in memcpy_tomsg
2679 	 */
2680 	mutex_lock(&u->iolock);
2681 
2682 	skip = max(sk_peek_offset(sk, flags), 0);
2683 
2684 	do {
2685 		int chunk;
2686 		bool drop_skb;
2687 		struct sk_buff *skb, *last;
2688 
2689 redo:
2690 		unix_state_lock(sk);
2691 		if (sock_flag(sk, SOCK_DEAD)) {
2692 			err = -ECONNRESET;
2693 			goto unlock;
2694 		}
2695 		last = skb = skb_peek(&sk->sk_receive_queue);
2696 		last_len = last ? last->len : 0;
2697 
2698 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2699 		if (skb) {
2700 			skb = manage_oob(skb, sk, flags, copied);
2701 			if (!skb) {
2702 				unix_state_unlock(sk);
2703 				if (copied)
2704 					break;
2705 				goto redo;
2706 			}
2707 		}
2708 #endif
2709 again:
2710 		if (skb == NULL) {
2711 			if (copied >= target)
2712 				goto unlock;
2713 
2714 			/*
2715 			 *	POSIX 1003.1g mandates this order.
2716 			 */
2717 
2718 			err = sock_error(sk);
2719 			if (err)
2720 				goto unlock;
2721 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2722 				goto unlock;
2723 
2724 			unix_state_unlock(sk);
2725 			if (!timeo) {
2726 				err = -EAGAIN;
2727 				break;
2728 			}
2729 
2730 			mutex_unlock(&u->iolock);
2731 
2732 			timeo = unix_stream_data_wait(sk, timeo, last,
2733 						      last_len, freezable);
2734 
2735 			if (signal_pending(current)) {
2736 				err = sock_intr_errno(timeo);
2737 				scm_destroy(&scm);
2738 				goto out;
2739 			}
2740 
2741 			mutex_lock(&u->iolock);
2742 			goto redo;
2743 unlock:
2744 			unix_state_unlock(sk);
2745 			break;
2746 		}
2747 
2748 		while (skip >= unix_skb_len(skb)) {
2749 			skip -= unix_skb_len(skb);
2750 			last = skb;
2751 			last_len = skb->len;
2752 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2753 			if (!skb)
2754 				goto again;
2755 		}
2756 
2757 		unix_state_unlock(sk);
2758 
2759 		if (check_creds) {
2760 			/* Never glue messages from different writers */
2761 			if (!unix_skb_scm_eq(skb, &scm))
2762 				break;
2763 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2764 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2765 			/* Copy credentials */
2766 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2767 			unix_set_secdata(&scm, skb);
2768 			check_creds = true;
2769 		}
2770 
2771 		/* Copy address just once */
2772 		if (state->msg && state->msg->msg_name) {
2773 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2774 					 state->msg->msg_name);
2775 			unix_copy_addr(state->msg, skb->sk);
2776 
2777 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2778 							      state->msg->msg_name,
2779 							      &state->msg->msg_namelen);
2780 
2781 			sunaddr = NULL;
2782 		}
2783 
2784 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2785 		skb_get(skb);
2786 		chunk = state->recv_actor(skb, skip, chunk, state);
2787 		drop_skb = !unix_skb_len(skb);
2788 		/* skb is only safe to use if !drop_skb */
2789 		consume_skb(skb);
2790 		if (chunk < 0) {
2791 			if (copied == 0)
2792 				copied = -EFAULT;
2793 			break;
2794 		}
2795 		copied += chunk;
2796 		size -= chunk;
2797 
2798 		if (drop_skb) {
2799 			/* the skb was touched by a concurrent reader;
2800 			 * we should not expect anything from this skb
2801 			 * anymore and assume it invalid - we can be
2802 			 * sure it was dropped from the socket queue
2803 			 *
2804 			 * let's report a short read
2805 			 */
2806 			err = 0;
2807 			break;
2808 		}
2809 
2810 		/* Mark read part of skb as used */
2811 		if (!(flags & MSG_PEEK)) {
2812 			UNIXCB(skb).consumed += chunk;
2813 
2814 			sk_peek_offset_bwd(sk, chunk);
2815 
2816 			if (UNIXCB(skb).fp) {
2817 				scm_stat_del(sk, skb);
2818 				unix_detach_fds(&scm, skb);
2819 			}
2820 
2821 			if (unix_skb_len(skb))
2822 				break;
2823 
2824 			skb_unlink(skb, &sk->sk_receive_queue);
2825 			consume_skb(skb);
2826 
2827 			if (scm.fp)
2828 				break;
2829 		} else {
2830 			/* It is questionable, see note in unix_dgram_recvmsg.
2831 			 */
2832 			if (UNIXCB(skb).fp)
2833 				unix_peek_fds(&scm, skb);
2834 
2835 			sk_peek_offset_fwd(sk, chunk);
2836 
2837 			if (UNIXCB(skb).fp)
2838 				break;
2839 
2840 			skip = 0;
2841 			last = skb;
2842 			last_len = skb->len;
2843 			unix_state_lock(sk);
2844 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2845 			if (skb)
2846 				goto again;
2847 			unix_state_unlock(sk);
2848 			break;
2849 		}
2850 	} while (size);
2851 
2852 	mutex_unlock(&u->iolock);
2853 	if (state->msg)
2854 		scm_recv_unix(sock, state->msg, &scm, flags);
2855 	else
2856 		scm_destroy(&scm);
2857 out:
2858 	return copied ? : err;
2859 }
2860 
2861 static int unix_stream_read_actor(struct sk_buff *skb,
2862 				  int skip, int chunk,
2863 				  struct unix_stream_read_state *state)
2864 {
2865 	int ret;
2866 
2867 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2868 				    state->msg, chunk);
2869 	return ret ?: chunk;
2870 }
2871 
2872 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2873 			  size_t size, int flags)
2874 {
2875 	struct unix_stream_read_state state = {
2876 		.recv_actor = unix_stream_read_actor,
2877 		.socket = sk->sk_socket,
2878 		.msg = msg,
2879 		.size = size,
2880 		.flags = flags
2881 	};
2882 
2883 	return unix_stream_read_generic(&state, true);
2884 }
2885 
2886 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2887 			       size_t size, int flags)
2888 {
2889 	struct unix_stream_read_state state = {
2890 		.recv_actor = unix_stream_read_actor,
2891 		.socket = sock,
2892 		.msg = msg,
2893 		.size = size,
2894 		.flags = flags
2895 	};
2896 
2897 #ifdef CONFIG_BPF_SYSCALL
2898 	struct sock *sk = sock->sk;
2899 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2900 
2901 	if (prot != &unix_stream_proto)
2902 		return prot->recvmsg(sk, msg, size, flags, NULL);
2903 #endif
2904 	return unix_stream_read_generic(&state, true);
2905 }
2906 
2907 static int unix_stream_splice_actor(struct sk_buff *skb,
2908 				    int skip, int chunk,
2909 				    struct unix_stream_read_state *state)
2910 {
2911 	return skb_splice_bits(skb, state->socket->sk,
2912 			       UNIXCB(skb).consumed + skip,
2913 			       state->pipe, chunk, state->splice_flags);
2914 }
2915 
2916 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2917 				       struct pipe_inode_info *pipe,
2918 				       size_t size, unsigned int flags)
2919 {
2920 	struct unix_stream_read_state state = {
2921 		.recv_actor = unix_stream_splice_actor,
2922 		.socket = sock,
2923 		.pipe = pipe,
2924 		.size = size,
2925 		.splice_flags = flags,
2926 	};
2927 
2928 	if (unlikely(*ppos))
2929 		return -ESPIPE;
2930 
2931 	if (sock->file->f_flags & O_NONBLOCK ||
2932 	    flags & SPLICE_F_NONBLOCK)
2933 		state.flags = MSG_DONTWAIT;
2934 
2935 	return unix_stream_read_generic(&state, false);
2936 }
2937 
2938 static int unix_shutdown(struct socket *sock, int mode)
2939 {
2940 	struct sock *sk = sock->sk;
2941 	struct sock *other;
2942 
2943 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2944 		return -EINVAL;
2945 	/* This maps:
2946 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2947 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2948 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2949 	 */
2950 	++mode;
2951 
2952 	unix_state_lock(sk);
2953 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2954 	other = unix_peer(sk);
2955 	if (other)
2956 		sock_hold(other);
2957 	unix_state_unlock(sk);
2958 	sk->sk_state_change(sk);
2959 
2960 	if (other &&
2961 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2962 
2963 		int peer_mode = 0;
2964 		const struct proto *prot = READ_ONCE(other->sk_prot);
2965 
2966 		if (prot->unhash)
2967 			prot->unhash(other);
2968 		if (mode&RCV_SHUTDOWN)
2969 			peer_mode |= SEND_SHUTDOWN;
2970 		if (mode&SEND_SHUTDOWN)
2971 			peer_mode |= RCV_SHUTDOWN;
2972 		unix_state_lock(other);
2973 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2974 		unix_state_unlock(other);
2975 		other->sk_state_change(other);
2976 		if (peer_mode == SHUTDOWN_MASK)
2977 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2978 		else if (peer_mode & RCV_SHUTDOWN)
2979 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2980 	}
2981 	if (other)
2982 		sock_put(other);
2983 
2984 	return 0;
2985 }
2986 
2987 long unix_inq_len(struct sock *sk)
2988 {
2989 	struct sk_buff *skb;
2990 	long amount = 0;
2991 
2992 	if (sk->sk_state == TCP_LISTEN)
2993 		return -EINVAL;
2994 
2995 	spin_lock(&sk->sk_receive_queue.lock);
2996 	if (sk->sk_type == SOCK_STREAM ||
2997 	    sk->sk_type == SOCK_SEQPACKET) {
2998 		skb_queue_walk(&sk->sk_receive_queue, skb)
2999 			amount += unix_skb_len(skb);
3000 	} else {
3001 		skb = skb_peek(&sk->sk_receive_queue);
3002 		if (skb)
3003 			amount = skb->len;
3004 	}
3005 	spin_unlock(&sk->sk_receive_queue.lock);
3006 
3007 	return amount;
3008 }
3009 EXPORT_SYMBOL_GPL(unix_inq_len);
3010 
3011 long unix_outq_len(struct sock *sk)
3012 {
3013 	return sk_wmem_alloc_get(sk);
3014 }
3015 EXPORT_SYMBOL_GPL(unix_outq_len);
3016 
3017 static int unix_open_file(struct sock *sk)
3018 {
3019 	struct path path;
3020 	struct file *f;
3021 	int fd;
3022 
3023 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3024 		return -EPERM;
3025 
3026 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3027 		return -ENOENT;
3028 
3029 	path = unix_sk(sk)->path;
3030 	if (!path.dentry)
3031 		return -ENOENT;
3032 
3033 	path_get(&path);
3034 
3035 	fd = get_unused_fd_flags(O_CLOEXEC);
3036 	if (fd < 0)
3037 		goto out;
3038 
3039 	f = dentry_open(&path, O_PATH, current_cred());
3040 	if (IS_ERR(f)) {
3041 		put_unused_fd(fd);
3042 		fd = PTR_ERR(f);
3043 		goto out;
3044 	}
3045 
3046 	fd_install(fd, f);
3047 out:
3048 	path_put(&path);
3049 
3050 	return fd;
3051 }
3052 
3053 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3054 {
3055 	struct sock *sk = sock->sk;
3056 	long amount = 0;
3057 	int err;
3058 
3059 	switch (cmd) {
3060 	case SIOCOUTQ:
3061 		amount = unix_outq_len(sk);
3062 		err = put_user(amount, (int __user *)arg);
3063 		break;
3064 	case SIOCINQ:
3065 		amount = unix_inq_len(sk);
3066 		if (amount < 0)
3067 			err = amount;
3068 		else
3069 			err = put_user(amount, (int __user *)arg);
3070 		break;
3071 	case SIOCUNIXFILE:
3072 		err = unix_open_file(sk);
3073 		break;
3074 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3075 	case SIOCATMARK:
3076 		{
3077 			struct sk_buff *skb;
3078 			int answ = 0;
3079 
3080 			skb = skb_peek(&sk->sk_receive_queue);
3081 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3082 				answ = 1;
3083 			err = put_user(answ, (int __user *)arg);
3084 		}
3085 		break;
3086 #endif
3087 	default:
3088 		err = -ENOIOCTLCMD;
3089 		break;
3090 	}
3091 	return err;
3092 }
3093 
3094 #ifdef CONFIG_COMPAT
3095 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3096 {
3097 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3098 }
3099 #endif
3100 
3101 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3102 {
3103 	struct sock *sk = sock->sk;
3104 	__poll_t mask;
3105 	u8 shutdown;
3106 
3107 	sock_poll_wait(file, sock, wait);
3108 	mask = 0;
3109 	shutdown = READ_ONCE(sk->sk_shutdown);
3110 
3111 	/* exceptional events? */
3112 	if (READ_ONCE(sk->sk_err))
3113 		mask |= EPOLLERR;
3114 	if (shutdown == SHUTDOWN_MASK)
3115 		mask |= EPOLLHUP;
3116 	if (shutdown & RCV_SHUTDOWN)
3117 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3118 
3119 	/* readable? */
3120 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3121 		mask |= EPOLLIN | EPOLLRDNORM;
3122 	if (sk_is_readable(sk))
3123 		mask |= EPOLLIN | EPOLLRDNORM;
3124 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3125 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3126 		mask |= EPOLLPRI;
3127 #endif
3128 
3129 	/* Connection-based need to check for termination and startup */
3130 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3131 	    sk->sk_state == TCP_CLOSE)
3132 		mask |= EPOLLHUP;
3133 
3134 	/*
3135 	 * we set writable also when the other side has shut down the
3136 	 * connection. This prevents stuck sockets.
3137 	 */
3138 	if (unix_writable(sk))
3139 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3140 
3141 	return mask;
3142 }
3143 
3144 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3145 				    poll_table *wait)
3146 {
3147 	struct sock *sk = sock->sk, *other;
3148 	unsigned int writable;
3149 	__poll_t mask;
3150 	u8 shutdown;
3151 
3152 	sock_poll_wait(file, sock, wait);
3153 	mask = 0;
3154 	shutdown = READ_ONCE(sk->sk_shutdown);
3155 
3156 	/* exceptional events? */
3157 	if (READ_ONCE(sk->sk_err) ||
3158 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3159 		mask |= EPOLLERR |
3160 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3161 
3162 	if (shutdown & RCV_SHUTDOWN)
3163 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3164 	if (shutdown == SHUTDOWN_MASK)
3165 		mask |= EPOLLHUP;
3166 
3167 	/* readable? */
3168 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3169 		mask |= EPOLLIN | EPOLLRDNORM;
3170 	if (sk_is_readable(sk))
3171 		mask |= EPOLLIN | EPOLLRDNORM;
3172 
3173 	/* Connection-based need to check for termination and startup */
3174 	if (sk->sk_type == SOCK_SEQPACKET) {
3175 		if (sk->sk_state == TCP_CLOSE)
3176 			mask |= EPOLLHUP;
3177 		/* connection hasn't started yet? */
3178 		if (sk->sk_state == TCP_SYN_SENT)
3179 			return mask;
3180 	}
3181 
3182 	/* No write status requested, avoid expensive OUT tests. */
3183 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3184 		return mask;
3185 
3186 	writable = unix_writable(sk);
3187 	if (writable) {
3188 		unix_state_lock(sk);
3189 
3190 		other = unix_peer(sk);
3191 		if (other && unix_peer(other) != sk &&
3192 		    unix_recvq_full_lockless(other) &&
3193 		    unix_dgram_peer_wake_me(sk, other))
3194 			writable = 0;
3195 
3196 		unix_state_unlock(sk);
3197 	}
3198 
3199 	if (writable)
3200 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3201 	else
3202 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3203 
3204 	return mask;
3205 }
3206 
3207 #ifdef CONFIG_PROC_FS
3208 
3209 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3210 
3211 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3212 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3213 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3214 
3215 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3216 {
3217 	unsigned long offset = get_offset(*pos);
3218 	unsigned long bucket = get_bucket(*pos);
3219 	unsigned long count = 0;
3220 	struct sock *sk;
3221 
3222 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3223 	     sk; sk = sk_next(sk)) {
3224 		if (++count == offset)
3225 			break;
3226 	}
3227 
3228 	return sk;
3229 }
3230 
3231 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3232 {
3233 	unsigned long bucket = get_bucket(*pos);
3234 	struct net *net = seq_file_net(seq);
3235 	struct sock *sk;
3236 
3237 	while (bucket < UNIX_HASH_SIZE) {
3238 		spin_lock(&net->unx.table.locks[bucket]);
3239 
3240 		sk = unix_from_bucket(seq, pos);
3241 		if (sk)
3242 			return sk;
3243 
3244 		spin_unlock(&net->unx.table.locks[bucket]);
3245 
3246 		*pos = set_bucket_offset(++bucket, 1);
3247 	}
3248 
3249 	return NULL;
3250 }
3251 
3252 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3253 				  loff_t *pos)
3254 {
3255 	unsigned long bucket = get_bucket(*pos);
3256 
3257 	sk = sk_next(sk);
3258 	if (sk)
3259 		return sk;
3260 
3261 
3262 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3263 
3264 	*pos = set_bucket_offset(++bucket, 1);
3265 
3266 	return unix_get_first(seq, pos);
3267 }
3268 
3269 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3270 {
3271 	if (!*pos)
3272 		return SEQ_START_TOKEN;
3273 
3274 	return unix_get_first(seq, pos);
3275 }
3276 
3277 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3278 {
3279 	++*pos;
3280 
3281 	if (v == SEQ_START_TOKEN)
3282 		return unix_get_first(seq, pos);
3283 
3284 	return unix_get_next(seq, v, pos);
3285 }
3286 
3287 static void unix_seq_stop(struct seq_file *seq, void *v)
3288 {
3289 	struct sock *sk = v;
3290 
3291 	if (sk)
3292 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3293 }
3294 
3295 static int unix_seq_show(struct seq_file *seq, void *v)
3296 {
3297 
3298 	if (v == SEQ_START_TOKEN)
3299 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3300 			 "Inode Path\n");
3301 	else {
3302 		struct sock *s = v;
3303 		struct unix_sock *u = unix_sk(s);
3304 		unix_state_lock(s);
3305 
3306 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3307 			s,
3308 			refcount_read(&s->sk_refcnt),
3309 			0,
3310 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3311 			s->sk_type,
3312 			s->sk_socket ?
3313 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3314 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3315 			sock_i_ino(s));
3316 
3317 		if (u->addr) {	// under a hash table lock here
3318 			int i, len;
3319 			seq_putc(seq, ' ');
3320 
3321 			i = 0;
3322 			len = u->addr->len -
3323 				offsetof(struct sockaddr_un, sun_path);
3324 			if (u->addr->name->sun_path[0]) {
3325 				len--;
3326 			} else {
3327 				seq_putc(seq, '@');
3328 				i++;
3329 			}
3330 			for ( ; i < len; i++)
3331 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3332 					 '@');
3333 		}
3334 		unix_state_unlock(s);
3335 		seq_putc(seq, '\n');
3336 	}
3337 
3338 	return 0;
3339 }
3340 
3341 static const struct seq_operations unix_seq_ops = {
3342 	.start  = unix_seq_start,
3343 	.next   = unix_seq_next,
3344 	.stop   = unix_seq_stop,
3345 	.show   = unix_seq_show,
3346 };
3347 
3348 #ifdef CONFIG_BPF_SYSCALL
3349 struct bpf_unix_iter_state {
3350 	struct seq_net_private p;
3351 	unsigned int cur_sk;
3352 	unsigned int end_sk;
3353 	unsigned int max_sk;
3354 	struct sock **batch;
3355 	bool st_bucket_done;
3356 };
3357 
3358 struct bpf_iter__unix {
3359 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3360 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3361 	uid_t uid __aligned(8);
3362 };
3363 
3364 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3365 			      struct unix_sock *unix_sk, uid_t uid)
3366 {
3367 	struct bpf_iter__unix ctx;
3368 
3369 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3370 	ctx.meta = meta;
3371 	ctx.unix_sk = unix_sk;
3372 	ctx.uid = uid;
3373 	return bpf_iter_run_prog(prog, &ctx);
3374 }
3375 
3376 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3377 
3378 {
3379 	struct bpf_unix_iter_state *iter = seq->private;
3380 	unsigned int expected = 1;
3381 	struct sock *sk;
3382 
3383 	sock_hold(start_sk);
3384 	iter->batch[iter->end_sk++] = start_sk;
3385 
3386 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3387 		if (iter->end_sk < iter->max_sk) {
3388 			sock_hold(sk);
3389 			iter->batch[iter->end_sk++] = sk;
3390 		}
3391 
3392 		expected++;
3393 	}
3394 
3395 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3396 
3397 	return expected;
3398 }
3399 
3400 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3401 {
3402 	while (iter->cur_sk < iter->end_sk)
3403 		sock_put(iter->batch[iter->cur_sk++]);
3404 }
3405 
3406 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3407 				       unsigned int new_batch_sz)
3408 {
3409 	struct sock **new_batch;
3410 
3411 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3412 			     GFP_USER | __GFP_NOWARN);
3413 	if (!new_batch)
3414 		return -ENOMEM;
3415 
3416 	bpf_iter_unix_put_batch(iter);
3417 	kvfree(iter->batch);
3418 	iter->batch = new_batch;
3419 	iter->max_sk = new_batch_sz;
3420 
3421 	return 0;
3422 }
3423 
3424 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3425 					loff_t *pos)
3426 {
3427 	struct bpf_unix_iter_state *iter = seq->private;
3428 	unsigned int expected;
3429 	bool resized = false;
3430 	struct sock *sk;
3431 
3432 	if (iter->st_bucket_done)
3433 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3434 
3435 again:
3436 	/* Get a new batch */
3437 	iter->cur_sk = 0;
3438 	iter->end_sk = 0;
3439 
3440 	sk = unix_get_first(seq, pos);
3441 	if (!sk)
3442 		return NULL; /* Done */
3443 
3444 	expected = bpf_iter_unix_hold_batch(seq, sk);
3445 
3446 	if (iter->end_sk == expected) {
3447 		iter->st_bucket_done = true;
3448 		return sk;
3449 	}
3450 
3451 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3452 		resized = true;
3453 		goto again;
3454 	}
3455 
3456 	return sk;
3457 }
3458 
3459 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3460 {
3461 	if (!*pos)
3462 		return SEQ_START_TOKEN;
3463 
3464 	/* bpf iter does not support lseek, so it always
3465 	 * continue from where it was stop()-ped.
3466 	 */
3467 	return bpf_iter_unix_batch(seq, pos);
3468 }
3469 
3470 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3471 {
3472 	struct bpf_unix_iter_state *iter = seq->private;
3473 	struct sock *sk;
3474 
3475 	/* Whenever seq_next() is called, the iter->cur_sk is
3476 	 * done with seq_show(), so advance to the next sk in
3477 	 * the batch.
3478 	 */
3479 	if (iter->cur_sk < iter->end_sk)
3480 		sock_put(iter->batch[iter->cur_sk++]);
3481 
3482 	++*pos;
3483 
3484 	if (iter->cur_sk < iter->end_sk)
3485 		sk = iter->batch[iter->cur_sk];
3486 	else
3487 		sk = bpf_iter_unix_batch(seq, pos);
3488 
3489 	return sk;
3490 }
3491 
3492 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3493 {
3494 	struct bpf_iter_meta meta;
3495 	struct bpf_prog *prog;
3496 	struct sock *sk = v;
3497 	uid_t uid;
3498 	bool slow;
3499 	int ret;
3500 
3501 	if (v == SEQ_START_TOKEN)
3502 		return 0;
3503 
3504 	slow = lock_sock_fast(sk);
3505 
3506 	if (unlikely(sk_unhashed(sk))) {
3507 		ret = SEQ_SKIP;
3508 		goto unlock;
3509 	}
3510 
3511 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3512 	meta.seq = seq;
3513 	prog = bpf_iter_get_info(&meta, false);
3514 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3515 unlock:
3516 	unlock_sock_fast(sk, slow);
3517 	return ret;
3518 }
3519 
3520 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3521 {
3522 	struct bpf_unix_iter_state *iter = seq->private;
3523 	struct bpf_iter_meta meta;
3524 	struct bpf_prog *prog;
3525 
3526 	if (!v) {
3527 		meta.seq = seq;
3528 		prog = bpf_iter_get_info(&meta, true);
3529 		if (prog)
3530 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3531 	}
3532 
3533 	if (iter->cur_sk < iter->end_sk)
3534 		bpf_iter_unix_put_batch(iter);
3535 }
3536 
3537 static const struct seq_operations bpf_iter_unix_seq_ops = {
3538 	.start	= bpf_iter_unix_seq_start,
3539 	.next	= bpf_iter_unix_seq_next,
3540 	.stop	= bpf_iter_unix_seq_stop,
3541 	.show	= bpf_iter_unix_seq_show,
3542 };
3543 #endif
3544 #endif
3545 
3546 static const struct net_proto_family unix_family_ops = {
3547 	.family = PF_UNIX,
3548 	.create = unix_create,
3549 	.owner	= THIS_MODULE,
3550 };
3551 
3552 
3553 static int __net_init unix_net_init(struct net *net)
3554 {
3555 	int i;
3556 
3557 	net->unx.sysctl_max_dgram_qlen = 10;
3558 	if (unix_sysctl_register(net))
3559 		goto out;
3560 
3561 #ifdef CONFIG_PROC_FS
3562 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3563 			     sizeof(struct seq_net_private)))
3564 		goto err_sysctl;
3565 #endif
3566 
3567 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3568 					      sizeof(spinlock_t), GFP_KERNEL);
3569 	if (!net->unx.table.locks)
3570 		goto err_proc;
3571 
3572 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3573 						sizeof(struct hlist_head),
3574 						GFP_KERNEL);
3575 	if (!net->unx.table.buckets)
3576 		goto free_locks;
3577 
3578 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3579 		spin_lock_init(&net->unx.table.locks[i]);
3580 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3581 	}
3582 
3583 	return 0;
3584 
3585 free_locks:
3586 	kvfree(net->unx.table.locks);
3587 err_proc:
3588 #ifdef CONFIG_PROC_FS
3589 	remove_proc_entry("unix", net->proc_net);
3590 err_sysctl:
3591 #endif
3592 	unix_sysctl_unregister(net);
3593 out:
3594 	return -ENOMEM;
3595 }
3596 
3597 static void __net_exit unix_net_exit(struct net *net)
3598 {
3599 	kvfree(net->unx.table.buckets);
3600 	kvfree(net->unx.table.locks);
3601 	unix_sysctl_unregister(net);
3602 	remove_proc_entry("unix", net->proc_net);
3603 }
3604 
3605 static struct pernet_operations unix_net_ops = {
3606 	.init = unix_net_init,
3607 	.exit = unix_net_exit,
3608 };
3609 
3610 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3611 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3612 		     struct unix_sock *unix_sk, uid_t uid)
3613 
3614 #define INIT_BATCH_SZ 16
3615 
3616 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3617 {
3618 	struct bpf_unix_iter_state *iter = priv_data;
3619 	int err;
3620 
3621 	err = bpf_iter_init_seq_net(priv_data, aux);
3622 	if (err)
3623 		return err;
3624 
3625 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3626 	if (err) {
3627 		bpf_iter_fini_seq_net(priv_data);
3628 		return err;
3629 	}
3630 
3631 	return 0;
3632 }
3633 
3634 static void bpf_iter_fini_unix(void *priv_data)
3635 {
3636 	struct bpf_unix_iter_state *iter = priv_data;
3637 
3638 	bpf_iter_fini_seq_net(priv_data);
3639 	kvfree(iter->batch);
3640 }
3641 
3642 static const struct bpf_iter_seq_info unix_seq_info = {
3643 	.seq_ops		= &bpf_iter_unix_seq_ops,
3644 	.init_seq_private	= bpf_iter_init_unix,
3645 	.fini_seq_private	= bpf_iter_fini_unix,
3646 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3647 };
3648 
3649 static const struct bpf_func_proto *
3650 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3651 			     const struct bpf_prog *prog)
3652 {
3653 	switch (func_id) {
3654 	case BPF_FUNC_setsockopt:
3655 		return &bpf_sk_setsockopt_proto;
3656 	case BPF_FUNC_getsockopt:
3657 		return &bpf_sk_getsockopt_proto;
3658 	default:
3659 		return NULL;
3660 	}
3661 }
3662 
3663 static struct bpf_iter_reg unix_reg_info = {
3664 	.target			= "unix",
3665 	.ctx_arg_info_size	= 1,
3666 	.ctx_arg_info		= {
3667 		{ offsetof(struct bpf_iter__unix, unix_sk),
3668 		  PTR_TO_BTF_ID_OR_NULL },
3669 	},
3670 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3671 	.seq_info		= &unix_seq_info,
3672 };
3673 
3674 static void __init bpf_iter_register(void)
3675 {
3676 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3677 	if (bpf_iter_reg_target(&unix_reg_info))
3678 		pr_warn("Warning: could not register bpf iterator unix\n");
3679 }
3680 #endif
3681 
3682 static int __init af_unix_init(void)
3683 {
3684 	int i, rc = -1;
3685 
3686 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3687 
3688 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3689 		spin_lock_init(&bsd_socket_locks[i]);
3690 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3691 	}
3692 
3693 	rc = proto_register(&unix_dgram_proto, 1);
3694 	if (rc != 0) {
3695 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3696 		goto out;
3697 	}
3698 
3699 	rc = proto_register(&unix_stream_proto, 1);
3700 	if (rc != 0) {
3701 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3702 		proto_unregister(&unix_dgram_proto);
3703 		goto out;
3704 	}
3705 
3706 	sock_register(&unix_family_ops);
3707 	register_pernet_subsys(&unix_net_ops);
3708 	unix_bpf_build_proto();
3709 
3710 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3711 	bpf_iter_register();
3712 #endif
3713 
3714 out:
3715 	return rc;
3716 }
3717 
3718 /* Later than subsys_initcall() because we depend on stuff initialised there */
3719 fs_initcall(af_unix_init);
3720