xref: /linux/net/unix/af_unix.c (revision b94038d841a91d0e3f59cfe4d073e210910366ee)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid huge amounts
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with a zero byte, so that this name space does not intersect
75  *		  with BSD names.
76  */
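/* Illustrative userspace sketch (not kernel code): binding to an abstract
 * address as described above.  The helper name and the name "example" are
 * arbitrary placeholders.  The leading zero byte in sun_path marks the name
 * as abstract, and the address length covers only the bytes actually used,
 * so the name never touches the filesystem.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int bind_abstract(void)
 *	{
 *		struct sockaddr_un addr = { .sun_family = AF_UNIX };
 *		socklen_t len;
 *		int fd;
 *
 *		fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *		if (fd < 0)
 *			return -1;
 *
 *		addr.sun_path[0] = '\0';
 *		memcpy(addr.sun_path + 1, "example", 7);
 *		len = offsetof(struct sockaddr_un, sun_path) + 1 + 7;
 *
 *		return bind(fd, (struct sockaddr *)&addr, len);
 *	}
 */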
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
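/* The hash space of the per-netns table is split in two: unbound sockets and
 * sockets bound to a filesystem path hash into [0, UNIX_HASH_MOD], while
 * abstract sockets hash into [UNIX_HASH_MOD + 1, UNIX_HASH_MOD * 2 + 1], so
 * the two namespaces never share a bucket.  Filesystem-bound sockets are
 * additionally linked into bsd_socket_buckets, hashed by inode number, for
 * lookup by unix_find_socket_byinode().
 */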
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	if (hash1 == hash2) {
163 		spin_lock(&net->unx.table.locks[hash1]);
164 		return;
165 	}
166 
167 	if (hash1 > hash2)
168 		swap(hash1, hash2);
169 
170 	spin_lock(&net->unx.table.locks[hash1]);
171 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173 
174 static void unix_table_double_unlock(struct net *net,
175 				     unsigned int hash1, unsigned int hash2)
176 {
177 	if (hash1 == hash2) {
178 		spin_unlock(&net->unx.table.locks[hash1]);
179 		return;
180 	}
181 
182 	spin_unlock(&net->unx.table.locks[hash1]);
183 	spin_unlock(&net->unx.table.locks[hash2]);
184 }
185 
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189 	UNIXCB(skb).secid = scm->secid;
190 }
191 
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194 	scm->secid = UNIXCB(skb).secid;
195 }
196 
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199 	return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204 
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207 
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210 	return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213 
214 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215 {
216 	return unix_peer(osk) == sk;
217 }
218 
219 static inline int unix_may_send(struct sock *sk, struct sock *osk)
220 {
221 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222 }
223 
224 static inline int unix_recvq_full_lockless(const struct sock *sk)
225 {
226 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228 
229 struct sock *unix_peer_get(struct sock *s)
230 {
231 	struct sock *peer;
232 
233 	unix_state_lock(s);
234 	peer = unix_peer(s);
235 	if (peer)
236 		sock_hold(peer);
237 	unix_state_unlock(s);
238 	return peer;
239 }
240 EXPORT_SYMBOL_GPL(unix_peer_get);
241 
242 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
243 					     int addr_len)
244 {
245 	struct unix_address *addr;
246 
247 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
248 	if (!addr)
249 		return NULL;
250 
251 	refcount_set(&addr->refcnt, 1);
252 	addr->len = addr_len;
253 	memcpy(addr->name, sunaddr, addr_len);
254 
255 	return addr;
256 }
257 
258 static inline void unix_release_addr(struct unix_address *addr)
259 {
260 	if (refcount_dec_and_test(&addr->refcnt))
261 		kfree(addr);
262 }
263 
264 /*
265  *	Check unix socket name:
266  *		- should not be zero length.
267  *		- if it does not start with a zero byte, it should be NUL-terminated (FS object)
268  *		- if it starts with a zero byte, it is an abstract name.
269  */
270 
271 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
272 {
273 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
274 	    addr_len > sizeof(*sunaddr))
275 		return -EINVAL;
276 
277 	if (sunaddr->sun_family != AF_UNIX)
278 		return -EINVAL;
279 
280 	return 0;
281 }
282 
283 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
284 {
285 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
286 	short offset = offsetof(struct sockaddr_storage, __data);
287 
288 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
289 
290 	/* This may look like an off by one error but it is a bit more
291 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
292 	 * sun_path[108] doesn't as such exist.  However in kernel space
293 	 * we are guaranteed that it is a valid memory location in our
294 	 * kernel address buffer because syscall functions always pass
295 	 * a pointer of struct sockaddr_storage which has a bigger buffer
296 	 * than 108.  Also, we must terminate sun_path for strlen() in
297 	 * getname_kernel().
298 	 */
299 	addr->__data[addr_len - offset] = 0;
300 
301 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
302 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
303 	 * know the actual buffer.
304 	 */
305 	return strlen(addr->__data) + offset + 1;
306 }
307 
308 static void __unix_remove_socket(struct sock *sk)
309 {
310 	sk_del_node_init(sk);
311 }
312 
313 static void __unix_insert_socket(struct net *net, struct sock *sk)
314 {
315 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
316 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
317 }
318 
319 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
320 				 struct unix_address *addr, unsigned int hash)
321 {
322 	__unix_remove_socket(sk);
323 	smp_store_release(&unix_sk(sk)->addr, addr);
324 
325 	sk->sk_hash = hash;
326 	__unix_insert_socket(net, sk);
327 }
328 
329 static void unix_remove_socket(struct net *net, struct sock *sk)
330 {
331 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
332 	__unix_remove_socket(sk);
333 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
334 }
335 
336 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
337 {
338 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
339 	__unix_insert_socket(net, sk);
340 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
341 }
342 
343 static void unix_insert_bsd_socket(struct sock *sk)
344 {
345 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
346 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
347 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
348 }
349 
350 static void unix_remove_bsd_socket(struct sock *sk)
351 {
352 	if (!hlist_unhashed(&sk->sk_bind_node)) {
353 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
354 		__sk_del_bind_node(sk);
355 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
356 
357 		sk_node_init(&sk->sk_bind_node);
358 	}
359 }
360 
361 static struct sock *__unix_find_socket_byname(struct net *net,
362 					      struct sockaddr_un *sunname,
363 					      int len, unsigned int hash)
364 {
365 	struct sock *s;
366 
367 	sk_for_each(s, &net->unx.table.buckets[hash]) {
368 		struct unix_sock *u = unix_sk(s);
369 
370 		if (u->addr->len == len &&
371 		    !memcmp(u->addr->name, sunname, len))
372 			return s;
373 	}
374 	return NULL;
375 }
376 
377 static inline struct sock *unix_find_socket_byname(struct net *net,
378 						   struct sockaddr_un *sunname,
379 						   int len, unsigned int hash)
380 {
381 	struct sock *s;
382 
383 	spin_lock(&net->unx.table.locks[hash]);
384 	s = __unix_find_socket_byname(net, sunname, len, hash);
385 	if (s)
386 		sock_hold(s);
387 	spin_unlock(&net->unx.table.locks[hash]);
388 	return s;
389 }
390 
391 static struct sock *unix_find_socket_byinode(struct inode *i)
392 {
393 	unsigned int hash = unix_bsd_hash(i);
394 	struct sock *s;
395 
396 	spin_lock(&bsd_socket_locks[hash]);
397 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
398 		struct dentry *dentry = unix_sk(s)->path.dentry;
399 
400 		if (dentry && d_backing_inode(dentry) == i) {
401 			sock_hold(s);
402 			spin_unlock(&bsd_socket_locks[hash]);
403 			return s;
404 		}
405 	}
406 	spin_unlock(&bsd_socket_locks[hash]);
407 	return NULL;
408 }
409 
410 /* Support code for asymmetrically connected dgram sockets
411  *
412  * If a datagram socket is connected to a socket not itself connected
413  * to the first socket (eg, /dev/log), clients may only enqueue more
414  * messages if the present receive queue of the server socket is not
415  * "too large". This means there's a second writeability condition
416  * poll and sendmsg need to test. The dgram recv code will do a wake
417  * up on the peer_wait wait queue of a socket upon reception of a
418  * datagram which needs to be propagated to sleeping would-be writers
419  * since these might not have sent anything so far. This can't be
420  * accomplished via poll_wait because the lifetime of the server
421  * socket might be less than that of its clients if these break their
422  * association with it or if the server socket is closed while clients
423  * are still connected to it and there's no way to inform "a polling
424  * implementation" that it should let go of a certain wait queue
425  *
426  * In order to propagate a wake up, a wait_queue_entry_t of the client
427  * socket is enqueued on the peer_wait queue of the server socket
428  * whose wake function does a wake_up on the ordinary client socket
429  * wait queue. This connection is established whenever a write (or
430  * poll for write) hits the flow control condition and is broken when the
431  * association to the server socket is dissolved or after a wake up
432  * was relayed.
433  */
434 
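/* Typical sequence for the machinery below: a dgram sender that hits the
 * flow control condition registers itself on the receiver's peer_wait queue
 * via unix_dgram_peer_wake_connect().  When the receiver dequeues a datagram
 * and wakes peer_wait, the registered entry's callback,
 * unix_dgram_peer_wake_relay(), forwards the wake-up to the sender's own
 * wait queue and drops the registration.  The registration is also dropped
 * explicitly by unix_dgram_peer_wake_disconnect() when the association goes
 * away.
 */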
435 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
436 				      void *key)
437 {
438 	struct unix_sock *u;
439 	wait_queue_head_t *u_sleep;
440 
441 	u = container_of(q, struct unix_sock, peer_wake);
442 
443 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
444 			    q);
445 	u->peer_wake.private = NULL;
446 
447 	/* relaying can only happen while the wq still exists */
448 	u_sleep = sk_sleep(&u->sk);
449 	if (u_sleep)
450 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
451 
452 	return 0;
453 }
454 
455 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
456 {
457 	struct unix_sock *u, *u_other;
458 	int rc;
459 
460 	u = unix_sk(sk);
461 	u_other = unix_sk(other);
462 	rc = 0;
463 	spin_lock(&u_other->peer_wait.lock);
464 
465 	if (!u->peer_wake.private) {
466 		u->peer_wake.private = other;
467 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
468 
469 		rc = 1;
470 	}
471 
472 	spin_unlock(&u_other->peer_wait.lock);
473 	return rc;
474 }
475 
476 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
477 					    struct sock *other)
478 {
479 	struct unix_sock *u, *u_other;
480 
481 	u = unix_sk(sk);
482 	u_other = unix_sk(other);
483 	spin_lock(&u_other->peer_wait.lock);
484 
485 	if (u->peer_wake.private == other) {
486 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
487 		u->peer_wake.private = NULL;
488 	}
489 
490 	spin_unlock(&u_other->peer_wait.lock);
491 }
492 
493 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
494 						   struct sock *other)
495 {
496 	unix_dgram_peer_wake_disconnect(sk, other);
497 	wake_up_interruptible_poll(sk_sleep(sk),
498 				   EPOLLOUT |
499 				   EPOLLWRNORM |
500 				   EPOLLWRBAND);
501 }
502 
503 /* preconditions:
504  *	- unix_peer(sk) == other
505  *	- association is stable
506  */
507 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
508 {
509 	int connected;
510 
511 	connected = unix_dgram_peer_wake_connect(sk, other);
512 
513 	/* If other is SOCK_DEAD, we want to make sure we signal
514 	 * POLLOUT, such that a subsequent write() can get a
515 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
516 	 * to other and it's full, we will hang waiting for POLLOUT.
517 	 */
518 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
519 		return 1;
520 
521 	if (connected)
522 		unix_dgram_peer_wake_disconnect(sk, other);
523 
524 	return 0;
525 }
526 
527 static int unix_writable(const struct sock *sk, unsigned char state)
528 {
529 	return state != TCP_LISTEN &&
530 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
531 }
532 
533 static void unix_write_space(struct sock *sk)
534 {
535 	struct socket_wq *wq;
536 
537 	rcu_read_lock();
538 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
539 		wq = rcu_dereference(sk->sk_wq);
540 		if (skwq_has_sleeper(wq))
541 			wake_up_interruptible_sync_poll(&wq->wait,
542 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
543 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
544 	}
545 	rcu_read_unlock();
546 }
547 
548 /* When a dgram socket disconnects (or changes its peer), we clear its receive
549  * queue of packets that arrived from the previous peer. First, it allows us to do
550  * flow control based only on wmem_alloc; second, an sk connected to a peer
551  * may receive messages only from that peer. */
552 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
553 {
554 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
555 		skb_queue_purge(&sk->sk_receive_queue);
556 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
557 
558 		/* If one link of bidirectional dgram pipe is disconnected,
559 		 * we signal an error. Messages are lost. Do not do this
560 		 * when the peer was not connected to us.
561 		 */
562 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
563 			WRITE_ONCE(other->sk_err, ECONNRESET);
564 			sk_error_report(other);
565 		}
566 	}
567 }
568 
569 static void unix_sock_destructor(struct sock *sk)
570 {
571 	struct unix_sock *u = unix_sk(sk);
572 
573 	skb_queue_purge(&sk->sk_receive_queue);
574 
575 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
576 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
577 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
578 	if (!sock_flag(sk, SOCK_DEAD)) {
579 		pr_info("Attempt to release alive unix socket: %p\n", sk);
580 		return;
581 	}
582 
583 	if (u->addr)
584 		unix_release_addr(u->addr);
585 
586 	atomic_long_dec(&unix_nr_socks);
587 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
588 #ifdef UNIX_REFCNT_DEBUG
589 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
590 		atomic_long_read(&unix_nr_socks));
591 #endif
592 }
593 
594 static void unix_release_sock(struct sock *sk, int embrion)
595 {
596 	struct unix_sock *u = unix_sk(sk);
597 	struct sock *skpair;
598 	struct sk_buff *skb;
599 	struct path path;
600 	int state;
601 
602 	unix_remove_socket(sock_net(sk), sk);
603 	unix_remove_bsd_socket(sk);
604 
605 	/* Clear state */
606 	unix_state_lock(sk);
607 	sock_orphan(sk);
608 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
609 	path	     = u->path;
610 	u->path.dentry = NULL;
611 	u->path.mnt = NULL;
612 	state = sk->sk_state;
613 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
614 
615 	skpair = unix_peer(sk);
616 	unix_peer(sk) = NULL;
617 
618 	unix_state_unlock(sk);
619 
620 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
621 	if (u->oob_skb) {
622 		kfree_skb(u->oob_skb);
623 		u->oob_skb = NULL;
624 	}
625 #endif
626 
627 	wake_up_interruptible_all(&u->peer_wait);
628 
629 	if (skpair != NULL) {
630 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
631 			unix_state_lock(skpair);
632 			/* No more writes */
633 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
634 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
635 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
636 			unix_state_unlock(skpair);
637 			skpair->sk_state_change(skpair);
638 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
639 		}
640 
641 		unix_dgram_peer_wake_disconnect(sk, skpair);
642 		sock_put(skpair); /* It may now die */
643 	}
644 
645 	/* Try to flush out this socket. Throw out buffers at least */
646 
647 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
648 		if (state == TCP_LISTEN)
649 			unix_release_sock(skb->sk, 1);
650 		/* passed fds are erased in the kfree_skb hook	      */
651 		UNIXCB(skb).consumed = skb->len;
652 		kfree_skb(skb);
653 	}
654 
655 	if (path.dentry)
656 		path_put(&path);
657 
658 	sock_put(sk);
659 
660 	/* ---- Socket is dead now and most probably destroyed ---- */
661 
662 	/*
663 	 * Fixme: BSD difference: In BSD all sockets connected to us get
664 	 *	  ECONNRESET and we die on the spot. In Linux we behave
665 	 *	  like files and pipes do and wait for the last
666 	 *	  dereference.
667 	 *
668 	 * Can't we simply set sock->err?
669 	 *
670 	 *	  What does the above comment talk about? --ANK(980817)
671 	 */
672 
673 	if (READ_ONCE(unix_tot_inflight))
674 		unix_gc();		/* Garbage collect fds */
675 }
676 
677 static void init_peercred(struct sock *sk)
678 {
679 	const struct cred *old_cred;
680 	struct pid *old_pid;
681 
682 	spin_lock(&sk->sk_peer_lock);
683 	old_pid = sk->sk_peer_pid;
684 	old_cred = sk->sk_peer_cred;
685 	sk->sk_peer_pid  = get_pid(task_tgid(current));
686 	sk->sk_peer_cred = get_current_cred();
687 	spin_unlock(&sk->sk_peer_lock);
688 
689 	put_pid(old_pid);
690 	put_cred(old_cred);
691 }
692 
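/* Both sk_peer_lock spinlocks are taken in ascending address order so that
 * two sockets copying credentials from each other cannot deadlock.
 */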
693 static void copy_peercred(struct sock *sk, struct sock *peersk)
694 {
695 	const struct cred *old_cred;
696 	struct pid *old_pid;
697 
698 	if (sk < peersk) {
699 		spin_lock(&sk->sk_peer_lock);
700 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
701 	} else {
702 		spin_lock(&peersk->sk_peer_lock);
703 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
704 	}
705 	old_pid = sk->sk_peer_pid;
706 	old_cred = sk->sk_peer_cred;
707 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
708 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
709 
710 	spin_unlock(&sk->sk_peer_lock);
711 	spin_unlock(&peersk->sk_peer_lock);
712 
713 	put_pid(old_pid);
714 	put_cred(old_cred);
715 }
716 
717 static int unix_listen(struct socket *sock, int backlog)
718 {
719 	int err;
720 	struct sock *sk = sock->sk;
721 	struct unix_sock *u = unix_sk(sk);
722 
723 	err = -EOPNOTSUPP;
724 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
725 		goto out;	/* Only stream/seqpacket sockets accept */
726 	err = -EINVAL;
727 	if (!READ_ONCE(u->addr))
728 		goto out;	/* No listens on an unbound socket */
729 	unix_state_lock(sk);
730 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
731 		goto out_unlock;
732 	if (backlog > sk->sk_max_ack_backlog)
733 		wake_up_interruptible_all(&u->peer_wait);
734 	sk->sk_max_ack_backlog	= backlog;
735 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
736 
737 	/* set credentials so connect can copy them */
738 	init_peercred(sk);
739 	err = 0;
740 
741 out_unlock:
742 	unix_state_unlock(sk);
743 out:
744 	return err;
745 }
746 
747 static int unix_release(struct socket *);
748 static int unix_bind(struct socket *, struct sockaddr *, int);
749 static int unix_stream_connect(struct socket *, struct sockaddr *,
750 			       int addr_len, int flags);
751 static int unix_socketpair(struct socket *, struct socket *);
752 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
753 static int unix_getname(struct socket *, struct sockaddr *, int);
754 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
755 static __poll_t unix_dgram_poll(struct file *, struct socket *,
756 				    poll_table *);
757 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
758 #ifdef CONFIG_COMPAT
759 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
760 #endif
761 static int unix_shutdown(struct socket *, int);
762 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
763 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
764 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
765 				       struct pipe_inode_info *, size_t size,
766 				       unsigned int flags);
767 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
768 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
769 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
770 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
771 static int unix_dgram_connect(struct socket *, struct sockaddr *,
772 			      int, int);
773 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
774 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
775 				  int);
776 
777 #ifdef CONFIG_PROC_FS
778 static int unix_count_nr_fds(struct sock *sk)
779 {
780 	struct sk_buff *skb;
781 	struct unix_sock *u;
782 	int nr_fds = 0;
783 
784 	spin_lock(&sk->sk_receive_queue.lock);
785 	skb = skb_peek(&sk->sk_receive_queue);
786 	while (skb) {
787 		u = unix_sk(skb->sk);
788 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
789 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
790 	}
791 	spin_unlock(&sk->sk_receive_queue.lock);
792 
793 	return nr_fds;
794 }
795 
796 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
797 {
798 	struct sock *sk = sock->sk;
799 	unsigned char s_state;
800 	struct unix_sock *u;
801 	int nr_fds = 0;
802 
803 	if (sk) {
804 		s_state = READ_ONCE(sk->sk_state);
805 		u = unix_sk(sk);
806 
807 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
808 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
809 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
810 		 */
811 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
812 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
813 		else if (s_state == TCP_LISTEN)
814 			nr_fds = unix_count_nr_fds(sk);
815 
816 		seq_printf(m, "scm_fds: %u\n", nr_fds);
817 	}
818 }
819 #else
820 #define unix_show_fdinfo NULL
821 #endif
822 
823 static const struct proto_ops unix_stream_ops = {
824 	.family =	PF_UNIX,
825 	.owner =	THIS_MODULE,
826 	.release =	unix_release,
827 	.bind =		unix_bind,
828 	.connect =	unix_stream_connect,
829 	.socketpair =	unix_socketpair,
830 	.accept =	unix_accept,
831 	.getname =	unix_getname,
832 	.poll =		unix_poll,
833 	.ioctl =	unix_ioctl,
834 #ifdef CONFIG_COMPAT
835 	.compat_ioctl =	unix_compat_ioctl,
836 #endif
837 	.listen =	unix_listen,
838 	.shutdown =	unix_shutdown,
839 	.sendmsg =	unix_stream_sendmsg,
840 	.recvmsg =	unix_stream_recvmsg,
841 	.read_skb =	unix_stream_read_skb,
842 	.mmap =		sock_no_mmap,
843 	.splice_read =	unix_stream_splice_read,
844 	.set_peek_off =	sk_set_peek_off,
845 	.show_fdinfo =	unix_show_fdinfo,
846 };
847 
848 static const struct proto_ops unix_dgram_ops = {
849 	.family =	PF_UNIX,
850 	.owner =	THIS_MODULE,
851 	.release =	unix_release,
852 	.bind =		unix_bind,
853 	.connect =	unix_dgram_connect,
854 	.socketpair =	unix_socketpair,
855 	.accept =	sock_no_accept,
856 	.getname =	unix_getname,
857 	.poll =		unix_dgram_poll,
858 	.ioctl =	unix_ioctl,
859 #ifdef CONFIG_COMPAT
860 	.compat_ioctl =	unix_compat_ioctl,
861 #endif
862 	.listen =	sock_no_listen,
863 	.shutdown =	unix_shutdown,
864 	.sendmsg =	unix_dgram_sendmsg,
865 	.read_skb =	unix_read_skb,
866 	.recvmsg =	unix_dgram_recvmsg,
867 	.mmap =		sock_no_mmap,
868 	.set_peek_off =	sk_set_peek_off,
869 	.show_fdinfo =	unix_show_fdinfo,
870 };
871 
872 static const struct proto_ops unix_seqpacket_ops = {
873 	.family =	PF_UNIX,
874 	.owner =	THIS_MODULE,
875 	.release =	unix_release,
876 	.bind =		unix_bind,
877 	.connect =	unix_stream_connect,
878 	.socketpair =	unix_socketpair,
879 	.accept =	unix_accept,
880 	.getname =	unix_getname,
881 	.poll =		unix_dgram_poll,
882 	.ioctl =	unix_ioctl,
883 #ifdef CONFIG_COMPAT
884 	.compat_ioctl =	unix_compat_ioctl,
885 #endif
886 	.listen =	unix_listen,
887 	.shutdown =	unix_shutdown,
888 	.sendmsg =	unix_seqpacket_sendmsg,
889 	.recvmsg =	unix_seqpacket_recvmsg,
890 	.mmap =		sock_no_mmap,
891 	.set_peek_off =	sk_set_peek_off,
892 	.show_fdinfo =	unix_show_fdinfo,
893 };
894 
895 static void unix_close(struct sock *sk, long timeout)
896 {
897 	/* Nothing to do here, unix socket does not need a ->close().
898 	 * This is merely for sockmap.
899 	 */
900 }
901 
902 static void unix_unhash(struct sock *sk)
903 {
904 	/* Nothing to do here, unix socket does not need a ->unhash().
905 	 * This is merely for sockmap.
906 	 */
907 }
908 
909 static bool unix_bpf_bypass_getsockopt(int level, int optname)
910 {
911 	if (level == SOL_SOCKET) {
912 		switch (optname) {
913 		case SO_PEERPIDFD:
914 			return true;
915 		default:
916 			return false;
917 		}
918 	}
919 
920 	return false;
921 }
922 
923 struct proto unix_dgram_proto = {
924 	.name			= "UNIX",
925 	.owner			= THIS_MODULE,
926 	.obj_size		= sizeof(struct unix_sock),
927 	.close			= unix_close,
928 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
929 #ifdef CONFIG_BPF_SYSCALL
930 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
931 #endif
932 };
933 
934 struct proto unix_stream_proto = {
935 	.name			= "UNIX-STREAM",
936 	.owner			= THIS_MODULE,
937 	.obj_size		= sizeof(struct unix_sock),
938 	.close			= unix_close,
939 	.unhash			= unix_unhash,
940 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
941 #ifdef CONFIG_BPF_SYSCALL
942 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
943 #endif
944 };
945 
946 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
947 {
948 	struct unix_sock *u;
949 	struct sock *sk;
950 	int err;
951 
952 	atomic_long_inc(&unix_nr_socks);
953 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
954 		err = -ENFILE;
955 		goto err;
956 	}
957 
958 	if (type == SOCK_STREAM)
959 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
960 	else /* dgram and seqpacket */
961 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
962 
963 	if (!sk) {
964 		err = -ENOMEM;
965 		goto err;
966 	}
967 
968 	sock_init_data(sock, sk);
969 
970 	sk->sk_hash		= unix_unbound_hash(sk);
971 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
972 	sk->sk_write_space	= unix_write_space;
973 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
974 	sk->sk_destruct		= unix_sock_destructor;
975 	u = unix_sk(sk);
976 	u->listener = NULL;
977 	u->vertex = NULL;
978 	u->path.dentry = NULL;
979 	u->path.mnt = NULL;
980 	spin_lock_init(&u->lock);
981 	mutex_init(&u->iolock); /* single task reading lock */
982 	mutex_init(&u->bindlock); /* single task binding lock */
983 	init_waitqueue_head(&u->peer_wait);
984 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
985 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
986 	unix_insert_unbound_socket(net, sk);
987 
988 	sock_prot_inuse_add(net, sk->sk_prot, 1);
989 
990 	return sk;
991 
992 err:
993 	atomic_long_dec(&unix_nr_socks);
994 	return ERR_PTR(err);
995 }
996 
997 static int unix_create(struct net *net, struct socket *sock, int protocol,
998 		       int kern)
999 {
1000 	struct sock *sk;
1001 
1002 	if (protocol && protocol != PF_UNIX)
1003 		return -EPROTONOSUPPORT;
1004 
1005 	sock->state = SS_UNCONNECTED;
1006 
1007 	switch (sock->type) {
1008 	case SOCK_STREAM:
1009 		sock->ops = &unix_stream_ops;
1010 		break;
1011 		/*
1012 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1013 		 *	nothing uses it.
1014 		 */
1015 	case SOCK_RAW:
1016 		sock->type = SOCK_DGRAM;
1017 		fallthrough;
1018 	case SOCK_DGRAM:
1019 		sock->ops = &unix_dgram_ops;
1020 		break;
1021 	case SOCK_SEQPACKET:
1022 		sock->ops = &unix_seqpacket_ops;
1023 		break;
1024 	default:
1025 		return -ESOCKTNOSUPPORT;
1026 	}
1027 
1028 	sk = unix_create1(net, sock, kern, sock->type);
1029 	if (IS_ERR(sk))
1030 		return PTR_ERR(sk);
1031 
1032 	return 0;
1033 }
1034 
1035 static int unix_release(struct socket *sock)
1036 {
1037 	struct sock *sk = sock->sk;
1038 
1039 	if (!sk)
1040 		return 0;
1041 
1042 	sk->sk_prot->close(sk, 0);
1043 	unix_release_sock(sk, 0);
1044 	sock->sk = NULL;
1045 
1046 	return 0;
1047 }
1048 
1049 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1050 				  int type)
1051 {
1052 	struct inode *inode;
1053 	struct path path;
1054 	struct sock *sk;
1055 	int err;
1056 
1057 	unix_mkname_bsd(sunaddr, addr_len);
1058 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1059 	if (err)
1060 		goto fail;
1061 
1062 	err = path_permission(&path, MAY_WRITE);
1063 	if (err)
1064 		goto path_put;
1065 
1066 	err = -ECONNREFUSED;
1067 	inode = d_backing_inode(path.dentry);
1068 	if (!S_ISSOCK(inode->i_mode))
1069 		goto path_put;
1070 
1071 	sk = unix_find_socket_byinode(inode);
1072 	if (!sk)
1073 		goto path_put;
1074 
1075 	err = -EPROTOTYPE;
1076 	if (sk->sk_type == type)
1077 		touch_atime(&path);
1078 	else
1079 		goto sock_put;
1080 
1081 	path_put(&path);
1082 
1083 	return sk;
1084 
1085 sock_put:
1086 	sock_put(sk);
1087 path_put:
1088 	path_put(&path);
1089 fail:
1090 	return ERR_PTR(err);
1091 }
1092 
1093 static struct sock *unix_find_abstract(struct net *net,
1094 				       struct sockaddr_un *sunaddr,
1095 				       int addr_len, int type)
1096 {
1097 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1098 	struct dentry *dentry;
1099 	struct sock *sk;
1100 
1101 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1102 	if (!sk)
1103 		return ERR_PTR(-ECONNREFUSED);
1104 
1105 	dentry = unix_sk(sk)->path.dentry;
1106 	if (dentry)
1107 		touch_atime(&unix_sk(sk)->path);
1108 
1109 	return sk;
1110 }
1111 
1112 static struct sock *unix_find_other(struct net *net,
1113 				    struct sockaddr_un *sunaddr,
1114 				    int addr_len, int type)
1115 {
1116 	struct sock *sk;
1117 
1118 	if (sunaddr->sun_path[0])
1119 		sk = unix_find_bsd(sunaddr, addr_len, type);
1120 	else
1121 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1122 
1123 	return sk;
1124 }
1125 
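/* Autobind assigns an abstract name of the form "\0" followed by five hex
 * digits, scanning the 2^20 name space from a random starting point and
 * returning -ENOSPC if every name is already taken.
 */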
1126 static int unix_autobind(struct sock *sk)
1127 {
1128 	struct unix_sock *u = unix_sk(sk);
1129 	unsigned int new_hash, old_hash;
1130 	struct net *net = sock_net(sk);
1131 	struct unix_address *addr;
1132 	u32 lastnum, ordernum;
1133 	int err;
1134 
1135 	err = mutex_lock_interruptible(&u->bindlock);
1136 	if (err)
1137 		return err;
1138 
1139 	if (u->addr)
1140 		goto out;
1141 
1142 	err = -ENOMEM;
1143 	addr = kzalloc(sizeof(*addr) +
1144 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1145 	if (!addr)
1146 		goto out;
1147 
1148 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1149 	addr->name->sun_family = AF_UNIX;
1150 	refcount_set(&addr->refcnt, 1);
1151 
1152 	old_hash = sk->sk_hash;
1153 	ordernum = get_random_u32();
1154 	lastnum = ordernum & 0xFFFFF;
1155 retry:
1156 	ordernum = (ordernum + 1) & 0xFFFFF;
1157 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1158 
1159 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1160 	unix_table_double_lock(net, old_hash, new_hash);
1161 
1162 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1163 		unix_table_double_unlock(net, old_hash, new_hash);
1164 
1165 		/* __unix_find_socket_byname() may take a long time if many names
1166 		 * are already in use.
1167 		 */
1168 		cond_resched();
1169 
1170 		if (ordernum == lastnum) {
1171 			/* Give up if all names seem to be in use. */
1172 			err = -ENOSPC;
1173 			unix_release_addr(addr);
1174 			goto out;
1175 		}
1176 
1177 		goto retry;
1178 	}
1179 
1180 	__unix_set_addr_hash(net, sk, addr, new_hash);
1181 	unix_table_double_unlock(net, old_hash, new_hash);
1182 	err = 0;
1183 
1184 out:	mutex_unlock(&u->bindlock);
1185 	return err;
1186 }
1187 
1188 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1189 			 int addr_len)
1190 {
1191 	umode_t mode = S_IFSOCK |
1192 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1193 	struct unix_sock *u = unix_sk(sk);
1194 	unsigned int new_hash, old_hash;
1195 	struct net *net = sock_net(sk);
1196 	struct mnt_idmap *idmap;
1197 	struct unix_address *addr;
1198 	struct dentry *dentry;
1199 	struct path parent;
1200 	int err;
1201 
1202 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1203 	addr = unix_create_addr(sunaddr, addr_len);
1204 	if (!addr)
1205 		return -ENOMEM;
1206 
1207 	/*
1208 	 * Get the parent directory, calculate the hash for the last
1209 	 * component.
1210 	 */
1211 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1212 	if (IS_ERR(dentry)) {
1213 		err = PTR_ERR(dentry);
1214 		goto out;
1215 	}
1216 
1217 	/*
1218 	 * All right, let's create it.
1219 	 */
1220 	idmap = mnt_idmap(parent.mnt);
1221 	err = security_path_mknod(&parent, dentry, mode, 0);
1222 	if (!err)
1223 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1224 	if (err)
1225 		goto out_path;
1226 	err = mutex_lock_interruptible(&u->bindlock);
1227 	if (err)
1228 		goto out_unlink;
1229 	if (u->addr)
1230 		goto out_unlock;
1231 
1232 	old_hash = sk->sk_hash;
1233 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1234 	unix_table_double_lock(net, old_hash, new_hash);
1235 	u->path.mnt = mntget(parent.mnt);
1236 	u->path.dentry = dget(dentry);
1237 	__unix_set_addr_hash(net, sk, addr, new_hash);
1238 	unix_table_double_unlock(net, old_hash, new_hash);
1239 	unix_insert_bsd_socket(sk);
1240 	mutex_unlock(&u->bindlock);
1241 	done_path_create(&parent, dentry);
1242 	return 0;
1243 
1244 out_unlock:
1245 	mutex_unlock(&u->bindlock);
1246 	err = -EINVAL;
1247 out_unlink:
1248 	/* failed after successful mknod?  unlink what we'd created... */
1249 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1250 out_path:
1251 	done_path_create(&parent, dentry);
1252 out:
1253 	unix_release_addr(addr);
1254 	return err == -EEXIST ? -EADDRINUSE : err;
1255 }
1256 
1257 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1258 			      int addr_len)
1259 {
1260 	struct unix_sock *u = unix_sk(sk);
1261 	unsigned int new_hash, old_hash;
1262 	struct net *net = sock_net(sk);
1263 	struct unix_address *addr;
1264 	int err;
1265 
1266 	addr = unix_create_addr(sunaddr, addr_len);
1267 	if (!addr)
1268 		return -ENOMEM;
1269 
1270 	err = mutex_lock_interruptible(&u->bindlock);
1271 	if (err)
1272 		goto out;
1273 
1274 	if (u->addr) {
1275 		err = -EINVAL;
1276 		goto out_mutex;
1277 	}
1278 
1279 	old_hash = sk->sk_hash;
1280 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1281 	unix_table_double_lock(net, old_hash, new_hash);
1282 
1283 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1284 		goto out_spin;
1285 
1286 	__unix_set_addr_hash(net, sk, addr, new_hash);
1287 	unix_table_double_unlock(net, old_hash, new_hash);
1288 	mutex_unlock(&u->bindlock);
1289 	return 0;
1290 
1291 out_spin:
1292 	unix_table_double_unlock(net, old_hash, new_hash);
1293 	err = -EADDRINUSE;
1294 out_mutex:
1295 	mutex_unlock(&u->bindlock);
1296 out:
1297 	unix_release_addr(addr);
1298 	return err;
1299 }
1300 
1301 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1302 {
1303 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1304 	struct sock *sk = sock->sk;
1305 	int err;
1306 
1307 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1308 	    sunaddr->sun_family == AF_UNIX)
1309 		return unix_autobind(sk);
1310 
1311 	err = unix_validate_addr(sunaddr, addr_len);
1312 	if (err)
1313 		return err;
1314 
1315 	if (sunaddr->sun_path[0])
1316 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1317 	else
1318 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1319 
1320 	return err;
1321 }
1322 
1323 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1324 {
1325 	if (unlikely(sk1 == sk2) || !sk2) {
1326 		unix_state_lock(sk1);
1327 		return;
1328 	}
1329 	if (sk1 > sk2)
1330 		swap(sk1, sk2);
1331 
1332 	unix_state_lock(sk1);
1333 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1334 }
1335 
1336 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1337 {
1338 	if (unlikely(sk1 == sk2) || !sk2) {
1339 		unix_state_unlock(sk1);
1340 		return;
1341 	}
1342 	unix_state_unlock(sk1);
1343 	unix_state_unlock(sk2);
1344 }
1345 
1346 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1347 			      int alen, int flags)
1348 {
1349 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1350 	struct sock *sk = sock->sk;
1351 	struct sock *other;
1352 	int err;
1353 
1354 	err = -EINVAL;
1355 	if (alen < offsetofend(struct sockaddr, sa_family))
1356 		goto out;
1357 
1358 	if (addr->sa_family != AF_UNSPEC) {
1359 		err = unix_validate_addr(sunaddr, alen);
1360 		if (err)
1361 			goto out;
1362 
1363 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1364 		if (err)
1365 			goto out;
1366 
1367 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1368 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1369 		    !READ_ONCE(unix_sk(sk)->addr)) {
1370 			err = unix_autobind(sk);
1371 			if (err)
1372 				goto out;
1373 		}
1374 
1375 restart:
1376 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1377 		if (IS_ERR(other)) {
1378 			err = PTR_ERR(other);
1379 			goto out;
1380 		}
1381 
1382 		unix_state_double_lock(sk, other);
1383 
1384 		/* Apparently VFS overslept socket death. Retry. */
1385 		if (sock_flag(other, SOCK_DEAD)) {
1386 			unix_state_double_unlock(sk, other);
1387 			sock_put(other);
1388 			goto restart;
1389 		}
1390 
1391 		err = -EPERM;
1392 		if (!unix_may_send(sk, other))
1393 			goto out_unlock;
1394 
1395 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1396 		if (err)
1397 			goto out_unlock;
1398 
1399 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1400 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1401 	} else {
1402 		/*
1403 		 *	1003.1g breaking connected state with AF_UNSPEC
1404 		 */
1405 		other = NULL;
1406 		unix_state_double_lock(sk, other);
1407 	}
1408 
1409 	/*
1410 	 * If it was connected, reconnect.
1411 	 */
1412 	if (unix_peer(sk)) {
1413 		struct sock *old_peer = unix_peer(sk);
1414 
1415 		unix_peer(sk) = other;
1416 		if (!other)
1417 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1418 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1419 
1420 		unix_state_double_unlock(sk, other);
1421 
1422 		if (other != old_peer) {
1423 			unix_dgram_disconnected(sk, old_peer);
1424 
1425 			unix_state_lock(old_peer);
1426 			if (!unix_peer(old_peer))
1427 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1428 			unix_state_unlock(old_peer);
1429 		}
1430 
1431 		sock_put(old_peer);
1432 	} else {
1433 		unix_peer(sk) = other;
1434 		unix_state_double_unlock(sk, other);
1435 	}
1436 
1437 	return 0;
1438 
1439 out_unlock:
1440 	unix_state_double_unlock(sk, other);
1441 	sock_put(other);
1442 out:
1443 	return err;
1444 }
1445 
1446 static long unix_wait_for_peer(struct sock *other, long timeo)
1447 	__releases(&unix_sk(other)->lock)
1448 {
1449 	struct unix_sock *u = unix_sk(other);
1450 	int sched;
1451 	DEFINE_WAIT(wait);
1452 
1453 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1454 
1455 	sched = !sock_flag(other, SOCK_DEAD) &&
1456 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1457 		unix_recvq_full_lockless(other);
1458 
1459 	unix_state_unlock(other);
1460 
1461 	if (sched)
1462 		timeo = schedule_timeout(timeo);
1463 
1464 	finish_wait(&u->peer_wait, &wait);
1465 	return timeo;
1466 }
1467 
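/* Connection establishment for SOCK_STREAM/SOCK_SEQPACKET: the connecting
 * side pre-creates the socket that accept() will return (newsk below) and
 * hands it to the listener by queueing an skb owned by newsk on the
 * listener's receive queue; unix_accept() simply dequeues that skb and
 * grafts skb->sk onto the new socket.
 */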
1468 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1469 			       int addr_len, int flags)
1470 {
1471 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1472 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1473 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1474 	struct net *net = sock_net(sk);
1475 	struct sk_buff *skb = NULL;
1476 	long timeo;
1477 	int err;
1478 
1479 	err = unix_validate_addr(sunaddr, addr_len);
1480 	if (err)
1481 		goto out;
1482 
1483 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1484 	if (err)
1485 		goto out;
1486 
1487 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1488 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1489 	    !READ_ONCE(u->addr)) {
1490 		err = unix_autobind(sk);
1491 		if (err)
1492 			goto out;
1493 	}
1494 
1495 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1496 
1497 	/* First of all allocate resources.
1498 	   If we do it after the state is locked,
1499 	   we will have to recheck everything again in any case.
1500 	 */
1501 
1502 	/* create new sock for complete connection */
1503 	newsk = unix_create1(net, NULL, 0, sock->type);
1504 	if (IS_ERR(newsk)) {
1505 		err = PTR_ERR(newsk);
1506 		newsk = NULL;
1507 		goto out;
1508 	}
1509 
1510 	err = -ENOMEM;
1511 
1512 	/* Allocate skb for sending to listening sock */
1513 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1514 	if (skb == NULL)
1515 		goto out;
1516 
1517 restart:
1518 	/*  Find listening sock. */
1519 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1520 	if (IS_ERR(other)) {
1521 		err = PTR_ERR(other);
1522 		other = NULL;
1523 		goto out;
1524 	}
1525 
1526 	/* Latch state of peer */
1527 	unix_state_lock(other);
1528 
1529 	/* Apparently VFS overslept socket death. Retry. */
1530 	if (sock_flag(other, SOCK_DEAD)) {
1531 		unix_state_unlock(other);
1532 		sock_put(other);
1533 		goto restart;
1534 	}
1535 
1536 	err = -ECONNREFUSED;
1537 	if (other->sk_state != TCP_LISTEN)
1538 		goto out_unlock;
1539 	if (other->sk_shutdown & RCV_SHUTDOWN)
1540 		goto out_unlock;
1541 
1542 	if (unix_recvq_full_lockless(other)) {
1543 		err = -EAGAIN;
1544 		if (!timeo)
1545 			goto out_unlock;
1546 
1547 		timeo = unix_wait_for_peer(other, timeo);
1548 
1549 		err = sock_intr_errno(timeo);
1550 		if (signal_pending(current))
1551 			goto out;
1552 		sock_put(other);
1553 		goto restart;
1554 	}
1555 
1556 	/* Latch our state.
1557 
1558 	   This is a tricky place. We need to grab our state lock and cannot
1559 	   drop the lock on the peer. It is dangerous because a deadlock is
1560 	   possible. The connect-to-self case and a simultaneous
1561 	   attempt to connect are eliminated by checking the socket
1562 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
1563 	   check this before attempting to grab the lock.
1564 
1565 	   Well, and we have to recheck the state after the socket is locked.
1566 	 */
1567 	switch (READ_ONCE(sk->sk_state)) {
1568 	case TCP_CLOSE:
1569 		/* This is ok... continue with connect */
1570 		break;
1571 	case TCP_ESTABLISHED:
1572 		/* Socket is already connected */
1573 		err = -EISCONN;
1574 		goto out_unlock;
1575 	default:
1576 		err = -EINVAL;
1577 		goto out_unlock;
1578 	}
1579 
1580 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1581 
1582 	if (sk->sk_state != TCP_CLOSE) {
1583 		unix_state_unlock(sk);
1584 		unix_state_unlock(other);
1585 		sock_put(other);
1586 		goto restart;
1587 	}
1588 
1589 	err = security_unix_stream_connect(sk, other, newsk);
1590 	if (err) {
1591 		unix_state_unlock(sk);
1592 		goto out_unlock;
1593 	}
1594 
1595 	/* The way is open! Quickly set all the necessary fields... */
1596 
1597 	sock_hold(sk);
1598 	unix_peer(newsk)	= sk;
1599 	newsk->sk_state		= TCP_ESTABLISHED;
1600 	newsk->sk_type		= sk->sk_type;
1601 	init_peercred(newsk);
1602 	newu = unix_sk(newsk);
1603 	newu->listener = other;
1604 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1605 	otheru = unix_sk(other);
1606 
1607 	/* copy address information from listening to new sock
1608 	 *
1609 	 * The contents of *(otheru->addr) and otheru->path
1610 	 * are seen fully set up here, since we have found
1611 	 * otheru in hash under its lock.  Insertion into the
1612 	 * hash chain we'd found it in had been done in an
1613 	 * earlier critical area protected by the chain's lock,
1614 	 * the same one where we'd set *(otheru->addr) contents,
1615 	 * as well as otheru->path and otheru->addr itself.
1616 	 *
1617 	 * Using smp_store_release() here to set newu->addr
1618 	 * is enough to make those stores, as well as stores
1619 	 * to newu->path visible to anyone who gets newu->addr
1620 	 * by smp_load_acquire().  IOW, the same warranties
1621 	 * as for unix_sock instances bound in unix_bind() or
1622 	 * in unix_autobind().
1623 	 */
1624 	if (otheru->path.dentry) {
1625 		path_get(&otheru->path);
1626 		newu->path = otheru->path;
1627 	}
1628 	refcount_inc(&otheru->addr->refcnt);
1629 	smp_store_release(&newu->addr, otheru->addr);
1630 
1631 	/* Set credentials */
1632 	copy_peercred(sk, other);
1633 
1634 	sock->state	= SS_CONNECTED;
1635 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1636 	sock_hold(newsk);
1637 
1638 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1639 	unix_peer(sk)	= newsk;
1640 
1641 	unix_state_unlock(sk);
1642 
1643 	/* take ten and send info to listening sock */
1644 	spin_lock(&other->sk_receive_queue.lock);
1645 	__skb_queue_tail(&other->sk_receive_queue, skb);
1646 	spin_unlock(&other->sk_receive_queue.lock);
1647 	unix_state_unlock(other);
1648 	other->sk_data_ready(other);
1649 	sock_put(other);
1650 	return 0;
1651 
1652 out_unlock:
1653 	if (other)
1654 		unix_state_unlock(other);
1655 
1656 out:
1657 	kfree_skb(skb);
1658 	if (newsk)
1659 		unix_release_sock(newsk, 0);
1660 	if (other)
1661 		sock_put(other);
1662 	return err;
1663 }
1664 
1665 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1666 {
1667 	struct sock *ska = socka->sk, *skb = sockb->sk;
1668 
1669 	/* Join our sockets back to back */
1670 	sock_hold(ska);
1671 	sock_hold(skb);
1672 	unix_peer(ska) = skb;
1673 	unix_peer(skb) = ska;
1674 	init_peercred(ska);
1675 	init_peercred(skb);
1676 
1677 	ska->sk_state = TCP_ESTABLISHED;
1678 	skb->sk_state = TCP_ESTABLISHED;
1679 	socka->state  = SS_CONNECTED;
1680 	sockb->state  = SS_CONNECTED;
1681 	return 0;
1682 }
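
/* Illustrative userspace sketch (not kernel code): socketpair() yields two
 * already-connected, unnamed AF_UNIX sockets, which is exactly the state
 * unix_socketpair() sets up above.  The helper name is a placeholder.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int make_pair(int fds[2])
 *	{
 *		if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds) < 0)
 *			return -1;
 *		return write(fds[0], "ping", 4) == 4 ? 0 : -1;
 *	}
 */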
1683 
1684 static void unix_sock_inherit_flags(const struct socket *old,
1685 				    struct socket *new)
1686 {
1687 	if (test_bit(SOCK_PASSCRED, &old->flags))
1688 		set_bit(SOCK_PASSCRED, &new->flags);
1689 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1690 		set_bit(SOCK_PASSPIDFD, &new->flags);
1691 	if (test_bit(SOCK_PASSSEC, &old->flags))
1692 		set_bit(SOCK_PASSSEC, &new->flags);
1693 }
1694 
1695 static int unix_accept(struct socket *sock, struct socket *newsock,
1696 		       struct proto_accept_arg *arg)
1697 {
1698 	struct sock *sk = sock->sk;
1699 	struct sk_buff *skb;
1700 	struct sock *tsk;
1701 
1702 	arg->err = -EOPNOTSUPP;
1703 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1704 		goto out;
1705 
1706 	arg->err = -EINVAL;
1707 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1708 		goto out;
1709 
1710 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1711 	 * so that no locks are necessary.
1712 	 */
1713 
1714 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1715 				&arg->err);
1716 	if (!skb) {
1717 		/* This means receive shutdown. */
1718 		if (arg->err == 0)
1719 			arg->err = -EINVAL;
1720 		goto out;
1721 	}
1722 
1723 	tsk = skb->sk;
1724 	skb_free_datagram(sk, skb);
1725 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1726 
1727 	/* attach accepted sock to socket */
1728 	unix_state_lock(tsk);
1729 	unix_update_edges(unix_sk(tsk));
1730 	newsock->state = SS_CONNECTED;
1731 	unix_sock_inherit_flags(sock, newsock);
1732 	sock_graft(tsk, newsock);
1733 	unix_state_unlock(tsk);
1734 	return 0;
1735 
1736 out:
1737 	return arg->err;
1738 }
1739 
1740 
1741 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1742 {
1743 	struct sock *sk = sock->sk;
1744 	struct unix_address *addr;
1745 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1746 	int err = 0;
1747 
1748 	if (peer) {
1749 		sk = unix_peer_get(sk);
1750 
1751 		err = -ENOTCONN;
1752 		if (!sk)
1753 			goto out;
1754 		err = 0;
1755 	} else {
1756 		sock_hold(sk);
1757 	}
1758 
1759 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1760 	if (!addr) {
1761 		sunaddr->sun_family = AF_UNIX;
1762 		sunaddr->sun_path[0] = 0;
1763 		err = offsetof(struct sockaddr_un, sun_path);
1764 	} else {
1765 		err = addr->len;
1766 		memcpy(sunaddr, addr->name, addr->len);
1767 
1768 		if (peer)
1769 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1770 					       CGROUP_UNIX_GETPEERNAME);
1771 		else
1772 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1773 					       CGROUP_UNIX_GETSOCKNAME);
1774 	}
1775 	sock_put(sk);
1776 out:
1777 	return err;
1778 }
1779 
1780 /* The "user->unix_inflight" variable is protected by the garbage
1781  * collection lock, and we just read it locklessly here. If you go
1782  * over the limit, there might be a tiny race in actually noticing
1783  * it across threads. Tough.
1784  */
1785 static inline bool too_many_unix_fds(struct task_struct *p)
1786 {
1787 	struct user_struct *user = current_user();
1788 
1789 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1790 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1791 	return false;
1792 }
1793 
1794 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1795 {
1796 	if (too_many_unix_fds(current))
1797 		return -ETOOMANYREFS;
1798 
1799 	UNIXCB(skb).fp = scm->fp;
1800 	scm->fp = NULL;
1801 
1802 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1803 		return -ENOMEM;
1804 
1805 	return 0;
1806 }
1807 
1808 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1809 {
1810 	scm->fp = UNIXCB(skb).fp;
1811 	UNIXCB(skb).fp = NULL;
1812 
1813 	unix_destroy_fpl(scm->fp);
1814 }
1815 
1816 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1817 {
1818 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1819 }
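
/* Illustrative userspace sketch (not kernel code): the sending side of
 * SCM_RIGHTS file-descriptor passing that unix_attach_fds() and
 * unix_detach_fds() service.  "sock" and "fd" are assumed to be an
 * already-connected AF_UNIX socket and the descriptor to pass; the helper
 * name is a placeholder.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	static int send_fd(int sock, int fd)
 *	{
 *		char data = 'x';
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		union {
 *			char buf[CMSG_SPACE(sizeof(int))];
 *			struct cmsghdr align;
 *		} u;
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 *
 *		return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
 *	}
 */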
1820 
1821 static void unix_destruct_scm(struct sk_buff *skb)
1822 {
1823 	struct scm_cookie scm;
1824 
1825 	memset(&scm, 0, sizeof(scm));
1826 	scm.pid  = UNIXCB(skb).pid;
1827 	if (UNIXCB(skb).fp)
1828 		unix_detach_fds(&scm, skb);
1829 
1830 	/* Alas, it calls VFS */
1831 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1832 	scm_destroy(&scm);
1833 	sock_wfree(skb);
1834 }
1835 
1836 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1837 {
1838 	int err = 0;
1839 
1840 	UNIXCB(skb).pid  = get_pid(scm->pid);
1841 	UNIXCB(skb).uid = scm->creds.uid;
1842 	UNIXCB(skb).gid = scm->creds.gid;
1843 	UNIXCB(skb).fp = NULL;
1844 	unix_get_secdata(scm, skb);
1845 	if (scm->fp && send_fds)
1846 		err = unix_attach_fds(scm, skb);
1847 
1848 	skb->destructor = unix_destruct_scm;
1849 	return err;
1850 }
1851 
1852 static bool unix_passcred_enabled(const struct socket *sock,
1853 				  const struct sock *other)
1854 {
1855 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1856 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1857 	       !other->sk_socket ||
1858 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1859 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1860 }
1861 
1862 /*
1863  * Some apps rely on write() giving SCM_CREDENTIALS.
1864  * We include credentials if the source or destination socket
1865  * asserted SOCK_PASSCRED.
1866  */
1867 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1868 			    const struct sock *other)
1869 {
1870 	if (UNIXCB(skb).pid)
1871 		return;
1872 	if (unix_passcred_enabled(sock, other)) {
1873 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1874 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1875 	}
1876 }
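
/* Illustrative userspace sketch (not kernel code): a receiver that asks for
 * the SCM_CREDENTIALS message the kernel attaches on the sender's behalf.
 * "sock" is assumed to be a connected AF_UNIX socket; the helper name is a
 * placeholder, and _GNU_SOURCE is needed for struct ucred with glibc.
 *
 *	#define _GNU_SOURCE
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	static int recv_creds(int sock, struct ucred *out)
 *	{
 *		char data, cbuf[CMSG_SPACE(sizeof(struct ucred))];
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg;
 *		int one = 1;
 *
 *		setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *		if (recvmsg(sock, &msg, 0) < 0)
 *			return -1;
 *
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			if (cmsg->cmsg_level == SOL_SOCKET &&
 *			    cmsg->cmsg_type == SCM_CREDENTIALS) {
 *				memcpy(out, CMSG_DATA(cmsg), sizeof(*out));
 *				return 0;
 *			}
 *		return -1;
 *	}
 */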
1877 
1878 static bool unix_skb_scm_eq(struct sk_buff *skb,
1879 			    struct scm_cookie *scm)
1880 {
1881 	return UNIXCB(skb).pid == scm->pid &&
1882 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1883 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1884 	       unix_secdata_eq(scm, skb);
1885 }
1886 
1887 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1888 {
1889 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1890 	struct unix_sock *u = unix_sk(sk);
1891 
1892 	if (unlikely(fp && fp->count)) {
1893 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1894 		unix_add_edges(fp, u);
1895 	}
1896 }
1897 
1898 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1899 {
1900 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1901 	struct unix_sock *u = unix_sk(sk);
1902 
1903 	if (unlikely(fp && fp->count)) {
1904 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1905 		unix_del_edges(fp);
1906 	}
1907 }
1908 
1909 /*
1910  *	Send AF_UNIX data.
1911  */
1912 
1913 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1914 			      size_t len)
1915 {
1916 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1917 	struct sock *sk = sock->sk, *other = NULL;
1918 	struct unix_sock *u = unix_sk(sk);
1919 	struct scm_cookie scm;
1920 	struct sk_buff *skb;
1921 	int data_len = 0;
1922 	int sk_locked;
1923 	long timeo;
1924 	int err;
1925 
1926 	err = scm_send(sock, msg, &scm, false);
1927 	if (err < 0)
1928 		return err;
1929 
1930 	wait_for_unix_gc(scm.fp);
1931 
1932 	err = -EOPNOTSUPP;
1933 	if (msg->msg_flags&MSG_OOB)
1934 		goto out;
1935 
1936 	if (msg->msg_namelen) {
1937 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1938 		if (err)
1939 			goto out;
1940 
1941 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1942 							    msg->msg_name,
1943 							    &msg->msg_namelen,
1944 							    NULL);
1945 		if (err)
1946 			goto out;
1947 	} else {
1948 		sunaddr = NULL;
1949 		err = -ENOTCONN;
1950 		other = unix_peer_get(sk);
1951 		if (!other)
1952 			goto out;
1953 	}
1954 
1955 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1956 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1957 	    !READ_ONCE(u->addr)) {
1958 		err = unix_autobind(sk);
1959 		if (err)
1960 			goto out;
1961 	}
1962 
1963 	err = -EMSGSIZE;
1964 	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1965 		goto out;
1966 
1967 	if (len > SKB_MAX_ALLOC) {
1968 		data_len = min_t(size_t,
1969 				 len - SKB_MAX_ALLOC,
1970 				 MAX_SKB_FRAGS * PAGE_SIZE);
1971 		data_len = PAGE_ALIGN(data_len);
1972 
1973 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1974 	}
1975 
1976 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1977 				   msg->msg_flags & MSG_DONTWAIT, &err,
1978 				   PAGE_ALLOC_COSTLY_ORDER);
1979 	if (skb == NULL)
1980 		goto out;
1981 
1982 	err = unix_scm_to_skb(&scm, skb, true);
1983 	if (err < 0)
1984 		goto out_free;
1985 
1986 	skb_put(skb, len - data_len);
1987 	skb->data_len = data_len;
1988 	skb->len = len;
1989 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1990 	if (err)
1991 		goto out_free;
1992 
1993 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1994 
1995 restart:
1996 	if (!other) {
1997 		err = -ECONNRESET;
1998 		if (sunaddr == NULL)
1999 			goto out_free;
2000 
2001 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2002 					sk->sk_type);
2003 		if (IS_ERR(other)) {
2004 			err = PTR_ERR(other);
2005 			other = NULL;
2006 			goto out_free;
2007 		}
2008 	}
2009 
2010 	if (sk_filter(other, skb) < 0) {
2011 		/* Toss the packet but do not return any error to the sender */
2012 		err = len;
2013 		goto out_free;
2014 	}
2015 
2016 	sk_locked = 0;
2017 	unix_state_lock(other);
2018 restart_locked:
2019 	err = -EPERM;
2020 	if (!unix_may_send(sk, other))
2021 		goto out_unlock;
2022 
2023 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2024 		/*
2025 		 *	Check with 1003.1g - what should a
2026 		 *	datagram error return here?
2027 		 */
2028 		unix_state_unlock(other);
2029 		sock_put(other);
2030 
2031 		if (!sk_locked)
2032 			unix_state_lock(sk);
2033 
2034 		err = 0;
2035 		if (sk->sk_type == SOCK_SEQPACKET) {
2036 			/* We get here only when racing with unix_release_sock(),
2037 			 * which is clearing @other. Never change the state to
2038 			 * TCP_CLOSE here, unlike the SOCK_DGRAM case.
2039 			 */
2040 			unix_state_unlock(sk);
2041 			err = -EPIPE;
2042 		} else if (unix_peer(sk) == other) {
2043 			unix_peer(sk) = NULL;
2044 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2045 
2046 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2047 			unix_state_unlock(sk);
2048 
2049 			unix_dgram_disconnected(sk, other);
2050 			sock_put(other);
2051 			err = -ECONNREFUSED;
2052 		} else {
2053 			unix_state_unlock(sk);
2054 		}
2055 
2056 		other = NULL;
2057 		if (err)
2058 			goto out_free;
2059 		goto restart;
2060 	}
2061 
2062 	err = -EPIPE;
2063 	if (other->sk_shutdown & RCV_SHUTDOWN)
2064 		goto out_unlock;
2065 
2066 	if (sk->sk_type != SOCK_SEQPACKET) {
2067 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2068 		if (err)
2069 			goto out_unlock;
2070 	}
2071 
2072 	/* other == sk && unix_peer(other) != sk can happen if
2073 	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2074 	 * - unix_peer(sk) == sk when looked up, but it disconnected before we took the lock
2075 	 */
2076 	if (other != sk &&
2077 	    unlikely(unix_peer(other) != sk &&
2078 	    unix_recvq_full_lockless(other))) {
2079 		if (timeo) {
2080 			timeo = unix_wait_for_peer(other, timeo);
2081 
2082 			err = sock_intr_errno(timeo);
2083 			if (signal_pending(current))
2084 				goto out_free;
2085 
2086 			goto restart;
2087 		}
2088 
2089 		if (!sk_locked) {
2090 			unix_state_unlock(other);
2091 			unix_state_double_lock(sk, other);
2092 		}
2093 
2094 		if (unix_peer(sk) != other ||
2095 		    unix_dgram_peer_wake_me(sk, other)) {
2096 			err = -EAGAIN;
2097 			sk_locked = 1;
2098 			goto out_unlock;
2099 		}
2100 
2101 		if (!sk_locked) {
2102 			sk_locked = 1;
2103 			goto restart_locked;
2104 		}
2105 	}
2106 
2107 	if (unlikely(sk_locked))
2108 		unix_state_unlock(sk);
2109 
2110 	if (sock_flag(other, SOCK_RCVTSTAMP))
2111 		__net_timestamp(skb);
2112 	maybe_add_creds(skb, sock, other);
2113 	scm_stat_add(other, skb);
2114 	skb_queue_tail(&other->sk_receive_queue, skb);
2115 	unix_state_unlock(other);
2116 	other->sk_data_ready(other);
2117 	sock_put(other);
2118 	scm_destroy(&scm);
2119 	return len;
2120 
2121 out_unlock:
2122 	if (sk_locked)
2123 		unix_state_unlock(sk);
2124 	unix_state_unlock(other);
2125 out_free:
2126 	kfree_skb(skb);
2127 out:
2128 	if (other)
2129 		sock_put(other);
2130 	scm_destroy(&scm);
2131 	return err;
2132 }
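
/* Illustrative userspace sketch (hypothetical path and name, not part
 * of this file): a SOCK_DGRAM send with an explicit destination goes
 * through unix_find_other() above. An abstract destination starts with
 * a NUL byte, and its address length must count only the bytes used.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	strcpy(sun.sun_path, "/tmp/dgram.sock");
 *	sendto(fd, buf, len, 0, (struct sockaddr *)&sun, sizeof(sun));
 *
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_UNIX;
 *	memcpy(sun.sun_path + 1, "mydgram", 7);
 *	sendto(fd, buf, len, 0, (struct sockaddr *)&sun,
 *	       offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */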
2133 
2134 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2135  * bytes, with a minimum of a full page.
2136  */
2137 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
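
/* Worked example: with 4 KiB pages, get_order(32768) is 3, so
 * UNIX_SKB_FRAGS_SZ is 4096 << 3 == 32768 bytes. With 64 KiB pages,
 * get_order(32768) is 0 and the limit becomes one full 65536-byte
 * page, which is where the "minimum of a full page" above comes from.
 */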
2138 
2139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2140 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2141 		     struct scm_cookie *scm, bool fds_sent)
2142 {
2143 	struct unix_sock *ousk = unix_sk(other);
2144 	struct sk_buff *skb;
2145 	int err = 0;
2146 
2147 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2148 
2149 	if (!skb)
2150 		return err;
2151 
2152 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2153 	if (err < 0) {
2154 		kfree_skb(skb);
2155 		return err;
2156 	}
2157 	skb_put(skb, 1);
2158 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2159 
2160 	if (err) {
2161 		kfree_skb(skb);
2162 		return err;
2163 	}
2164 
2165 	unix_state_lock(other);
2166 
2167 	if (sock_flag(other, SOCK_DEAD) ||
2168 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2169 		unix_state_unlock(other);
2170 		kfree_skb(skb);
2171 		return -EPIPE;
2172 	}
2173 
2174 	maybe_add_creds(skb, sock, other);
2175 	skb_get(skb);
2176 
2177 	scm_stat_add(other, skb);
2178 
2179 	spin_lock(&other->sk_receive_queue.lock);
2180 	if (ousk->oob_skb)
2181 		consume_skb(ousk->oob_skb);
2182 	WRITE_ONCE(ousk->oob_skb, skb);
2183 	__skb_queue_tail(&other->sk_receive_queue, skb);
2184 	spin_unlock(&other->sk_receive_queue.lock);
2185 
2186 	sk_send_sigurg(other);
2187 	unix_state_unlock(other);
2188 	other->sk_data_ready(other);
2189 
2190 	return err;
2191 }
2192 #endif
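
/* Illustrative userspace sketch (not part of this file): with
 * CONFIG_AF_UNIX_OOB, a stream sender can mark a single byte as
 * out-of-band; queue_oob() above stores it as the peer's oob_skb.
 *
 *	send(fd, "abc", 3, 0);
 *	send(fd, "!", 1, MSG_OOB);
 *
 *	int atmark = 0;
 *	char c;
 *
 *	ioctl(fd, SIOCATMARK, &atmark);    (1 when the next byte is the OOB byte)
 *	recv(fd, &c, 1, MSG_OOB);          (fetch the out-of-band byte)
 *
 * With SO_OOBINLINE set on the receiver, the byte stays in the normal
 * data stream instead and MSG_OOB reads fail with EINVAL.
 */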
2193 
2194 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2195 			       size_t len)
2196 {
2197 	struct sock *sk = sock->sk;
2198 	struct sock *other = NULL;
2199 	int err, size;
2200 	struct sk_buff *skb;
2201 	int sent = 0;
2202 	struct scm_cookie scm;
2203 	bool fds_sent = false;
2204 	int data_len;
2205 
2206 	err = scm_send(sock, msg, &scm, false);
2207 	if (err < 0)
2208 		return err;
2209 
2210 	wait_for_unix_gc(scm.fp);
2211 
2212 	err = -EOPNOTSUPP;
2213 	if (msg->msg_flags & MSG_OOB) {
2214 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2215 		if (len)
2216 			len--;
2217 		else
2218 #endif
2219 			goto out_err;
2220 	}
2221 
2222 	if (msg->msg_namelen) {
2223 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2224 		goto out_err;
2225 	} else {
2226 		err = -ENOTCONN;
2227 		other = unix_peer(sk);
2228 		if (!other)
2229 			goto out_err;
2230 	}
2231 
2232 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2233 		goto pipe_err;
2234 
2235 	while (sent < len) {
2236 		size = len - sent;
2237 
2238 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2239 			skb = sock_alloc_send_pskb(sk, 0, 0,
2240 						   msg->msg_flags & MSG_DONTWAIT,
2241 						   &err, 0);
2242 		} else {
2243 			/* Keep two messages in the pipe so it schedules better */
2244 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2245 
2246 			/* allow fallback to order-0 allocations */
2247 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2248 
2249 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2250 
2251 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2252 
2253 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2254 						   msg->msg_flags & MSG_DONTWAIT, &err,
2255 						   get_order(UNIX_SKB_FRAGS_SZ));
2256 		}
2257 		if (!skb)
2258 			goto out_err;
2259 
2260 		/* Only send the fds in the first buffer */
2261 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2262 		if (err < 0) {
2263 			kfree_skb(skb);
2264 			goto out_err;
2265 		}
2266 		fds_sent = true;
2267 
2268 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2269 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2270 						   sk->sk_allocation);
2271 			if (err < 0) {
2272 				kfree_skb(skb);
2273 				goto out_err;
2274 			}
2275 			size = err;
2276 			refcount_add(size, &sk->sk_wmem_alloc);
2277 		} else {
2278 			skb_put(skb, size - data_len);
2279 			skb->data_len = data_len;
2280 			skb->len = size;
2281 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2282 			if (err) {
2283 				kfree_skb(skb);
2284 				goto out_err;
2285 			}
2286 		}
2287 
2288 		unix_state_lock(other);
2289 
2290 		if (sock_flag(other, SOCK_DEAD) ||
2291 		    (other->sk_shutdown & RCV_SHUTDOWN))
2292 			goto pipe_err_free;
2293 
2294 		maybe_add_creds(skb, sock, other);
2295 		scm_stat_add(other, skb);
2296 		skb_queue_tail(&other->sk_receive_queue, skb);
2297 		unix_state_unlock(other);
2298 		other->sk_data_ready(other);
2299 		sent += size;
2300 	}
2301 
2302 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2303 	if (msg->msg_flags & MSG_OOB) {
2304 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2305 		if (err)
2306 			goto out_err;
2307 		sent++;
2308 	}
2309 #endif
2310 
2311 	scm_destroy(&scm);
2312 
2313 	return sent;
2314 
2315 pipe_err_free:
2316 	unix_state_unlock(other);
2317 	kfree_skb(skb);
2318 pipe_err:
2319 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2320 		send_sig(SIGPIPE, current, 0);
2321 	err = -EPIPE;
2322 out_err:
2323 	scm_destroy(&scm);
2324 	return sent ? : err;
2325 }
2326 
2327 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2328 				  size_t len)
2329 {
2330 	int err;
2331 	struct sock *sk = sock->sk;
2332 
2333 	err = sock_error(sk);
2334 	if (err)
2335 		return err;
2336 
2337 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2338 		return -ENOTCONN;
2339 
2340 	if (msg->msg_namelen)
2341 		msg->msg_namelen = 0;
2342 
2343 	return unix_dgram_sendmsg(sock, msg, len);
2344 }
2345 
2346 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2347 				  size_t size, int flags)
2348 {
2349 	struct sock *sk = sock->sk;
2350 
2351 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2352 		return -ENOTCONN;
2353 
2354 	return unix_dgram_recvmsg(sock, msg, size, flags);
2355 }
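
/* Illustrative userspace sketch (not part of this file): SOCK_SEQPACKET
 * is connection-oriented like SOCK_STREAM but preserves record
 * boundaries, which is why the handlers above reuse the datagram code.
 *
 *	int sv[2];
 *	char buf[64];
 *	ssize_t n;
 *
 *	socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv);
 *	send(sv[0], "hello", 5, 0);
 *	send(sv[0], "world", 5, 0);
 *
 *	n = recv(sv[1], buf, sizeof(buf), 0);    (n == 5, one record, not 10)
 */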
2356 
2357 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2358 {
2359 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2360 
2361 	if (addr) {
2362 		msg->msg_namelen = addr->len;
2363 		memcpy(msg->msg_name, addr->name, addr->len);
2364 	}
2365 }
2366 
2367 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2368 			 int flags)
2369 {
2370 	struct scm_cookie scm;
2371 	struct socket *sock = sk->sk_socket;
2372 	struct unix_sock *u = unix_sk(sk);
2373 	struct sk_buff *skb, *last;
2374 	long timeo;
2375 	int skip;
2376 	int err;
2377 
2378 	err = -EOPNOTSUPP;
2379 	if (flags&MSG_OOB)
2380 		goto out;
2381 
2382 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2383 
2384 	do {
2385 		mutex_lock(&u->iolock);
2386 
2387 		skip = sk_peek_offset(sk, flags);
2388 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2389 					      &skip, &err, &last);
2390 		if (skb) {
2391 			if (!(flags & MSG_PEEK))
2392 				scm_stat_del(sk, skb);
2393 			break;
2394 		}
2395 
2396 		mutex_unlock(&u->iolock);
2397 
2398 		if (err != -EAGAIN)
2399 			break;
2400 	} while (timeo &&
2401 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2402 					      &err, &timeo, last));
2403 
2404 	if (!skb) { /* implies iolock unlocked */
2405 		unix_state_lock(sk);
2406 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2407 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2408 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2409 			err = 0;
2410 		unix_state_unlock(sk);
2411 		goto out;
2412 	}
2413 
2414 	if (wq_has_sleeper(&u->peer_wait))
2415 		wake_up_interruptible_sync_poll(&u->peer_wait,
2416 						EPOLLOUT | EPOLLWRNORM |
2417 						EPOLLWRBAND);
2418 
2419 	if (msg->msg_name) {
2420 		unix_copy_addr(msg, skb->sk);
2421 
2422 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2423 						      msg->msg_name,
2424 						      &msg->msg_namelen);
2425 	}
2426 
2427 	if (size > skb->len - skip)
2428 		size = skb->len - skip;
2429 	else if (size < skb->len - skip)
2430 		msg->msg_flags |= MSG_TRUNC;
2431 
2432 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2433 	if (err)
2434 		goto out_free;
2435 
2436 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2437 		__sock_recv_timestamp(msg, sk, skb);
2438 
2439 	memset(&scm, 0, sizeof(scm));
2440 
2441 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2442 	unix_set_secdata(&scm, skb);
2443 
2444 	if (!(flags & MSG_PEEK)) {
2445 		if (UNIXCB(skb).fp)
2446 			unix_detach_fds(&scm, skb);
2447 
2448 		sk_peek_offset_bwd(sk, skb->len);
2449 	} else {
2450 		/* It is questionable what to do on PEEK. We could:
2451 		   - not return fds at all - good, but too simple 8)
2452 		   - return fds, and not return them again on read (old strategy,
2453 		     apparently wrong)
2454 		   - clone fds (chosen for now, it is the most universal
2455 		     solution)
2456 
2457 		   POSIX 1003.1g does not actually define this clearly
2458 		   at all. POSIX 1003.1g doesn't define a lot of things
2459 		   clearly however!
2460 
2461 		*/
2462 
2463 		sk_peek_offset_fwd(sk, size);
2464 
2465 		if (UNIXCB(skb).fp)
2466 			unix_peek_fds(&scm, skb);
2467 	}
2468 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2469 
2470 	scm_recv_unix(sock, msg, &scm, flags);
2471 
2472 out_free:
2473 	skb_free_datagram(sk, skb);
2474 	mutex_unlock(&u->iolock);
2475 out:
2476 	return err;
2477 }
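
/* Illustrative userspace sketch (hypothetical names, not part of this
 * file): receiving a descriptor passed with SCM_RIGHTS. With MSG_PEEK
 * the descriptors are duplicated via unix_peek_fds() and handed out
 * again on the following read, as the comment above explains.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int recv_fd(int sock)
 *	{
 *		char data;
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		union {
 *			char buf[CMSG_SPACE(sizeof(int))];
 *			struct cmsghdr align;
 *		} u;
 *		struct msghdr msg = {
 *			.msg_iov = &iov,
 *			.msg_iovlen = 1,
 *			.msg_control = u.buf,
 *			.msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg;
 *		int fd;
 *
 *		if (recvmsg(sock, &msg, 0) < 0)
 *			return -1;
 *
 *		cmsg = CMSG_FIRSTHDR(&msg);
 *		if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
 *			return -1;
 *
 *		memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
 *		return fd;
 *	}
 */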
2478 
2479 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2480 			      int flags)
2481 {
2482 	struct sock *sk = sock->sk;
2483 
2484 #ifdef CONFIG_BPF_SYSCALL
2485 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2486 
2487 	if (prot != &unix_dgram_proto)
2488 		return prot->recvmsg(sk, msg, size, flags, NULL);
2489 #endif
2490 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2491 }
2492 
2493 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2494 {
2495 	struct unix_sock *u = unix_sk(sk);
2496 	struct sk_buff *skb;
2497 	int err;
2498 
2499 	mutex_lock(&u->iolock);
2500 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2501 	mutex_unlock(&u->iolock);
2502 	if (!skb)
2503 		return err;
2504 
2505 	return recv_actor(sk, skb);
2506 }
2507 
2508 /*
2509  *	Sleep until more data has arrived. But check for races..
2510  */
2511 static long unix_stream_data_wait(struct sock *sk, long timeo,
2512 				  struct sk_buff *last, unsigned int last_len,
2513 				  bool freezable)
2514 {
2515 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2516 	struct sk_buff *tail;
2517 	DEFINE_WAIT(wait);
2518 
2519 	unix_state_lock(sk);
2520 
2521 	for (;;) {
2522 		prepare_to_wait(sk_sleep(sk), &wait, state);
2523 
2524 		tail = skb_peek_tail(&sk->sk_receive_queue);
2525 		if (tail != last ||
2526 		    (tail && tail->len != last_len) ||
2527 		    sk->sk_err ||
2528 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2529 		    signal_pending(current) ||
2530 		    !timeo)
2531 			break;
2532 
2533 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2534 		unix_state_unlock(sk);
2535 		timeo = schedule_timeout(timeo);
2536 		unix_state_lock(sk);
2537 
2538 		if (sock_flag(sk, SOCK_DEAD))
2539 			break;
2540 
2541 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2542 	}
2543 
2544 	finish_wait(sk_sleep(sk), &wait);
2545 	unix_state_unlock(sk);
2546 	return timeo;
2547 }
2548 
2549 static unsigned int unix_skb_len(const struct sk_buff *skb)
2550 {
2551 	return skb->len - UNIXCB(skb).consumed;
2552 }
2553 
2554 struct unix_stream_read_state {
2555 	int (*recv_actor)(struct sk_buff *, int, int,
2556 			  struct unix_stream_read_state *);
2557 	struct socket *socket;
2558 	struct msghdr *msg;
2559 	struct pipe_inode_info *pipe;
2560 	size_t size;
2561 	int flags;
2562 	unsigned int splice_flags;
2563 };
2564 
2565 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2566 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2567 {
2568 	struct socket *sock = state->socket;
2569 	struct sock *sk = sock->sk;
2570 	struct unix_sock *u = unix_sk(sk);
2571 	int chunk = 1;
2572 	struct sk_buff *oob_skb;
2573 
2574 	mutex_lock(&u->iolock);
2575 	unix_state_lock(sk);
2576 	spin_lock(&sk->sk_receive_queue.lock);
2577 
2578 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2579 		spin_unlock(&sk->sk_receive_queue.lock);
2580 		unix_state_unlock(sk);
2581 		mutex_unlock(&u->iolock);
2582 		return -EINVAL;
2583 	}
2584 
2585 	oob_skb = u->oob_skb;
2586 
2587 	if (!(state->flags & MSG_PEEK))
2588 		WRITE_ONCE(u->oob_skb, NULL);
2589 	else
2590 		skb_get(oob_skb);
2591 
2592 	spin_unlock(&sk->sk_receive_queue.lock);
2593 	unix_state_unlock(sk);
2594 
2595 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2596 
2597 	if (!(state->flags & MSG_PEEK))
2598 		UNIXCB(oob_skb).consumed += 1;
2599 
2600 	consume_skb(oob_skb);
2601 
2602 	mutex_unlock(&u->iolock);
2603 
2604 	if (chunk < 0)
2605 		return -EFAULT;
2606 
2607 	state->msg->msg_flags |= MSG_OOB;
2608 	return 1;
2609 }
2610 
2611 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2612 				  int flags, int copied)
2613 {
2614 	struct unix_sock *u = unix_sk(sk);
2615 
2616 	if (!unix_skb_len(skb)) {
2617 		if (!(flags & MSG_PEEK)) {
2618 			skb_unlink(skb, &sk->sk_receive_queue);
2619 			consume_skb(skb);
2620 		}
2621 
2622 		skb = NULL;
2623 	} else {
2624 		struct sk_buff *unlinked_skb = NULL;
2625 
2626 		spin_lock(&sk->sk_receive_queue.lock);
2627 
2628 		if (skb == u->oob_skb) {
2629 			if (copied) {
2630 				skb = NULL;
2631 			} else if (!(flags & MSG_PEEK)) {
2632 				if (sock_flag(sk, SOCK_URGINLINE)) {
2633 					WRITE_ONCE(u->oob_skb, NULL);
2634 					consume_skb(skb);
2635 				} else {
2636 					__skb_unlink(skb, &sk->sk_receive_queue);
2637 					WRITE_ONCE(u->oob_skb, NULL);
2638 					unlinked_skb = skb;
2639 					skb = skb_peek(&sk->sk_receive_queue);
2640 				}
2641 			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2642 				skb = skb_peek_next(skb, &sk->sk_receive_queue);
2643 			}
2644 		}
2645 
2646 		spin_unlock(&sk->sk_receive_queue.lock);
2647 
2648 		if (unlinked_skb) {
2649 			WARN_ON_ONCE(skb_unref(unlinked_skb));
2650 			kfree_skb(unlinked_skb);
2651 		}
2652 	}
2653 	return skb;
2654 }
2655 #endif
2656 
2657 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2658 {
2659 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2660 		return -ENOTCONN;
2661 
2662 	return unix_read_skb(sk, recv_actor);
2663 }
2664 
2665 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2666 				    bool freezable)
2667 {
2668 	struct scm_cookie scm;
2669 	struct socket *sock = state->socket;
2670 	struct sock *sk = sock->sk;
2671 	struct unix_sock *u = unix_sk(sk);
2672 	int copied = 0;
2673 	int flags = state->flags;
2674 	int noblock = flags & MSG_DONTWAIT;
2675 	bool check_creds = false;
2676 	int target;
2677 	int err = 0;
2678 	long timeo;
2679 	int skip;
2680 	size_t size = state->size;
2681 	unsigned int last_len;
2682 
2683 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2684 		err = -EINVAL;
2685 		goto out;
2686 	}
2687 
2688 	if (unlikely(flags & MSG_OOB)) {
2689 		err = -EOPNOTSUPP;
2690 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2691 		err = unix_stream_recv_urg(state);
2692 #endif
2693 		goto out;
2694 	}
2695 
2696 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2697 	timeo = sock_rcvtimeo(sk, noblock);
2698 
2699 	memset(&scm, 0, sizeof(scm));
2700 
2701 	/* Lock the socket to prevent the receive queue from being reordered
2702 	 * while we sleep copying data out to the message.
2703 	 */
2704 	mutex_lock(&u->iolock);
2705 
2706 	skip = max(sk_peek_offset(sk, flags), 0);
2707 
2708 	do {
2709 		int chunk;
2710 		bool drop_skb;
2711 		struct sk_buff *skb, *last;
2712 
2713 redo:
2714 		unix_state_lock(sk);
2715 		if (sock_flag(sk, SOCK_DEAD)) {
2716 			err = -ECONNRESET;
2717 			goto unlock;
2718 		}
2719 		last = skb = skb_peek(&sk->sk_receive_queue);
2720 		last_len = last ? last->len : 0;
2721 
2722 again:
2723 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2724 		if (skb) {
2725 			skb = manage_oob(skb, sk, flags, copied);
2726 			if (!skb && copied) {
2727 				unix_state_unlock(sk);
2728 				break;
2729 			}
2730 		}
2731 #endif
2732 		if (skb == NULL) {
2733 			if (copied >= target)
2734 				goto unlock;
2735 
2736 			/*
2737 			 *	POSIX 1003.1g mandates this order.
2738 			 */
2739 
2740 			err = sock_error(sk);
2741 			if (err)
2742 				goto unlock;
2743 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2744 				goto unlock;
2745 
2746 			unix_state_unlock(sk);
2747 			if (!timeo) {
2748 				err = -EAGAIN;
2749 				break;
2750 			}
2751 
2752 			mutex_unlock(&u->iolock);
2753 
2754 			timeo = unix_stream_data_wait(sk, timeo, last,
2755 						      last_len, freezable);
2756 
2757 			if (signal_pending(current)) {
2758 				err = sock_intr_errno(timeo);
2759 				scm_destroy(&scm);
2760 				goto out;
2761 			}
2762 
2763 			mutex_lock(&u->iolock);
2764 			goto redo;
2765 unlock:
2766 			unix_state_unlock(sk);
2767 			break;
2768 		}
2769 
2770 		while (skip >= unix_skb_len(skb)) {
2771 			skip -= unix_skb_len(skb);
2772 			last = skb;
2773 			last_len = skb->len;
2774 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2775 			if (!skb)
2776 				goto again;
2777 		}
2778 
2779 		unix_state_unlock(sk);
2780 
2781 		if (check_creds) {
2782 			/* Never glue messages from different writers */
2783 			if (!unix_skb_scm_eq(skb, &scm))
2784 				break;
2785 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2786 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2787 			/* Copy credentials */
2788 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2789 			unix_set_secdata(&scm, skb);
2790 			check_creds = true;
2791 		}
2792 
2793 		/* Copy address just once */
2794 		if (state->msg && state->msg->msg_name) {
2795 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2796 					 state->msg->msg_name);
2797 			unix_copy_addr(state->msg, skb->sk);
2798 
2799 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2800 							      state->msg->msg_name,
2801 							      &state->msg->msg_namelen);
2802 
2803 			sunaddr = NULL;
2804 		}
2805 
2806 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2807 		skb_get(skb);
2808 		chunk = state->recv_actor(skb, skip, chunk, state);
2809 		drop_skb = !unix_skb_len(skb);
2810 		/* skb is only safe to use if !drop_skb */
2811 		consume_skb(skb);
2812 		if (chunk < 0) {
2813 			if (copied == 0)
2814 				copied = -EFAULT;
2815 			break;
2816 		}
2817 		copied += chunk;
2818 		size -= chunk;
2819 
2820 		if (drop_skb) {
2821 			/* The skb was fully consumed by a concurrent reader;
2822 			 * do not expect anything more from it and treat it
2823 			 * as invalid - we can be sure it was dropped from
2824 			 * the socket queue.
2825 			 *
2826 			 * Report a short read instead.
2827 			 */
2828 			err = 0;
2829 			break;
2830 		}
2831 
2832 		/* Mark read part of skb as used */
2833 		if (!(flags & MSG_PEEK)) {
2834 			UNIXCB(skb).consumed += chunk;
2835 
2836 			sk_peek_offset_bwd(sk, chunk);
2837 
2838 			if (UNIXCB(skb).fp) {
2839 				scm_stat_del(sk, skb);
2840 				unix_detach_fds(&scm, skb);
2841 			}
2842 
2843 			if (unix_skb_len(skb))
2844 				break;
2845 
2846 			skb_unlink(skb, &sk->sk_receive_queue);
2847 			consume_skb(skb);
2848 
2849 			if (scm.fp)
2850 				break;
2851 		} else {
2852 			/* It is questionable, see note in unix_dgram_recvmsg.
2853 			 */
2854 			if (UNIXCB(skb).fp)
2855 				unix_peek_fds(&scm, skb);
2856 
2857 			sk_peek_offset_fwd(sk, chunk);
2858 
2859 			if (UNIXCB(skb).fp)
2860 				break;
2861 
2862 			skip = 0;
2863 			last = skb;
2864 			last_len = skb->len;
2865 			unix_state_lock(sk);
2866 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2867 			if (skb)
2868 				goto again;
2869 			unix_state_unlock(sk);
2870 			break;
2871 		}
2872 	} while (size);
2873 
2874 	mutex_unlock(&u->iolock);
2875 	if (state->msg)
2876 		scm_recv_unix(sock, state->msg, &scm, flags);
2877 	else
2878 		scm_destroy(&scm);
2879 out:
2880 	return copied ? : err;
2881 }
2882 
2883 static int unix_stream_read_actor(struct sk_buff *skb,
2884 				  int skip, int chunk,
2885 				  struct unix_stream_read_state *state)
2886 {
2887 	int ret;
2888 
2889 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2890 				    state->msg, chunk);
2891 	return ret ?: chunk;
2892 }
2893 
2894 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2895 			  size_t size, int flags)
2896 {
2897 	struct unix_stream_read_state state = {
2898 		.recv_actor = unix_stream_read_actor,
2899 		.socket = sk->sk_socket,
2900 		.msg = msg,
2901 		.size = size,
2902 		.flags = flags
2903 	};
2904 
2905 	return unix_stream_read_generic(&state, true);
2906 }
2907 
2908 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2909 			       size_t size, int flags)
2910 {
2911 	struct unix_stream_read_state state = {
2912 		.recv_actor = unix_stream_read_actor,
2913 		.socket = sock,
2914 		.msg = msg,
2915 		.size = size,
2916 		.flags = flags
2917 	};
2918 
2919 #ifdef CONFIG_BPF_SYSCALL
2920 	struct sock *sk = sock->sk;
2921 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2922 
2923 	if (prot != &unix_stream_proto)
2924 		return prot->recvmsg(sk, msg, size, flags, NULL);
2925 #endif
2926 	return unix_stream_read_generic(&state, true);
2927 }
2928 
2929 static int unix_stream_splice_actor(struct sk_buff *skb,
2930 				    int skip, int chunk,
2931 				    struct unix_stream_read_state *state)
2932 {
2933 	return skb_splice_bits(skb, state->socket->sk,
2934 			       UNIXCB(skb).consumed + skip,
2935 			       state->pipe, chunk, state->splice_flags);
2936 }
2937 
2938 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2939 				       struct pipe_inode_info *pipe,
2940 				       size_t size, unsigned int flags)
2941 {
2942 	struct unix_stream_read_state state = {
2943 		.recv_actor = unix_stream_splice_actor,
2944 		.socket = sock,
2945 		.pipe = pipe,
2946 		.size = size,
2947 		.splice_flags = flags,
2948 	};
2949 
2950 	if (unlikely(*ppos))
2951 		return -ESPIPE;
2952 
2953 	if (sock->file->f_flags & O_NONBLOCK ||
2954 	    flags & SPLICE_F_NONBLOCK)
2955 		state.flags = MSG_DONTWAIT;
2956 
2957 	return unix_stream_read_generic(&state, false);
2958 }
2959 
2960 static int unix_shutdown(struct socket *sock, int mode)
2961 {
2962 	struct sock *sk = sock->sk;
2963 	struct sock *other;
2964 
2965 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2966 		return -EINVAL;
2967 	/* This maps:
2968 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2969 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2970 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2971 	 */
2972 	++mode;
2973 
2974 	unix_state_lock(sk);
2975 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2976 	other = unix_peer(sk);
2977 	if (other)
2978 		sock_hold(other);
2979 	unix_state_unlock(sk);
2980 	sk->sk_state_change(sk);
2981 
2982 	if (other &&
2983 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2984 
2985 		int peer_mode = 0;
2986 		const struct proto *prot = READ_ONCE(other->sk_prot);
2987 
2988 		if (prot->unhash)
2989 			prot->unhash(other);
2990 		if (mode&RCV_SHUTDOWN)
2991 			peer_mode |= SEND_SHUTDOWN;
2992 		if (mode&SEND_SHUTDOWN)
2993 			peer_mode |= RCV_SHUTDOWN;
2994 		unix_state_lock(other);
2995 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2996 		unix_state_unlock(other);
2997 		other->sk_state_change(other);
2998 		if (peer_mode == SHUTDOWN_MASK)
2999 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3000 		else if (peer_mode & RCV_SHUTDOWN)
3001 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3002 	}
3003 	if (other)
3004 		sock_put(other);
3005 
3006 	return 0;
3007 }
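
/* Illustrative userspace sketch (consume() is a hypothetical helper,
 * not part of this file): the classic half-close pattern that relies
 * on the SHUT_* to *_SHUTDOWN mapping above. The client stops sending,
 * the server sees EOF, and the reply can still be read until the
 * server closes its end.
 *
 *	write(fd, request, req_len);
 *	shutdown(fd, SHUT_WR);    (peer's read() now returns 0 at EOF)
 *
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		consume(buf, n);
 */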
3008 
3009 long unix_inq_len(struct sock *sk)
3010 {
3011 	struct sk_buff *skb;
3012 	long amount = 0;
3013 
3014 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3015 		return -EINVAL;
3016 
3017 	spin_lock(&sk->sk_receive_queue.lock);
3018 	if (sk->sk_type == SOCK_STREAM ||
3019 	    sk->sk_type == SOCK_SEQPACKET) {
3020 		skb_queue_walk(&sk->sk_receive_queue, skb)
3021 			amount += unix_skb_len(skb);
3022 	} else {
3023 		skb = skb_peek(&sk->sk_receive_queue);
3024 		if (skb)
3025 			amount = skb->len;
3026 	}
3027 	spin_unlock(&sk->sk_receive_queue.lock);
3028 
3029 	return amount;
3030 }
3031 EXPORT_SYMBOL_GPL(unix_inq_len);
3032 
3033 long unix_outq_len(struct sock *sk)
3034 {
3035 	return sk_wmem_alloc_get(sk);
3036 }
3037 EXPORT_SYMBOL_GPL(unix_outq_len);
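
/* Illustrative userspace sketch (not part of this file): unix_inq_len()
 * and unix_outq_len() back the SIOCINQ and SIOCOUTQ ioctls.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int unread, unsent;
 *
 *	ioctl(fd, SIOCINQ, &unread);    (payload bytes queued for reading)
 *	ioctl(fd, SIOCOUTQ, &unsent);   (write memory not yet consumed by the peer)
 */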
3038 
3039 static int unix_open_file(struct sock *sk)
3040 {
3041 	struct path path;
3042 	struct file *f;
3043 	int fd;
3044 
3045 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3046 		return -EPERM;
3047 
3048 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3049 		return -ENOENT;
3050 
3051 	path = unix_sk(sk)->path;
3052 	if (!path.dentry)
3053 		return -ENOENT;
3054 
3055 	path_get(&path);
3056 
3057 	fd = get_unused_fd_flags(O_CLOEXEC);
3058 	if (fd < 0)
3059 		goto out;
3060 
3061 	f = dentry_open(&path, O_PATH, current_cred());
3062 	if (IS_ERR(f)) {
3063 		put_unused_fd(fd);
3064 		fd = PTR_ERR(f);
3065 		goto out;
3066 	}
3067 
3068 	fd_install(fd, f);
3069 out:
3070 	path_put(&path);
3071 
3072 	return fd;
3073 }
3074 
3075 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3076 {
3077 	struct sock *sk = sock->sk;
3078 	long amount = 0;
3079 	int err;
3080 
3081 	switch (cmd) {
3082 	case SIOCOUTQ:
3083 		amount = unix_outq_len(sk);
3084 		err = put_user(amount, (int __user *)arg);
3085 		break;
3086 	case SIOCINQ:
3087 		amount = unix_inq_len(sk);
3088 		if (amount < 0)
3089 			err = amount;
3090 		else
3091 			err = put_user(amount, (int __user *)arg);
3092 		break;
3093 	case SIOCUNIXFILE:
3094 		err = unix_open_file(sk);
3095 		break;
3096 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3097 	case SIOCATMARK:
3098 		{
3099 			struct sk_buff *skb;
3100 			int answ = 0;
3101 
3102 			skb = skb_peek(&sk->sk_receive_queue);
3103 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3104 				answ = 1;
3105 			err = put_user(answ, (int __user *)arg);
3106 		}
3107 		break;
3108 #endif
3109 	default:
3110 		err = -ENOIOCTLCMD;
3111 		break;
3112 	}
3113 	return err;
3114 }
3115 
3116 #ifdef CONFIG_COMPAT
3117 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3118 {
3119 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3120 }
3121 #endif
3122 
3123 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3124 {
3125 	struct sock *sk = sock->sk;
3126 	unsigned char state;
3127 	__poll_t mask;
3128 	u8 shutdown;
3129 
3130 	sock_poll_wait(file, sock, wait);
3131 	mask = 0;
3132 	shutdown = READ_ONCE(sk->sk_shutdown);
3133 	state = READ_ONCE(sk->sk_state);
3134 
3135 	/* exceptional events? */
3136 	if (READ_ONCE(sk->sk_err))
3137 		mask |= EPOLLERR;
3138 	if (shutdown == SHUTDOWN_MASK)
3139 		mask |= EPOLLHUP;
3140 	if (shutdown & RCV_SHUTDOWN)
3141 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3142 
3143 	/* readable? */
3144 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3145 		mask |= EPOLLIN | EPOLLRDNORM;
3146 	if (sk_is_readable(sk))
3147 		mask |= EPOLLIN | EPOLLRDNORM;
3148 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3149 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3150 		mask |= EPOLLPRI;
3151 #endif
3152 
3153 	/* Connection-based need to check for termination and startup */
3154 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3155 	    state == TCP_CLOSE)
3156 		mask |= EPOLLHUP;
3157 
3158 	/*
3159 	 * We also set writable when the other side has shut down the
3160 	 * connection. This prevents stuck sockets.
3161 	 */
3162 	if (unix_writable(sk, state))
3163 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3164 
3165 	return mask;
3166 }
3167 
3168 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3169 				    poll_table *wait)
3170 {
3171 	struct sock *sk = sock->sk, *other;
3172 	unsigned int writable;
3173 	unsigned char state;
3174 	__poll_t mask;
3175 	u8 shutdown;
3176 
3177 	sock_poll_wait(file, sock, wait);
3178 	mask = 0;
3179 	shutdown = READ_ONCE(sk->sk_shutdown);
3180 	state = READ_ONCE(sk->sk_state);
3181 
3182 	/* exceptional events? */
3183 	if (READ_ONCE(sk->sk_err) ||
3184 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3185 		mask |= EPOLLERR |
3186 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3187 
3188 	if (shutdown & RCV_SHUTDOWN)
3189 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3190 	if (shutdown == SHUTDOWN_MASK)
3191 		mask |= EPOLLHUP;
3192 
3193 	/* readable? */
3194 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3195 		mask |= EPOLLIN | EPOLLRDNORM;
3196 	if (sk_is_readable(sk))
3197 		mask |= EPOLLIN | EPOLLRDNORM;
3198 
3199 	/* Connection-based need to check for termination and startup */
3200 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3201 		mask |= EPOLLHUP;
3202 
3203 	/* No write status requested, avoid expensive OUT tests. */
3204 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3205 		return mask;
3206 
3207 	writable = unix_writable(sk, state);
3208 	if (writable) {
3209 		unix_state_lock(sk);
3210 
3211 		other = unix_peer(sk);
3212 		if (other && unix_peer(other) != sk &&
3213 		    unix_recvq_full_lockless(other) &&
3214 		    unix_dgram_peer_wake_me(sk, other))
3215 			writable = 0;
3216 
3217 		unix_state_unlock(sk);
3218 	}
3219 
3220 	if (writable)
3221 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3222 	else
3223 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3224 
3225 	return mask;
3226 }
3227 
3228 #ifdef CONFIG_PROC_FS
3229 
3230 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3231 
3232 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3233 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3234 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
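
/* The iterator position packs (bucket << BUCKET_SPACE) | offset, where
 * offset is a 1-based index within the bucket. For example,
 * set_bucket_offset(5, 2) yields (5UL << BUCKET_SPACE) | 2, and
 * get_bucket()/get_offset() recover 5 and 2 from it.
 */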
3235 
3236 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3237 {
3238 	unsigned long offset = get_offset(*pos);
3239 	unsigned long bucket = get_bucket(*pos);
3240 	unsigned long count = 0;
3241 	struct sock *sk;
3242 
3243 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3244 	     sk; sk = sk_next(sk)) {
3245 		if (++count == offset)
3246 			break;
3247 	}
3248 
3249 	return sk;
3250 }
3251 
3252 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3253 {
3254 	unsigned long bucket = get_bucket(*pos);
3255 	struct net *net = seq_file_net(seq);
3256 	struct sock *sk;
3257 
3258 	while (bucket < UNIX_HASH_SIZE) {
3259 		spin_lock(&net->unx.table.locks[bucket]);
3260 
3261 		sk = unix_from_bucket(seq, pos);
3262 		if (sk)
3263 			return sk;
3264 
3265 		spin_unlock(&net->unx.table.locks[bucket]);
3266 
3267 		*pos = set_bucket_offset(++bucket, 1);
3268 	}
3269 
3270 	return NULL;
3271 }
3272 
3273 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3274 				  loff_t *pos)
3275 {
3276 	unsigned long bucket = get_bucket(*pos);
3277 
3278 	sk = sk_next(sk);
3279 	if (sk)
3280 		return sk;
3281 
3283 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3284 
3285 	*pos = set_bucket_offset(++bucket, 1);
3286 
3287 	return unix_get_first(seq, pos);
3288 }
3289 
3290 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3291 {
3292 	if (!*pos)
3293 		return SEQ_START_TOKEN;
3294 
3295 	return unix_get_first(seq, pos);
3296 }
3297 
3298 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3299 {
3300 	++*pos;
3301 
3302 	if (v == SEQ_START_TOKEN)
3303 		return unix_get_first(seq, pos);
3304 
3305 	return unix_get_next(seq, v, pos);
3306 }
3307 
3308 static void unix_seq_stop(struct seq_file *seq, void *v)
3309 {
3310 	struct sock *sk = v;
3311 
3312 	if (sk)
3313 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3314 }
3315 
3316 static int unix_seq_show(struct seq_file *seq, void *v)
3317 {
3318 
3319 	if (v == SEQ_START_TOKEN)
3320 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3321 			 "Inode Path\n");
3322 	else {
3323 		struct sock *s = v;
3324 		struct unix_sock *u = unix_sk(s);
3325 		unix_state_lock(s);
3326 
3327 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3328 			s,
3329 			refcount_read(&s->sk_refcnt),
3330 			0,
3331 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3332 			s->sk_type,
3333 			s->sk_socket ?
3334 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3335 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3336 			sock_i_ino(s));
3337 
3338 		if (u->addr) {	/* under a hash table lock here */
3339 			int i, len;
3340 			seq_putc(seq, ' ');
3341 
3342 			i = 0;
3343 			len = u->addr->len -
3344 				offsetof(struct sockaddr_un, sun_path);
3345 			if (u->addr->name->sun_path[0]) {
3346 				len--;
3347 			} else {
3348 				seq_putc(seq, '@');
3349 				i++;
3350 			}
3351 			for ( ; i < len; i++)
3352 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3353 					 '@');
3354 		}
3355 		unix_state_unlock(s);
3356 		seq_putc(seq, '\n');
3357 	}
3358 
3359 	return 0;
3360 }
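
/* A /proc/net/unix line produced above looks roughly like this
 * (illustrative values only):
 *
 *	0000000000000000: 00000002 00000000 00010000 0001 01 23456 /run/example.sock
 *
 * i.e. the kernel pointer (hashed or zeroed by %pK), refcount, protocol
 * (always 0), flags (__SO_ACCEPTCON for listeners), type, socket state,
 * inode and, if bound, the path (abstract names are shown with a
 * leading '@' and embedded NULs replaced by '@').
 */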
3361 
3362 static const struct seq_operations unix_seq_ops = {
3363 	.start  = unix_seq_start,
3364 	.next   = unix_seq_next,
3365 	.stop   = unix_seq_stop,
3366 	.show   = unix_seq_show,
3367 };
3368 
3369 #ifdef CONFIG_BPF_SYSCALL
3370 struct bpf_unix_iter_state {
3371 	struct seq_net_private p;
3372 	unsigned int cur_sk;
3373 	unsigned int end_sk;
3374 	unsigned int max_sk;
3375 	struct sock **batch;
3376 	bool st_bucket_done;
3377 };
3378 
3379 struct bpf_iter__unix {
3380 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3381 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3382 	uid_t uid __aligned(8);
3383 };
3384 
3385 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3386 			      struct unix_sock *unix_sk, uid_t uid)
3387 {
3388 	struct bpf_iter__unix ctx;
3389 
3390 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3391 	ctx.meta = meta;
3392 	ctx.unix_sk = unix_sk;
3393 	ctx.uid = uid;
3394 	return bpf_iter_run_prog(prog, &ctx);
3395 }
3396 
3397 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3399 {
3400 	struct bpf_unix_iter_state *iter = seq->private;
3401 	unsigned int expected = 1;
3402 	struct sock *sk;
3403 
3404 	sock_hold(start_sk);
3405 	iter->batch[iter->end_sk++] = start_sk;
3406 
3407 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3408 		if (iter->end_sk < iter->max_sk) {
3409 			sock_hold(sk);
3410 			iter->batch[iter->end_sk++] = sk;
3411 		}
3412 
3413 		expected++;
3414 	}
3415 
3416 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3417 
3418 	return expected;
3419 }
3420 
3421 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3422 {
3423 	while (iter->cur_sk < iter->end_sk)
3424 		sock_put(iter->batch[iter->cur_sk++]);
3425 }
3426 
3427 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3428 				       unsigned int new_batch_sz)
3429 {
3430 	struct sock **new_batch;
3431 
3432 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3433 			     GFP_USER | __GFP_NOWARN);
3434 	if (!new_batch)
3435 		return -ENOMEM;
3436 
3437 	bpf_iter_unix_put_batch(iter);
3438 	kvfree(iter->batch);
3439 	iter->batch = new_batch;
3440 	iter->max_sk = new_batch_sz;
3441 
3442 	return 0;
3443 }
3444 
3445 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3446 					loff_t *pos)
3447 {
3448 	struct bpf_unix_iter_state *iter = seq->private;
3449 	unsigned int expected;
3450 	bool resized = false;
3451 	struct sock *sk;
3452 
3453 	if (iter->st_bucket_done)
3454 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3455 
3456 again:
3457 	/* Get a new batch */
3458 	iter->cur_sk = 0;
3459 	iter->end_sk = 0;
3460 
3461 	sk = unix_get_first(seq, pos);
3462 	if (!sk)
3463 		return NULL; /* Done */
3464 
3465 	expected = bpf_iter_unix_hold_batch(seq, sk);
3466 
3467 	if (iter->end_sk == expected) {
3468 		iter->st_bucket_done = true;
3469 		return sk;
3470 	}
3471 
3472 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3473 		resized = true;
3474 		goto again;
3475 	}
3476 
3477 	return sk;
3478 }
3479 
3480 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3481 {
3482 	if (!*pos)
3483 		return SEQ_START_TOKEN;
3484 
3485 	/* bpf iter does not support lseek, so it always
3486 	 * continues from where it was stop()-ped.
3487 	 */
3488 	return bpf_iter_unix_batch(seq, pos);
3489 }
3490 
3491 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3492 {
3493 	struct bpf_unix_iter_state *iter = seq->private;
3494 	struct sock *sk;
3495 
3496 	/* Whenever seq_next() is called, iter->cur_sk has already been
3497 	 * through seq_show(), so advance to the next sk in
3498 	 * the batch.
3499 	 */
3500 	if (iter->cur_sk < iter->end_sk)
3501 		sock_put(iter->batch[iter->cur_sk++]);
3502 
3503 	++*pos;
3504 
3505 	if (iter->cur_sk < iter->end_sk)
3506 		sk = iter->batch[iter->cur_sk];
3507 	else
3508 		sk = bpf_iter_unix_batch(seq, pos);
3509 
3510 	return sk;
3511 }
3512 
3513 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3514 {
3515 	struct bpf_iter_meta meta;
3516 	struct bpf_prog *prog;
3517 	struct sock *sk = v;
3518 	uid_t uid;
3519 	bool slow;
3520 	int ret;
3521 
3522 	if (v == SEQ_START_TOKEN)
3523 		return 0;
3524 
3525 	slow = lock_sock_fast(sk);
3526 
3527 	if (unlikely(sk_unhashed(sk))) {
3528 		ret = SEQ_SKIP;
3529 		goto unlock;
3530 	}
3531 
3532 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3533 	meta.seq = seq;
3534 	prog = bpf_iter_get_info(&meta, false);
3535 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3536 unlock:
3537 	unlock_sock_fast(sk, slow);
3538 	return ret;
3539 }
3540 
3541 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3542 {
3543 	struct bpf_unix_iter_state *iter = seq->private;
3544 	struct bpf_iter_meta meta;
3545 	struct bpf_prog *prog;
3546 
3547 	if (!v) {
3548 		meta.seq = seq;
3549 		prog = bpf_iter_get_info(&meta, true);
3550 		if (prog)
3551 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3552 	}
3553 
3554 	if (iter->cur_sk < iter->end_sk)
3555 		bpf_iter_unix_put_batch(iter);
3556 }
3557 
3558 static const struct seq_operations bpf_iter_unix_seq_ops = {
3559 	.start	= bpf_iter_unix_seq_start,
3560 	.next	= bpf_iter_unix_seq_next,
3561 	.stop	= bpf_iter_unix_seq_stop,
3562 	.show	= bpf_iter_unix_seq_show,
3563 };
3564 #endif
3565 #endif
3566 
3567 static const struct net_proto_family unix_family_ops = {
3568 	.family = PF_UNIX,
3569 	.create = unix_create,
3570 	.owner	= THIS_MODULE,
3571 };
3572 
3573 
3574 static int __net_init unix_net_init(struct net *net)
3575 {
3576 	int i;
3577 
3578 	net->unx.sysctl_max_dgram_qlen = 10;
3579 	if (unix_sysctl_register(net))
3580 		goto out;
3581 
3582 #ifdef CONFIG_PROC_FS
3583 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3584 			     sizeof(struct seq_net_private)))
3585 		goto err_sysctl;
3586 #endif
3587 
3588 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3589 					      sizeof(spinlock_t), GFP_KERNEL);
3590 	if (!net->unx.table.locks)
3591 		goto err_proc;
3592 
3593 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3594 						sizeof(struct hlist_head),
3595 						GFP_KERNEL);
3596 	if (!net->unx.table.buckets)
3597 		goto free_locks;
3598 
3599 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3600 		spin_lock_init(&net->unx.table.locks[i]);
3601 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3602 	}
3603 
3604 	return 0;
3605 
3606 free_locks:
3607 	kvfree(net->unx.table.locks);
3608 err_proc:
3609 #ifdef CONFIG_PROC_FS
3610 	remove_proc_entry("unix", net->proc_net);
3611 err_sysctl:
3612 #endif
3613 	unix_sysctl_unregister(net);
3614 out:
3615 	return -ENOMEM;
3616 }
3617 
3618 static void __net_exit unix_net_exit(struct net *net)
3619 {
3620 	kvfree(net->unx.table.buckets);
3621 	kvfree(net->unx.table.locks);
3622 	unix_sysctl_unregister(net);
3623 	remove_proc_entry("unix", net->proc_net);
3624 }
3625 
3626 static struct pernet_operations unix_net_ops = {
3627 	.init = unix_net_init,
3628 	.exit = unix_net_exit,
3629 };
3630 
3631 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3632 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3633 		     struct unix_sock *unix_sk, uid_t uid)
3634 
3635 #define INIT_BATCH_SZ 16
3636 
3637 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3638 {
3639 	struct bpf_unix_iter_state *iter = priv_data;
3640 	int err;
3641 
3642 	err = bpf_iter_init_seq_net(priv_data, aux);
3643 	if (err)
3644 		return err;
3645 
3646 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3647 	if (err) {
3648 		bpf_iter_fini_seq_net(priv_data);
3649 		return err;
3650 	}
3651 
3652 	return 0;
3653 }
3654 
3655 static void bpf_iter_fini_unix(void *priv_data)
3656 {
3657 	struct bpf_unix_iter_state *iter = priv_data;
3658 
3659 	bpf_iter_fini_seq_net(priv_data);
3660 	kvfree(iter->batch);
3661 }
3662 
3663 static const struct bpf_iter_seq_info unix_seq_info = {
3664 	.seq_ops		= &bpf_iter_unix_seq_ops,
3665 	.init_seq_private	= bpf_iter_init_unix,
3666 	.fini_seq_private	= bpf_iter_fini_unix,
3667 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3668 };
3669 
3670 static const struct bpf_func_proto *
3671 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3672 			     const struct bpf_prog *prog)
3673 {
3674 	switch (func_id) {
3675 	case BPF_FUNC_setsockopt:
3676 		return &bpf_sk_setsockopt_proto;
3677 	case BPF_FUNC_getsockopt:
3678 		return &bpf_sk_getsockopt_proto;
3679 	default:
3680 		return NULL;
3681 	}
3682 }
3683 
3684 static struct bpf_iter_reg unix_reg_info = {
3685 	.target			= "unix",
3686 	.ctx_arg_info_size	= 1,
3687 	.ctx_arg_info		= {
3688 		{ offsetof(struct bpf_iter__unix, unix_sk),
3689 		  PTR_TO_BTF_ID_OR_NULL },
3690 	},
3691 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3692 	.seq_info		= &unix_seq_info,
3693 };
3694 
3695 static void __init bpf_iter_register(void)
3696 {
3697 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3698 	if (bpf_iter_reg_target(&unix_reg_info))
3699 		pr_warn("Warning: could not register bpf iterator unix\n");
3700 }
3701 #endif
3702 
3703 static int __init af_unix_init(void)
3704 {
3705 	int i, rc = -1;
3706 
3707 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3708 
3709 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3710 		spin_lock_init(&bsd_socket_locks[i]);
3711 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3712 	}
3713 
3714 	rc = proto_register(&unix_dgram_proto, 1);
3715 	if (rc != 0) {
3716 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3717 		goto out;
3718 	}
3719 
3720 	rc = proto_register(&unix_stream_proto, 1);
3721 	if (rc != 0) {
3722 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3723 		proto_unregister(&unix_dgram_proto);
3724 		goto out;
3725 	}
3726 
3727 	sock_register(&unix_family_ops);
3728 	register_pernet_subsys(&unix_net_ops);
3729 	unix_bpf_build_proto();
3730 
3731 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3732 	bpf_iter_register();
3733 #endif
3734 
3735 out:
3736 	return rc;
3737 }
3738 
3739 /* Later than subsys_initcall() because we depend on stuff initialised there */
3740 fs_initcall(af_unix_init);
3741