xref: /linux/net/unix/af_unix.c (revision 9a95c5bfbf02a0a7f5983280fe284a0ff0836c34)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko Eißfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with a 0 byte, so that this name space does not intersect
75  *		  with BSD names.
76  */
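
/* An illustrative userspace sketch of the "abstract" binding described above;
 * the name "example" and the fd are arbitrary.  The leading 0 byte marks the
 * name as abstract, the name is not NUL terminated, and addr_len must count
 * only the bytes actually used:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	memcpy(a.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */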
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
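
/* Taken together, the three hash functions above partition the per-netns
 * table (a sketch, assuming UNIX_HASH_SIZE == 2 * (UNIX_HASH_MOD + 1) as the
 * bsd_socket_buckets[UNIX_HASH_SIZE / 2] declaration implies): unbound and
 * filesystem (BSD) sockets hash into the lower half [0, UNIX_HASH_MOD],
 * while abstract sockets hash into the upper half
 * [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1], so the two name spaces never
 * share a bucket.
 */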
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	if (hash1 == hash2) {
163 		spin_lock(&net->unx.table.locks[hash1]);
164 		return;
165 	}
166 
167 	if (hash1 > hash2)
168 		swap(hash1, hash2);
169 
170 	spin_lock(&net->unx.table.locks[hash1]);
171 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173 
174 static void unix_table_double_unlock(struct net *net,
175 				     unsigned int hash1, unsigned int hash2)
176 {
177 	if (hash1 == hash2) {
178 		spin_unlock(&net->unx.table.locks[hash1]);
179 		return;
180 	}
181 
182 	spin_unlock(&net->unx.table.locks[hash1]);
183 	spin_unlock(&net->unx.table.locks[hash2]);
184 }
185 
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189 	UNIXCB(skb).secid = scm->secid;
190 }
191 
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194 	scm->secid = UNIXCB(skb).secid;
195 }
196 
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199 	return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204 
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207 
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210 	return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213 
214 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215 {
216 	return unix_peer(osk) == sk;
217 }
218 
219 static inline int unix_may_send(struct sock *sk, struct sock *osk)
220 {
221 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222 }
223 
224 static inline int unix_recvq_full_lockless(const struct sock *sk)
225 {
226 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228 
229 struct sock *unix_peer_get(struct sock *s)
230 {
231 	struct sock *peer;
232 
233 	unix_state_lock(s);
234 	peer = unix_peer(s);
235 	if (peer)
236 		sock_hold(peer);
237 	unix_state_unlock(s);
238 	return peer;
239 }
240 EXPORT_SYMBOL_GPL(unix_peer_get);
241 
242 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
243 					     int addr_len)
244 {
245 	struct unix_address *addr;
246 
247 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
248 	if (!addr)
249 		return NULL;
250 
251 	refcount_set(&addr->refcnt, 1);
252 	addr->len = addr_len;
253 	memcpy(addr->name, sunaddr, addr_len);
254 
255 	return addr;
256 }
257 
258 static inline void unix_release_addr(struct unix_address *addr)
259 {
260 	if (refcount_dec_and_test(&addr->refcnt))
261 		kfree(addr);
262 }
263 
264 /*
265  *	Check unix socket name:
266  *		- should not be zero length.
267  *	        - if it does not start with a zero byte, it should be NUL terminated (FS object)
268  *		- if it starts with a zero byte, it is an abstract name.
269  */
270 
271 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
272 {
273 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
274 	    addr_len > sizeof(*sunaddr))
275 		return -EINVAL;
276 
277 	if (sunaddr->sun_family != AF_UNIX)
278 		return -EINVAL;
279 
280 	return 0;
281 }
282 
283 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
284 {
285 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
286 	short offset = offsetof(struct sockaddr_storage, __data);
287 
288 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
289 
290 	/* This may look like an off by one error but it is a bit more
291 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
292 	 * sun_path[108] doesn't as such exist.  However in kernel space
293 	 * we are guaranteed that it is a valid memory location in our
294 	 * kernel address buffer because syscall functions always pass
295 	 * a pointer to struct sockaddr_storage, which has a bigger buffer
296 	 * than 108.  Also, we must terminate sun_path for strlen() in
297 	 * getname_kernel().
298 	 */
299 	addr->__data[addr_len - offset] = 0;
300 
301 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
302 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
303 	 * know the actual buffer.
304 	 */
305 	return strlen(addr->__data) + offset + 1;
306 }
307 
308 static void __unix_remove_socket(struct sock *sk)
309 {
310 	sk_del_node_init(sk);
311 }
312 
313 static void __unix_insert_socket(struct net *net, struct sock *sk)
314 {
315 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
316 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
317 }
318 
319 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
320 				 struct unix_address *addr, unsigned int hash)
321 {
322 	__unix_remove_socket(sk);
323 	smp_store_release(&unix_sk(sk)->addr, addr);
324 
325 	sk->sk_hash = hash;
326 	__unix_insert_socket(net, sk);
327 }
328 
329 static void unix_remove_socket(struct net *net, struct sock *sk)
330 {
331 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
332 	__unix_remove_socket(sk);
333 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
334 }
335 
336 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
337 {
338 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
339 	__unix_insert_socket(net, sk);
340 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
341 }
342 
343 static void unix_insert_bsd_socket(struct sock *sk)
344 {
345 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
346 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
347 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
348 }
349 
350 static void unix_remove_bsd_socket(struct sock *sk)
351 {
352 	if (!hlist_unhashed(&sk->sk_bind_node)) {
353 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
354 		__sk_del_bind_node(sk);
355 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
356 
357 		sk_node_init(&sk->sk_bind_node);
358 	}
359 }
360 
361 static struct sock *__unix_find_socket_byname(struct net *net,
362 					      struct sockaddr_un *sunname,
363 					      int len, unsigned int hash)
364 {
365 	struct sock *s;
366 
367 	sk_for_each(s, &net->unx.table.buckets[hash]) {
368 		struct unix_sock *u = unix_sk(s);
369 
370 		if (u->addr->len == len &&
371 		    !memcmp(u->addr->name, sunname, len))
372 			return s;
373 	}
374 	return NULL;
375 }
376 
377 static inline struct sock *unix_find_socket_byname(struct net *net,
378 						   struct sockaddr_un *sunname,
379 						   int len, unsigned int hash)
380 {
381 	struct sock *s;
382 
383 	spin_lock(&net->unx.table.locks[hash]);
384 	s = __unix_find_socket_byname(net, sunname, len, hash);
385 	if (s)
386 		sock_hold(s);
387 	spin_unlock(&net->unx.table.locks[hash]);
388 	return s;
389 }
390 
391 static struct sock *unix_find_socket_byinode(struct inode *i)
392 {
393 	unsigned int hash = unix_bsd_hash(i);
394 	struct sock *s;
395 
396 	spin_lock(&bsd_socket_locks[hash]);
397 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
398 		struct dentry *dentry = unix_sk(s)->path.dentry;
399 
400 		if (dentry && d_backing_inode(dentry) == i) {
401 			sock_hold(s);
402 			spin_unlock(&bsd_socket_locks[hash]);
403 			return s;
404 		}
405 	}
406 	spin_unlock(&bsd_socket_locks[hash]);
407 	return NULL;
408 }
409 
410 /* Support code for asymmetrically connected dgram sockets
411  *
412  * If a datagram socket is connected to a socket not itself connected
413  * to the first socket (eg, /dev/log), clients may only enqueue more
414  * messages if the present receive queue of the server socket is not
415  * "too large". This means there's a second writeability condition
416  * poll and sendmsg need to test. The dgram recv code will do a wake
417  * up on the peer_wait wait queue of a socket upon reception of a
418  * datagram which needs to be propagated to sleeping would-be writers
419  * since these might not have sent anything so far. This can't be
420  * accomplished via poll_wait because the lifetime of the server
421  * socket might be less than that of its clients if these break their
422  * association with it or if the server socket is closed while clients
423  * are still connected to it and there's no way to inform "a polling
424  * implementation" that it should let go of a certain wait queue.
425  *
426  * In order to propagate a wake up, a wait_queue_entry_t of the client
427  * socket is enqueued on the peer_wait queue of the server socket
428  * whose wake function does a wake_up on the ordinary client socket
429  * wait queue. This connection is established whenever a write (or
430  * poll for write) hits the flow control condition and is broken when the
431  * association to the server socket is dissolved or after a wake up
432  * was relayed.
433  */
434 
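/* In short, the helpers below implement that relay:
 * unix_dgram_peer_wake_connect() hooks the sender's peer_wake entry onto the
 * receiver's peer_wait queue, unix_dgram_peer_wake_relay() forwards a wake-up
 * from that queue to the sender's own wait queue and detaches the entry, and
 * the disconnect helpers drop the entry when the association goes away.
 */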
435 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
436 				      void *key)
437 {
438 	struct unix_sock *u;
439 	wait_queue_head_t *u_sleep;
440 
441 	u = container_of(q, struct unix_sock, peer_wake);
442 
443 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
444 			    q);
445 	u->peer_wake.private = NULL;
446 
447 	/* relaying can only happen while the wq still exists */
448 	u_sleep = sk_sleep(&u->sk);
449 	if (u_sleep)
450 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
451 
452 	return 0;
453 }
454 
455 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
456 {
457 	struct unix_sock *u, *u_other;
458 	int rc;
459 
460 	u = unix_sk(sk);
461 	u_other = unix_sk(other);
462 	rc = 0;
463 	spin_lock(&u_other->peer_wait.lock);
464 
465 	if (!u->peer_wake.private) {
466 		u->peer_wake.private = other;
467 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
468 
469 		rc = 1;
470 	}
471 
472 	spin_unlock(&u_other->peer_wait.lock);
473 	return rc;
474 }
475 
476 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
477 					    struct sock *other)
478 {
479 	struct unix_sock *u, *u_other;
480 
481 	u = unix_sk(sk);
482 	u_other = unix_sk(other);
483 	spin_lock(&u_other->peer_wait.lock);
484 
485 	if (u->peer_wake.private == other) {
486 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
487 		u->peer_wake.private = NULL;
488 	}
489 
490 	spin_unlock(&u_other->peer_wait.lock);
491 }
492 
493 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
494 						   struct sock *other)
495 {
496 	unix_dgram_peer_wake_disconnect(sk, other);
497 	wake_up_interruptible_poll(sk_sleep(sk),
498 				   EPOLLOUT |
499 				   EPOLLWRNORM |
500 				   EPOLLWRBAND);
501 }
502 
503 /* preconditions:
504  *	- unix_peer(sk) == other
505  *	- association is stable
506  */
507 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
508 {
509 	int connected;
510 
511 	connected = unix_dgram_peer_wake_connect(sk, other);
512 
513 	/* If other is SOCK_DEAD, we want to make sure we signal
514 	 * POLLOUT, such that a subsequent write() can get a
515 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
516 	 * to other and it's full, we will hang waiting for POLLOUT.
517 	 */
518 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
519 		return 1;
520 
521 	if (connected)
522 		unix_dgram_peer_wake_disconnect(sk, other);
523 
524 	return 0;
525 }
526 
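/* A note on the check below: the socket counts as writable only while the
 * bytes charged to sk_wmem_alloc stay at or below one quarter of sk_sndbuf
 * (i.e. wmem_alloc * 4 <= sndbuf), and never while it is listening.
 */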
527 static int unix_writable(const struct sock *sk, unsigned char state)
528 {
529 	return state != TCP_LISTEN &&
530 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
531 }
532 
533 static void unix_write_space(struct sock *sk)
534 {
535 	struct socket_wq *wq;
536 
537 	rcu_read_lock();
538 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
539 		wq = rcu_dereference(sk->sk_wq);
540 		if (skwq_has_sleeper(wq))
541 			wake_up_interruptible_sync_poll(&wq->wait,
542 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
543 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
544 	}
545 	rcu_read_unlock();
546 }
547 
548 /* When dgram socket disconnects (or changes its peer), we clear its receive
549  * queue of packets that arrived from the previous peer. First, it allows us to do
550  * flow control based only on wmem_alloc; second, sk connected to peer
551  * may receive messages only from that peer. */
552 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
553 {
554 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
555 		skb_queue_purge(&sk->sk_receive_queue);
556 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
557 
558 		/* If one link of bidirectional dgram pipe is disconnected,
559 		 * we signal an error. Messages are lost. Do not do this
560 		 * when the peer was not connected to us.
561 		 */
562 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
563 			WRITE_ONCE(other->sk_err, ECONNRESET);
564 			sk_error_report(other);
565 		}
566 	}
567 }
568 
569 static void unix_sock_destructor(struct sock *sk)
570 {
571 	struct unix_sock *u = unix_sk(sk);
572 
573 	skb_queue_purge(&sk->sk_receive_queue);
574 
575 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
576 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
577 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
578 	if (!sock_flag(sk, SOCK_DEAD)) {
579 		pr_info("Attempt to release alive unix socket: %p\n", sk);
580 		return;
581 	}
582 
583 	if (u->addr)
584 		unix_release_addr(u->addr);
585 
586 	atomic_long_dec(&unix_nr_socks);
587 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
588 #ifdef UNIX_REFCNT_DEBUG
589 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
590 		atomic_long_read(&unix_nr_socks));
591 #endif
592 }
593 
594 static void unix_release_sock(struct sock *sk, int embrion)
595 {
596 	struct unix_sock *u = unix_sk(sk);
597 	struct sock *skpair;
598 	struct sk_buff *skb;
599 	struct path path;
600 	int state;
601 
602 	unix_remove_socket(sock_net(sk), sk);
603 	unix_remove_bsd_socket(sk);
604 
605 	/* Clear state */
606 	unix_state_lock(sk);
607 	sock_orphan(sk);
608 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
609 	path	     = u->path;
610 	u->path.dentry = NULL;
611 	u->path.mnt = NULL;
612 	state = sk->sk_state;
613 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
614 
615 	skpair = unix_peer(sk);
616 	unix_peer(sk) = NULL;
617 
618 	unix_state_unlock(sk);
619 
620 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
621 	if (u->oob_skb) {
622 		kfree_skb(u->oob_skb);
623 		u->oob_skb = NULL;
624 	}
625 #endif
626 
627 	wake_up_interruptible_all(&u->peer_wait);
628 
629 	if (skpair != NULL) {
630 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
631 			unix_state_lock(skpair);
632 			/* No more writes */
633 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
634 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
635 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
636 			unix_state_unlock(skpair);
637 			skpair->sk_state_change(skpair);
638 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
639 		}
640 
641 		unix_dgram_peer_wake_disconnect(sk, skpair);
642 		sock_put(skpair); /* It may now die */
643 	}
644 
645 	/* Try to flush out this socket. Throw out buffers at least */
646 
647 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
648 		if (state == TCP_LISTEN)
649 			unix_release_sock(skb->sk, 1);
650 		/* passed fds are erased in the kfree_skb hook	      */
651 		UNIXCB(skb).consumed = skb->len;
652 		kfree_skb(skb);
653 	}
654 
655 	if (path.dentry)
656 		path_put(&path);
657 
658 	sock_put(sk);
659 
660 	/* ---- Socket is dead now and most probably destroyed ---- */
661 
662 	/*
663 	 * Fixme: BSD difference: In BSD all sockets connected to us get
664 	 *	  ECONNRESET and we die on the spot. In Linux we behave
665 	 *	  like files and pipes do and wait for the last
666 	 *	  dereference.
667 	 *
668 	 * Can't we simply set sock->err?
669 	 *
670 	 *	  What the above comment does talk about? --ANK(980817)
671 	 */
672 
673 	if (READ_ONCE(unix_tot_inflight))
674 		unix_gc();		/* Garbage collect fds */
675 }
676 
677 static void init_peercred(struct sock *sk)
678 {
679 	const struct cred *old_cred;
680 	struct pid *old_pid;
681 
682 	spin_lock(&sk->sk_peer_lock);
683 	old_pid = sk->sk_peer_pid;
684 	old_cred = sk->sk_peer_cred;
685 	sk->sk_peer_pid  = get_pid(task_tgid(current));
686 	sk->sk_peer_cred = get_current_cred();
687 	spin_unlock(&sk->sk_peer_lock);
688 
689 	put_pid(old_pid);
690 	put_cred(old_cred);
691 }
692 
693 static void copy_peercred(struct sock *sk, struct sock *peersk)
694 {
695 	const struct cred *old_cred;
696 	struct pid *old_pid;
697 
698 	if (sk < peersk) {
699 		spin_lock(&sk->sk_peer_lock);
700 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
701 	} else {
702 		spin_lock(&peersk->sk_peer_lock);
703 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
704 	}
705 	old_pid = sk->sk_peer_pid;
706 	old_cred = sk->sk_peer_cred;
707 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
708 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
709 
710 	spin_unlock(&sk->sk_peer_lock);
711 	spin_unlock(&peersk->sk_peer_lock);
712 
713 	put_pid(old_pid);
714 	put_cred(old_cred);
715 }
716 
717 static int unix_listen(struct socket *sock, int backlog)
718 {
719 	int err;
720 	struct sock *sk = sock->sk;
721 	struct unix_sock *u = unix_sk(sk);
722 
723 	err = -EOPNOTSUPP;
724 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
725 		goto out;	/* Only stream/seqpacket sockets accept */
726 	err = -EINVAL;
727 	if (!READ_ONCE(u->addr))
728 		goto out;	/* No listens on an unbound socket */
729 	unix_state_lock(sk);
730 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
731 		goto out_unlock;
732 	if (backlog > sk->sk_max_ack_backlog)
733 		wake_up_interruptible_all(&u->peer_wait);
734 	sk->sk_max_ack_backlog	= backlog;
735 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
736 
737 	/* set credentials so connect can copy them */
738 	init_peercred(sk);
739 	err = 0;
740 
741 out_unlock:
742 	unix_state_unlock(sk);
743 out:
744 	return err;
745 }
746 
747 static int unix_release(struct socket *);
748 static int unix_bind(struct socket *, struct sockaddr *, int);
749 static int unix_stream_connect(struct socket *, struct sockaddr *,
750 			       int addr_len, int flags);
751 static int unix_socketpair(struct socket *, struct socket *);
752 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
753 static int unix_getname(struct socket *, struct sockaddr *, int);
754 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
755 static __poll_t unix_dgram_poll(struct file *, struct socket *,
756 				    poll_table *);
757 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
758 #ifdef CONFIG_COMPAT
759 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
760 #endif
761 static int unix_shutdown(struct socket *, int);
762 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
763 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
764 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
765 				       struct pipe_inode_info *, size_t size,
766 				       unsigned int flags);
767 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
768 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
769 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
770 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
771 static int unix_dgram_connect(struct socket *, struct sockaddr *,
772 			      int, int);
773 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
774 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
775 				  int);
776 
777 #ifdef CONFIG_PROC_FS
778 static int unix_count_nr_fds(struct sock *sk)
779 {
780 	struct sk_buff *skb;
781 	struct unix_sock *u;
782 	int nr_fds = 0;
783 
784 	spin_lock(&sk->sk_receive_queue.lock);
785 	skb = skb_peek(&sk->sk_receive_queue);
786 	while (skb) {
787 		u = unix_sk(skb->sk);
788 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
789 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
790 	}
791 	spin_unlock(&sk->sk_receive_queue.lock);
792 
793 	return nr_fds;
794 }
795 
796 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
797 {
798 	struct sock *sk = sock->sk;
799 	unsigned char s_state;
800 	struct unix_sock *u;
801 	int nr_fds = 0;
802 
803 	if (sk) {
804 		s_state = READ_ONCE(sk->sk_state);
805 		u = unix_sk(sk);
806 
807 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
808 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
809 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
810 		 */
811 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
812 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
813 		else if (s_state == TCP_LISTEN)
814 			nr_fds = unix_count_nr_fds(sk);
815 
816 		seq_printf(m, "scm_fds: %u\n", nr_fds);
817 	}
818 }
819 #else
820 #define unix_show_fdinfo NULL
821 #endif
822 
823 static const struct proto_ops unix_stream_ops = {
824 	.family =	PF_UNIX,
825 	.owner =	THIS_MODULE,
826 	.release =	unix_release,
827 	.bind =		unix_bind,
828 	.connect =	unix_stream_connect,
829 	.socketpair =	unix_socketpair,
830 	.accept =	unix_accept,
831 	.getname =	unix_getname,
832 	.poll =		unix_poll,
833 	.ioctl =	unix_ioctl,
834 #ifdef CONFIG_COMPAT
835 	.compat_ioctl =	unix_compat_ioctl,
836 #endif
837 	.listen =	unix_listen,
838 	.shutdown =	unix_shutdown,
839 	.sendmsg =	unix_stream_sendmsg,
840 	.recvmsg =	unix_stream_recvmsg,
841 	.read_skb =	unix_stream_read_skb,
842 	.mmap =		sock_no_mmap,
843 	.splice_read =	unix_stream_splice_read,
844 	.set_peek_off =	sk_set_peek_off,
845 	.show_fdinfo =	unix_show_fdinfo,
846 };
847 
848 static const struct proto_ops unix_dgram_ops = {
849 	.family =	PF_UNIX,
850 	.owner =	THIS_MODULE,
851 	.release =	unix_release,
852 	.bind =		unix_bind,
853 	.connect =	unix_dgram_connect,
854 	.socketpair =	unix_socketpair,
855 	.accept =	sock_no_accept,
856 	.getname =	unix_getname,
857 	.poll =		unix_dgram_poll,
858 	.ioctl =	unix_ioctl,
859 #ifdef CONFIG_COMPAT
860 	.compat_ioctl =	unix_compat_ioctl,
861 #endif
862 	.listen =	sock_no_listen,
863 	.shutdown =	unix_shutdown,
864 	.sendmsg =	unix_dgram_sendmsg,
865 	.read_skb =	unix_read_skb,
866 	.recvmsg =	unix_dgram_recvmsg,
867 	.mmap =		sock_no_mmap,
868 	.set_peek_off =	sk_set_peek_off,
869 	.show_fdinfo =	unix_show_fdinfo,
870 };
871 
872 static const struct proto_ops unix_seqpacket_ops = {
873 	.family =	PF_UNIX,
874 	.owner =	THIS_MODULE,
875 	.release =	unix_release,
876 	.bind =		unix_bind,
877 	.connect =	unix_stream_connect,
878 	.socketpair =	unix_socketpair,
879 	.accept =	unix_accept,
880 	.getname =	unix_getname,
881 	.poll =		unix_dgram_poll,
882 	.ioctl =	unix_ioctl,
883 #ifdef CONFIG_COMPAT
884 	.compat_ioctl =	unix_compat_ioctl,
885 #endif
886 	.listen =	unix_listen,
887 	.shutdown =	unix_shutdown,
888 	.sendmsg =	unix_seqpacket_sendmsg,
889 	.recvmsg =	unix_seqpacket_recvmsg,
890 	.mmap =		sock_no_mmap,
891 	.set_peek_off =	sk_set_peek_off,
892 	.show_fdinfo =	unix_show_fdinfo,
893 };
894 
895 static void unix_close(struct sock *sk, long timeout)
896 {
897 	/* Nothing to do here, unix socket does not need a ->close().
898 	 * This is merely for sockmap.
899 	 */
900 }
901 
902 static void unix_unhash(struct sock *sk)
903 {
904 	/* Nothing to do here, unix socket does not need a ->unhash().
905 	 * This is merely for sockmap.
906 	 */
907 }
908 
909 static bool unix_bpf_bypass_getsockopt(int level, int optname)
910 {
911 	if (level == SOL_SOCKET) {
912 		switch (optname) {
913 		case SO_PEERPIDFD:
914 			return true;
915 		default:
916 			return false;
917 		}
918 	}
919 
920 	return false;
921 }
922 
923 struct proto unix_dgram_proto = {
924 	.name			= "UNIX",
925 	.owner			= THIS_MODULE,
926 	.obj_size		= sizeof(struct unix_sock),
927 	.close			= unix_close,
928 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
929 #ifdef CONFIG_BPF_SYSCALL
930 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
931 #endif
932 };
933 
934 struct proto unix_stream_proto = {
935 	.name			= "UNIX-STREAM",
936 	.owner			= THIS_MODULE,
937 	.obj_size		= sizeof(struct unix_sock),
938 	.close			= unix_close,
939 	.unhash			= unix_unhash,
940 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
941 #ifdef CONFIG_BPF_SYSCALL
942 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
943 #endif
944 };
945 
946 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
947 {
948 	struct unix_sock *u;
949 	struct sock *sk;
950 	int err;
951 
952 	atomic_long_inc(&unix_nr_socks);
953 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
954 		err = -ENFILE;
955 		goto err;
956 	}
957 
958 	if (type == SOCK_STREAM)
959 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
960 	else /* dgram and seqpacket */
961 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
962 
963 	if (!sk) {
964 		err = -ENOMEM;
965 		goto err;
966 	}
967 
968 	sock_init_data(sock, sk);
969 
970 	sk->sk_hash		= unix_unbound_hash(sk);
971 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
972 	sk->sk_write_space	= unix_write_space;
973 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
974 	sk->sk_destruct		= unix_sock_destructor;
975 	u = unix_sk(sk);
976 	u->listener = NULL;
977 	u->vertex = NULL;
978 	u->path.dentry = NULL;
979 	u->path.mnt = NULL;
980 	spin_lock_init(&u->lock);
981 	mutex_init(&u->iolock); /* single task reading lock */
982 	mutex_init(&u->bindlock); /* single task binding lock */
983 	init_waitqueue_head(&u->peer_wait);
984 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
985 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
986 	unix_insert_unbound_socket(net, sk);
987 
988 	sock_prot_inuse_add(net, sk->sk_prot, 1);
989 
990 	return sk;
991 
992 err:
993 	atomic_long_dec(&unix_nr_socks);
994 	return ERR_PTR(err);
995 }
996 
997 static int unix_create(struct net *net, struct socket *sock, int protocol,
998 		       int kern)
999 {
1000 	struct sock *sk;
1001 
1002 	if (protocol && protocol != PF_UNIX)
1003 		return -EPROTONOSUPPORT;
1004 
1005 	sock->state = SS_UNCONNECTED;
1006 
1007 	switch (sock->type) {
1008 	case SOCK_STREAM:
1009 		sock->ops = &unix_stream_ops;
1010 		break;
1011 		/*
1012 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1013 		 *	nothing uses it.
1014 		 */
1015 	case SOCK_RAW:
1016 		sock->type = SOCK_DGRAM;
1017 		fallthrough;
1018 	case SOCK_DGRAM:
1019 		sock->ops = &unix_dgram_ops;
1020 		break;
1021 	case SOCK_SEQPACKET:
1022 		sock->ops = &unix_seqpacket_ops;
1023 		break;
1024 	default:
1025 		return -ESOCKTNOSUPPORT;
1026 	}
1027 
1028 	sk = unix_create1(net, sock, kern, sock->type);
1029 	if (IS_ERR(sk))
1030 		return PTR_ERR(sk);
1031 
1032 	return 0;
1033 }
1034 
1035 static int unix_release(struct socket *sock)
1036 {
1037 	struct sock *sk = sock->sk;
1038 
1039 	if (!sk)
1040 		return 0;
1041 
1042 	sk->sk_prot->close(sk, 0);
1043 	unix_release_sock(sk, 0);
1044 	sock->sk = NULL;
1045 
1046 	return 0;
1047 }
1048 
1049 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1050 				  int type)
1051 {
1052 	struct inode *inode;
1053 	struct path path;
1054 	struct sock *sk;
1055 	int err;
1056 
1057 	unix_mkname_bsd(sunaddr, addr_len);
1058 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1059 	if (err)
1060 		goto fail;
1061 
1062 	err = path_permission(&path, MAY_WRITE);
1063 	if (err)
1064 		goto path_put;
1065 
1066 	err = -ECONNREFUSED;
1067 	inode = d_backing_inode(path.dentry);
1068 	if (!S_ISSOCK(inode->i_mode))
1069 		goto path_put;
1070 
1071 	sk = unix_find_socket_byinode(inode);
1072 	if (!sk)
1073 		goto path_put;
1074 
1075 	err = -EPROTOTYPE;
1076 	if (sk->sk_type == type)
1077 		touch_atime(&path);
1078 	else
1079 		goto sock_put;
1080 
1081 	path_put(&path);
1082 
1083 	return sk;
1084 
1085 sock_put:
1086 	sock_put(sk);
1087 path_put:
1088 	path_put(&path);
1089 fail:
1090 	return ERR_PTR(err);
1091 }
1092 
1093 static struct sock *unix_find_abstract(struct net *net,
1094 				       struct sockaddr_un *sunaddr,
1095 				       int addr_len, int type)
1096 {
1097 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1098 	struct dentry *dentry;
1099 	struct sock *sk;
1100 
1101 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1102 	if (!sk)
1103 		return ERR_PTR(-ECONNREFUSED);
1104 
1105 	dentry = unix_sk(sk)->path.dentry;
1106 	if (dentry)
1107 		touch_atime(&unix_sk(sk)->path);
1108 
1109 	return sk;
1110 }
1111 
1112 static struct sock *unix_find_other(struct net *net,
1113 				    struct sockaddr_un *sunaddr,
1114 				    int addr_len, int type)
1115 {
1116 	struct sock *sk;
1117 
1118 	if (sunaddr->sun_path[0])
1119 		sk = unix_find_bsd(sunaddr, addr_len, type);
1120 	else
1121 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1122 
1123 	return sk;
1124 }
1125 
1126 static int unix_autobind(struct sock *sk)
1127 {
1128 	struct unix_sock *u = unix_sk(sk);
1129 	unsigned int new_hash, old_hash;
1130 	struct net *net = sock_net(sk);
1131 	struct unix_address *addr;
1132 	u32 lastnum, ordernum;
1133 	int err;
1134 
1135 	err = mutex_lock_interruptible(&u->bindlock);
1136 	if (err)
1137 		return err;
1138 
1139 	if (u->addr)
1140 		goto out;
1141 
1142 	err = -ENOMEM;
1143 	addr = kzalloc(sizeof(*addr) +
1144 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1145 	if (!addr)
1146 		goto out;
1147 
1148 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1149 	addr->name->sun_family = AF_UNIX;
1150 	refcount_set(&addr->refcnt, 1);
1151 
1152 	old_hash = sk->sk_hash;
1153 	ordernum = get_random_u32();
1154 	lastnum = ordernum & 0xFFFFF;
1155 retry:
1156 	ordernum = (ordernum + 1) & 0xFFFFF;
1157 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1158 
1159 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1160 	unix_table_double_lock(net, old_hash, new_hash);
1161 
1162 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1163 		unix_table_double_unlock(net, old_hash, new_hash);
1164 
1165 		/* __unix_find_socket_byname() may take a long time if many names
1166 		 * are already in use.
1167 		 */
1168 		cond_resched();
1169 
1170 		if (ordernum == lastnum) {
1171 			/* Give up if all names seem to be in use. */
1172 			err = -ENOSPC;
1173 			unix_release_addr(addr);
1174 			goto out;
1175 		}
1176 
1177 		goto retry;
1178 	}
1179 
1180 	__unix_set_addr_hash(net, sk, addr, new_hash);
1181 	unix_table_double_unlock(net, old_hash, new_hash);
1182 	err = 0;
1183 
1184 out:	mutex_unlock(&u->bindlock);
1185 	return err;
1186 }
1187 
1188 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1189 			 int addr_len)
1190 {
1191 	umode_t mode = S_IFSOCK |
1192 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1193 	struct unix_sock *u = unix_sk(sk);
1194 	unsigned int new_hash, old_hash;
1195 	struct net *net = sock_net(sk);
1196 	struct mnt_idmap *idmap;
1197 	struct unix_address *addr;
1198 	struct dentry *dentry;
1199 	struct path parent;
1200 	int err;
1201 
1202 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1203 	addr = unix_create_addr(sunaddr, addr_len);
1204 	if (!addr)
1205 		return -ENOMEM;
1206 
1207 	/*
1208 	 * Get the parent directory, calculate the hash for last
1209 	 * component.
1210 	 */
1211 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1212 	if (IS_ERR(dentry)) {
1213 		err = PTR_ERR(dentry);
1214 		goto out;
1215 	}
1216 
1217 	/*
1218 	 * All right, let's create it.
1219 	 */
1220 	idmap = mnt_idmap(parent.mnt);
1221 	err = security_path_mknod(&parent, dentry, mode, 0);
1222 	if (!err)
1223 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1224 	if (err)
1225 		goto out_path;
1226 	err = mutex_lock_interruptible(&u->bindlock);
1227 	if (err)
1228 		goto out_unlink;
1229 	if (u->addr)
1230 		goto out_unlock;
1231 
1232 	old_hash = sk->sk_hash;
1233 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1234 	unix_table_double_lock(net, old_hash, new_hash);
1235 	u->path.mnt = mntget(parent.mnt);
1236 	u->path.dentry = dget(dentry);
1237 	__unix_set_addr_hash(net, sk, addr, new_hash);
1238 	unix_table_double_unlock(net, old_hash, new_hash);
1239 	unix_insert_bsd_socket(sk);
1240 	mutex_unlock(&u->bindlock);
1241 	done_path_create(&parent, dentry);
1242 	return 0;
1243 
1244 out_unlock:
1245 	mutex_unlock(&u->bindlock);
1246 	err = -EINVAL;
1247 out_unlink:
1248 	/* failed after successful mknod?  unlink what we'd created... */
1249 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1250 out_path:
1251 	done_path_create(&parent, dentry);
1252 out:
1253 	unix_release_addr(addr);
1254 	return err == -EEXIST ? -EADDRINUSE : err;
1255 }
1256 
1257 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1258 			      int addr_len)
1259 {
1260 	struct unix_sock *u = unix_sk(sk);
1261 	unsigned int new_hash, old_hash;
1262 	struct net *net = sock_net(sk);
1263 	struct unix_address *addr;
1264 	int err;
1265 
1266 	addr = unix_create_addr(sunaddr, addr_len);
1267 	if (!addr)
1268 		return -ENOMEM;
1269 
1270 	err = mutex_lock_interruptible(&u->bindlock);
1271 	if (err)
1272 		goto out;
1273 
1274 	if (u->addr) {
1275 		err = -EINVAL;
1276 		goto out_mutex;
1277 	}
1278 
1279 	old_hash = sk->sk_hash;
1280 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1281 	unix_table_double_lock(net, old_hash, new_hash);
1282 
1283 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1284 		goto out_spin;
1285 
1286 	__unix_set_addr_hash(net, sk, addr, new_hash);
1287 	unix_table_double_unlock(net, old_hash, new_hash);
1288 	mutex_unlock(&u->bindlock);
1289 	return 0;
1290 
1291 out_spin:
1292 	unix_table_double_unlock(net, old_hash, new_hash);
1293 	err = -EADDRINUSE;
1294 out_mutex:
1295 	mutex_unlock(&u->bindlock);
1296 out:
1297 	unix_release_addr(addr);
1298 	return err;
1299 }
1300 
1301 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1302 {
1303 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1304 	struct sock *sk = sock->sk;
1305 	int err;
1306 
1307 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1308 	    sunaddr->sun_family == AF_UNIX)
1309 		return unix_autobind(sk);
1310 
1311 	err = unix_validate_addr(sunaddr, addr_len);
1312 	if (err)
1313 		return err;
1314 
1315 	if (sunaddr->sun_path[0])
1316 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1317 	else
1318 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1319 
1320 	return err;
1321 }
1322 
1323 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1324 {
1325 	if (unlikely(sk1 == sk2) || !sk2) {
1326 		unix_state_lock(sk1);
1327 		return;
1328 	}
1329 	if (sk1 > sk2)
1330 		swap(sk1, sk2);
1331 
1332 	unix_state_lock(sk1);
1333 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1334 }
1335 
1336 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1337 {
1338 	if (unlikely(sk1 == sk2) || !sk2) {
1339 		unix_state_unlock(sk1);
1340 		return;
1341 	}
1342 	unix_state_unlock(sk1);
1343 	unix_state_unlock(sk2);
1344 }
1345 
1346 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1347 			      int alen, int flags)
1348 {
1349 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1350 	struct sock *sk = sock->sk;
1351 	struct sock *other;
1352 	int err;
1353 
1354 	err = -EINVAL;
1355 	if (alen < offsetofend(struct sockaddr, sa_family))
1356 		goto out;
1357 
1358 	if (addr->sa_family != AF_UNSPEC) {
1359 		err = unix_validate_addr(sunaddr, alen);
1360 		if (err)
1361 			goto out;
1362 
1363 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1364 		if (err)
1365 			goto out;
1366 
1367 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1368 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1369 		    !READ_ONCE(unix_sk(sk)->addr)) {
1370 			err = unix_autobind(sk);
1371 			if (err)
1372 				goto out;
1373 		}
1374 
1375 restart:
1376 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1377 		if (IS_ERR(other)) {
1378 			err = PTR_ERR(other);
1379 			goto out;
1380 		}
1381 
1382 		unix_state_double_lock(sk, other);
1383 
1384 		/* Apparently VFS overslept socket death. Retry. */
1385 		if (sock_flag(other, SOCK_DEAD)) {
1386 			unix_state_double_unlock(sk, other);
1387 			sock_put(other);
1388 			goto restart;
1389 		}
1390 
1391 		err = -EPERM;
1392 		if (!unix_may_send(sk, other))
1393 			goto out_unlock;
1394 
1395 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1396 		if (err)
1397 			goto out_unlock;
1398 
1399 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1400 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1401 	} else {
1402 		/*
1403 		 *	1003.1g breaking connected state with AF_UNSPEC
1404 		 */
1405 		other = NULL;
1406 		unix_state_double_lock(sk, other);
1407 	}
1408 
1409 	/*
1410 	 * If it was connected, reconnect.
1411 	 */
1412 	if (unix_peer(sk)) {
1413 		struct sock *old_peer = unix_peer(sk);
1414 
1415 		unix_peer(sk) = other;
1416 		if (!other)
1417 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1418 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1419 
1420 		unix_state_double_unlock(sk, other);
1421 
1422 		if (other != old_peer) {
1423 			unix_dgram_disconnected(sk, old_peer);
1424 
1425 			unix_state_lock(old_peer);
1426 			if (!unix_peer(old_peer))
1427 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1428 			unix_state_unlock(old_peer);
1429 		}
1430 
1431 		sock_put(old_peer);
1432 	} else {
1433 		unix_peer(sk) = other;
1434 		unix_state_double_unlock(sk, other);
1435 	}
1436 
1437 	return 0;
1438 
1439 out_unlock:
1440 	unix_state_double_unlock(sk, other);
1441 	sock_put(other);
1442 out:
1443 	return err;
1444 }
1445 
1446 static long unix_wait_for_peer(struct sock *other, long timeo)
1447 	__releases(&unix_sk(other)->lock)
1448 {
1449 	struct unix_sock *u = unix_sk(other);
1450 	int sched;
1451 	DEFINE_WAIT(wait);
1452 
1453 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1454 
1455 	sched = !sock_flag(other, SOCK_DEAD) &&
1456 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1457 		unix_recvq_full_lockless(other);
1458 
1459 	unix_state_unlock(other);
1460 
1461 	if (sched)
1462 		timeo = schedule_timeout(timeo);
1463 
1464 	finish_wait(&u->peer_wait, &wait);
1465 	return timeo;
1466 }
1467 
1468 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1469 			       int addr_len, int flags)
1470 {
1471 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1472 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1473 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1474 	struct net *net = sock_net(sk);
1475 	struct sk_buff *skb = NULL;
1476 	long timeo;
1477 	int err;
1478 
1479 	err = unix_validate_addr(sunaddr, addr_len);
1480 	if (err)
1481 		goto out;
1482 
1483 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1484 	if (err)
1485 		goto out;
1486 
1487 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1488 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1489 	    !READ_ONCE(u->addr)) {
1490 		err = unix_autobind(sk);
1491 		if (err)
1492 			goto out;
1493 	}
1494 
1495 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1496 
1497 	/* First of all allocate resources.
1498 	   If we do it after the state is locked,
1499 	   we will have to recheck everything again in any case.
1500 	 */
1501 
1502 	/* create new sock for complete connection */
1503 	newsk = unix_create1(net, NULL, 0, sock->type);
1504 	if (IS_ERR(newsk)) {
1505 		err = PTR_ERR(newsk);
1506 		newsk = NULL;
1507 		goto out;
1508 	}
1509 
1510 	err = -ENOMEM;
1511 
1512 	/* Allocate skb for sending to listening sock */
1513 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1514 	if (skb == NULL)
1515 		goto out;
1516 
1517 restart:
1518 	/*  Find listening sock. */
1519 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1520 	if (IS_ERR(other)) {
1521 		err = PTR_ERR(other);
1522 		other = NULL;
1523 		goto out;
1524 	}
1525 
1526 	/* Latch state of peer */
1527 	unix_state_lock(other);
1528 
1529 	/* Apparently VFS overslept socket death. Retry. */
1530 	if (sock_flag(other, SOCK_DEAD)) {
1531 		unix_state_unlock(other);
1532 		sock_put(other);
1533 		goto restart;
1534 	}
1535 
1536 	err = -ECONNREFUSED;
1537 	if (other->sk_state != TCP_LISTEN)
1538 		goto out_unlock;
1539 	if (other->sk_shutdown & RCV_SHUTDOWN)
1540 		goto out_unlock;
1541 
1542 	if (unix_recvq_full_lockless(other)) {
1543 		err = -EAGAIN;
1544 		if (!timeo)
1545 			goto out_unlock;
1546 
1547 		timeo = unix_wait_for_peer(other, timeo);
1548 
1549 		err = sock_intr_errno(timeo);
1550 		if (signal_pending(current))
1551 			goto out;
1552 		sock_put(other);
1553 		goto restart;
1554 	}
1555 
1556 	/* Latch our state.
1557 
1558 	   It is a tricky place. We need to grab our state lock and cannot
1559 	   drop the lock on the peer. It is dangerous because deadlock is
1560 	   possible. The connect-to-self case and simultaneous
1561 	   attempts to connect are eliminated by checking the socket
1562 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN, we
1563 	   check this before attempting to grab the lock.
1564 
1565 	   Well, and we have to recheck the state after socket locked.
1566 	 */
1567 	switch (READ_ONCE(sk->sk_state)) {
1568 	case TCP_CLOSE:
1569 		/* This is ok... continue with connect */
1570 		break;
1571 	case TCP_ESTABLISHED:
1572 		/* Socket is already connected */
1573 		err = -EISCONN;
1574 		goto out_unlock;
1575 	default:
1576 		err = -EINVAL;
1577 		goto out_unlock;
1578 	}
1579 
1580 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1581 
1582 	if (sk->sk_state != TCP_CLOSE) {
1583 		unix_state_unlock(sk);
1584 		unix_state_unlock(other);
1585 		sock_put(other);
1586 		goto restart;
1587 	}
1588 
1589 	err = security_unix_stream_connect(sk, other, newsk);
1590 	if (err) {
1591 		unix_state_unlock(sk);
1592 		goto out_unlock;
1593 	}
1594 
1595 	/* The way is open! Quickly set all the necessary fields... */
1596 
1597 	sock_hold(sk);
1598 	unix_peer(newsk)	= sk;
1599 	newsk->sk_state		= TCP_ESTABLISHED;
1600 	newsk->sk_type		= sk->sk_type;
1601 	init_peercred(newsk);
1602 	newu = unix_sk(newsk);
1603 	newu->listener = other;
1604 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1605 	otheru = unix_sk(other);
1606 
1607 	/* copy address information from listening to new sock
1608 	 *
1609 	 * The contents of *(otheru->addr) and otheru->path
1610 	 * are seen fully set up here, since we have found
1611 	 * otheru in hash under its lock.  Insertion into the
1612 	 * hash chain we'd found it in had been done in an
1613 	 * earlier critical area protected by the chain's lock,
1614 	 * the same one where we'd set *(otheru->addr) contents,
1615 	 * as well as otheru->path and otheru->addr itself.
1616 	 *
1617 	 * Using smp_store_release() here to set newu->addr
1618 	 * is enough to make those stores, as well as stores
1619 	 * to newu->path visible to anyone who gets newu->addr
1620 	 * by smp_load_acquire().  IOW, the same guarantees
1621 	 * as for unix_sock instances bound in unix_bind() or
1622 	 * in unix_autobind().
1623 	 */
1624 	if (otheru->path.dentry) {
1625 		path_get(&otheru->path);
1626 		newu->path = otheru->path;
1627 	}
1628 	refcount_inc(&otheru->addr->refcnt);
1629 	smp_store_release(&newu->addr, otheru->addr);
1630 
1631 	/* Set credentials */
1632 	copy_peercred(sk, other);
1633 
1634 	sock->state	= SS_CONNECTED;
1635 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1636 	sock_hold(newsk);
1637 
1638 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1639 	unix_peer(sk)	= newsk;
1640 
1641 	unix_state_unlock(sk);
1642 
1643 	/* take ten and send info to listening sock */
1644 	spin_lock(&other->sk_receive_queue.lock);
1645 	__skb_queue_tail(&other->sk_receive_queue, skb);
1646 	spin_unlock(&other->sk_receive_queue.lock);
1647 	unix_state_unlock(other);
1648 	other->sk_data_ready(other);
1649 	sock_put(other);
1650 	return 0;
1651 
1652 out_unlock:
1653 	if (other)
1654 		unix_state_unlock(other);
1655 
1656 out:
1657 	kfree_skb(skb);
1658 	if (newsk)
1659 		unix_release_sock(newsk, 0);
1660 	if (other)
1661 		sock_put(other);
1662 	return err;
1663 }
1664 
1665 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1666 {
1667 	struct sock *ska = socka->sk, *skb = sockb->sk;
1668 
1669 	/* Join our sockets back to back */
1670 	sock_hold(ska);
1671 	sock_hold(skb);
1672 	unix_peer(ska) = skb;
1673 	unix_peer(skb) = ska;
1674 	init_peercred(ska);
1675 	init_peercred(skb);
1676 
1677 	ska->sk_state = TCP_ESTABLISHED;
1678 	skb->sk_state = TCP_ESTABLISHED;
1679 	socka->state  = SS_CONNECTED;
1680 	sockb->state  = SS_CONNECTED;
1681 	return 0;
1682 }
1683 
1684 static void unix_sock_inherit_flags(const struct socket *old,
1685 				    struct socket *new)
1686 {
1687 	if (test_bit(SOCK_PASSCRED, &old->flags))
1688 		set_bit(SOCK_PASSCRED, &new->flags);
1689 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1690 		set_bit(SOCK_PASSPIDFD, &new->flags);
1691 	if (test_bit(SOCK_PASSSEC, &old->flags))
1692 		set_bit(SOCK_PASSSEC, &new->flags);
1693 }
1694 
1695 static int unix_accept(struct socket *sock, struct socket *newsock,
1696 		       struct proto_accept_arg *arg)
1697 {
1698 	struct sock *sk = sock->sk;
1699 	struct sk_buff *skb;
1700 	struct sock *tsk;
1701 
1702 	arg->err = -EOPNOTSUPP;
1703 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1704 		goto out;
1705 
1706 	arg->err = -EINVAL;
1707 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1708 		goto out;
1709 
1710 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1711 	 * so that no locks are necessary.
1712 	 */
1713 
1714 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1715 				&arg->err);
1716 	if (!skb) {
1717 		/* This means receive shutdown. */
1718 		if (arg->err == 0)
1719 			arg->err = -EINVAL;
1720 		goto out;
1721 	}
1722 
1723 	tsk = skb->sk;
1724 	skb_free_datagram(sk, skb);
1725 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1726 
1727 	/* attach accepted sock to socket */
1728 	unix_state_lock(tsk);
1729 	unix_update_edges(unix_sk(tsk));
1730 	newsock->state = SS_CONNECTED;
1731 	unix_sock_inherit_flags(sock, newsock);
1732 	sock_graft(tsk, newsock);
1733 	unix_state_unlock(tsk);
1734 	return 0;
1735 
1736 out:
1737 	return arg->err;
1738 }
1739 
1740 
1741 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1742 {
1743 	struct sock *sk = sock->sk;
1744 	struct unix_address *addr;
1745 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1746 	int err = 0;
1747 
1748 	if (peer) {
1749 		sk = unix_peer_get(sk);
1750 
1751 		err = -ENOTCONN;
1752 		if (!sk)
1753 			goto out;
1754 		err = 0;
1755 	} else {
1756 		sock_hold(sk);
1757 	}
1758 
1759 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1760 	if (!addr) {
1761 		sunaddr->sun_family = AF_UNIX;
1762 		sunaddr->sun_path[0] = 0;
1763 		err = offsetof(struct sockaddr_un, sun_path);
1764 	} else {
1765 		err = addr->len;
1766 		memcpy(sunaddr, addr->name, addr->len);
1767 
1768 		if (peer)
1769 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1770 					       CGROUP_UNIX_GETPEERNAME);
1771 		else
1772 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1773 					       CGROUP_UNIX_GETSOCKNAME);
1774 	}
1775 	sock_put(sk);
1776 out:
1777 	return err;
1778 }
1779 
1780 /* The "user->unix_inflight" variable is protected by the garbage
1781  * collection lock, and we just read it locklessly here. If you go
1782  * over the limit, there might be a tiny race in actually noticing
1783  * it across threads. Tough.
1784  */
1785 static inline bool too_many_unix_fds(struct task_struct *p)
1786 {
1787 	struct user_struct *user = current_user();
1788 
1789 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1790 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1791 	return false;
1792 }
1793 
1794 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1795 {
1796 	if (too_many_unix_fds(current))
1797 		return -ETOOMANYREFS;
1798 
1799 	UNIXCB(skb).fp = scm->fp;
1800 	scm->fp = NULL;
1801 
1802 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1803 		return -ENOMEM;
1804 
1805 	return 0;
1806 }
1807 
1808 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1809 {
1810 	scm->fp = UNIXCB(skb).fp;
1811 	UNIXCB(skb).fp = NULL;
1812 
1813 	unix_destroy_fpl(scm->fp);
1814 }
1815 
1816 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1817 {
1818 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1819 }
1820 
1821 static void unix_destruct_scm(struct sk_buff *skb)
1822 {
1823 	struct scm_cookie scm;
1824 
1825 	memset(&scm, 0, sizeof(scm));
1826 	scm.pid  = UNIXCB(skb).pid;
1827 	if (UNIXCB(skb).fp)
1828 		unix_detach_fds(&scm, skb);
1829 
1830 	/* Alas, it calls VFS */
1831 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1832 	scm_destroy(&scm);
1833 	sock_wfree(skb);
1834 }
1835 
1836 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1837 {
1838 	int err = 0;
1839 
1840 	UNIXCB(skb).pid  = get_pid(scm->pid);
1841 	UNIXCB(skb).uid = scm->creds.uid;
1842 	UNIXCB(skb).gid = scm->creds.gid;
1843 	UNIXCB(skb).fp = NULL;
1844 	unix_get_secdata(scm, skb);
1845 	if (scm->fp && send_fds)
1846 		err = unix_attach_fds(scm, skb);
1847 
1848 	skb->destructor = unix_destruct_scm;
1849 	return err;
1850 }
1851 
1852 static bool unix_passcred_enabled(const struct socket *sock,
1853 				  const struct sock *other)
1854 {
1855 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1856 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1857 	       !other->sk_socket ||
1858 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1859 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1860 }
1861 
1862 /*
1863  * Some apps rely on write() giving SCM_CREDENTIALS.
1864  * We include credentials if the source or destination socket
1865  * asserted SOCK_PASSCRED.
1866  */
1867 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1868 			    const struct sock *other)
1869 {
1870 	if (UNIXCB(skb).pid)
1871 		return;
1872 	if (unix_passcred_enabled(sock, other)) {
1873 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1874 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1875 	}
1876 }
1877 
1878 static bool unix_skb_scm_eq(struct sk_buff *skb,
1879 			    struct scm_cookie *scm)
1880 {
1881 	return UNIXCB(skb).pid == scm->pid &&
1882 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1883 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1884 	       unix_secdata_eq(scm, skb);
1885 }
1886 
1887 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1888 {
1889 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1890 	struct unix_sock *u = unix_sk(sk);
1891 
1892 	if (unlikely(fp && fp->count)) {
1893 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1894 		unix_add_edges(fp, u);
1895 	}
1896 }
1897 
1898 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1899 {
1900 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1901 	struct unix_sock *u = unix_sk(sk);
1902 
1903 	if (unlikely(fp && fp->count)) {
1904 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1905 		unix_del_edges(fp);
1906 	}
1907 }
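
/* The nr_fds counter above tracks how many passed descriptors currently
 * sit in this socket's receive queue, while unix_add_edges() and
 * unix_del_edges() keep the in-flight graph used by the SCM_RIGHTS
 * garbage collector consistent with the queued skbs.
 */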
1908 
1909 /*
1910  *	Send AF_UNIX data.
1911  */
1912 
1913 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1914 			      size_t len)
1915 {
1916 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1917 	struct sock *sk = sock->sk, *other = NULL;
1918 	struct unix_sock *u = unix_sk(sk);
1919 	struct scm_cookie scm;
1920 	struct sk_buff *skb;
1921 	int data_len = 0;
1922 	int sk_locked;
1923 	long timeo;
1924 	int err;
1925 
1926 	err = scm_send(sock, msg, &scm, false);
1927 	if (err < 0)
1928 		return err;
1929 
1930 	wait_for_unix_gc(scm.fp);
1931 
1932 	err = -EOPNOTSUPP;
1933 	if (msg->msg_flags&MSG_OOB)
1934 		goto out;
1935 
1936 	if (msg->msg_namelen) {
1937 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1938 		if (err)
1939 			goto out;
1940 
1941 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1942 							    msg->msg_name,
1943 							    &msg->msg_namelen,
1944 							    NULL);
1945 		if (err)
1946 			goto out;
1947 	} else {
1948 		sunaddr = NULL;
1949 		err = -ENOTCONN;
1950 		other = unix_peer_get(sk);
1951 		if (!other)
1952 			goto out;
1953 	}
1954 
1955 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1956 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1957 	    !READ_ONCE(u->addr)) {
1958 		err = unix_autobind(sk);
1959 		if (err)
1960 			goto out;
1961 	}
1962 
1963 	err = -EMSGSIZE;
1964 	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
1965 		goto out;
1966 
1967 	if (len > SKB_MAX_ALLOC) {
1968 		data_len = min_t(size_t,
1969 				 len - SKB_MAX_ALLOC,
1970 				 MAX_SKB_FRAGS * PAGE_SIZE);
1971 		data_len = PAGE_ALIGN(data_len);
1972 
1973 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1974 	}
1975 
1976 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1977 				   msg->msg_flags & MSG_DONTWAIT, &err,
1978 				   PAGE_ALLOC_COSTLY_ORDER);
1979 	if (skb == NULL)
1980 		goto out;
1981 
1982 	err = unix_scm_to_skb(&scm, skb, true);
1983 	if (err < 0)
1984 		goto out_free;
1985 
1986 	skb_put(skb, len - data_len);
1987 	skb->data_len = data_len;
1988 	skb->len = len;
1989 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1990 	if (err)
1991 		goto out_free;
1992 
1993 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1994 
1995 restart:
1996 	if (!other) {
1997 		err = -ECONNRESET;
1998 		if (sunaddr == NULL)
1999 			goto out_free;
2000 
2001 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2002 					sk->sk_type);
2003 		if (IS_ERR(other)) {
2004 			err = PTR_ERR(other);
2005 			other = NULL;
2006 			goto out_free;
2007 		}
2008 	}
2009 
2010 	if (sk_filter(other, skb) < 0) {
2011 		/* Toss the packet but do not return any error to the sender */
2012 		err = len;
2013 		goto out_free;
2014 	}
2015 
2016 	sk_locked = 0;
2017 	unix_state_lock(other);
2018 restart_locked:
2019 	err = -EPERM;
2020 	if (!unix_may_send(sk, other))
2021 		goto out_unlock;
2022 
2023 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2024 		/*
2025 		 *	Check with 1003.1g - what error should a
2026 		 *	datagram send return here?
2027 		 */
2028 		unix_state_unlock(other);
2029 		sock_put(other);
2030 
2031 		if (!sk_locked)
2032 			unix_state_lock(sk);
2033 
2034 		err = 0;
2035 		if (sk->sk_type == SOCK_SEQPACKET) {
2036 			/* We get here only when racing with unix_release_sock(),
2037 			 * which is clearing @other. Never change the state to
2038 			 * TCP_CLOSE here, unlike what SOCK_DGRAM wants.
2039 			 */
2040 			unix_state_unlock(sk);
2041 			err = -EPIPE;
2042 		} else if (unix_peer(sk) == other) {
2043 			unix_peer(sk) = NULL;
2044 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2045 
2046 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2047 			unix_state_unlock(sk);
2048 
2049 			unix_dgram_disconnected(sk, other);
2050 			sock_put(other);
2051 			err = -ECONNREFUSED;
2052 		} else {
2053 			unix_state_unlock(sk);
2054 		}
2055 
2056 		other = NULL;
2057 		if (err)
2058 			goto out_free;
2059 		goto restart;
2060 	}
2061 
2062 	err = -EPIPE;
2063 	if (other->sk_shutdown & RCV_SHUTDOWN)
2064 		goto out_unlock;
2065 
2066 	if (sk->sk_type != SOCK_SEQPACKET) {
2067 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2068 		if (err)
2069 			goto out_unlock;
2070 	}
2071 
2072 	/* other == sk && unix_peer(other) != sk may happen if
2073 	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2074 	 * - unix_peer(sk) == sk at the time of the get but disconnected before the lock
2075 	 */
2076 	if (other != sk &&
2077 	    unlikely(unix_peer(other) != sk &&
2078 	    unix_recvq_full_lockless(other))) {
2079 		if (timeo) {
2080 			timeo = unix_wait_for_peer(other, timeo);
2081 
2082 			err = sock_intr_errno(timeo);
2083 			if (signal_pending(current))
2084 				goto out_free;
2085 
2086 			goto restart;
2087 		}
2088 
2089 		if (!sk_locked) {
2090 			unix_state_unlock(other);
2091 			unix_state_double_lock(sk, other);
2092 		}
2093 
2094 		if (unix_peer(sk) != other ||
2095 		    unix_dgram_peer_wake_me(sk, other)) {
2096 			err = -EAGAIN;
2097 			sk_locked = 1;
2098 			goto out_unlock;
2099 		}
2100 
2101 		if (!sk_locked) {
2102 			sk_locked = 1;
2103 			goto restart_locked;
2104 		}
2105 	}
2106 
2107 	if (unlikely(sk_locked))
2108 		unix_state_unlock(sk);
2109 
2110 	if (sock_flag(other, SOCK_RCVTSTAMP))
2111 		__net_timestamp(skb);
2112 	maybe_add_creds(skb, sock, other);
2113 	scm_stat_add(other, skb);
2114 	skb_queue_tail(&other->sk_receive_queue, skb);
2115 	unix_state_unlock(other);
2116 	other->sk_data_ready(other);
2117 	sock_put(other);
2118 	scm_destroy(&scm);
2119 	return len;
2120 
2121 out_unlock:
2122 	if (sk_locked)
2123 		unix_state_unlock(sk);
2124 	unix_state_unlock(other);
2125 out_free:
2126 	kfree_skb(skb);
2127 out:
2128 	if (other)
2129 		sock_put(other);
2130 	scm_destroy(&scm);
2131 	return err;
2132 }
2133 
2134 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2135  * bytes, with a minimum of a full page.
2136  */
2137 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
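
/* For example, with 4 KiB pages get_order(32768) is 3, so UNIX_SKB_FRAGS_SZ
 * is 8 pages (32768 bytes); with page sizes above 32 KiB it degenerates to a
 * single page.
 */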
2138 
2139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2140 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2141 		     struct scm_cookie *scm, bool fds_sent)
2142 {
2143 	struct unix_sock *ousk = unix_sk(other);
2144 	struct sk_buff *skb;
2145 	int err = 0;
2146 
2147 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2148 
2149 	if (!skb)
2150 		return err;
2151 
2152 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2153 	if (err < 0) {
2154 		kfree_skb(skb);
2155 		return err;
2156 	}
2157 	skb_put(skb, 1);
2158 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2159 
2160 	if (err) {
2161 		kfree_skb(skb);
2162 		return err;
2163 	}
2164 
2165 	unix_state_lock(other);
2166 
2167 	if (sock_flag(other, SOCK_DEAD) ||
2168 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2169 		unix_state_unlock(other);
2170 		kfree_skb(skb);
2171 		return -EPIPE;
2172 	}
2173 
2174 	maybe_add_creds(skb, sock, other);
2175 	skb_get(skb);
2176 
2177 	scm_stat_add(other, skb);
2178 
2179 	spin_lock(&other->sk_receive_queue.lock);
2180 	if (ousk->oob_skb)
2181 		consume_skb(ousk->oob_skb);
2182 	WRITE_ONCE(ousk->oob_skb, skb);
2183 	__skb_queue_tail(&other->sk_receive_queue, skb);
2184 	spin_unlock(&other->sk_receive_queue.lock);
2185 
2186 	sk_send_sigurg(other);
2187 	unix_state_unlock(other);
2188 	other->sk_data_ready(other);
2189 
2190 	return err;
2191 }
2192 #endif
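
/* Userspace sketch (assumes a connected SOCK_STREAM pair and
 * CONFIG_AF_UNIX_OOB): as with TCP urgent data, only the last byte of an
 * MSG_OOB send becomes the out-of-band byte, queued via queue_oob() above:
 *
 *	send(fd, "ab", 2, MSG_OOB);	// 'a' is ordinary data, 'b' is OOB
 *	recv(peer, &c, 1, MSG_OOB);	// returns 'b'; with SO_OOBINLINE the
 *					// byte is read in line instead
 */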
2193 
2194 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2195 			       size_t len)
2196 {
2197 	struct sock *sk = sock->sk;
2198 	struct sock *other = NULL;
2199 	int err, size;
2200 	struct sk_buff *skb;
2201 	int sent = 0;
2202 	struct scm_cookie scm;
2203 	bool fds_sent = false;
2204 	int data_len;
2205 
2206 	err = scm_send(sock, msg, &scm, false);
2207 	if (err < 0)
2208 		return err;
2209 
2210 	wait_for_unix_gc(scm.fp);
2211 
2212 	err = -EOPNOTSUPP;
2213 	if (msg->msg_flags & MSG_OOB) {
2214 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2215 		if (len)
2216 			len--;
2217 		else
2218 #endif
2219 			goto out_err;
2220 	}
2221 
2222 	if (msg->msg_namelen) {
2223 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2224 		goto out_err;
2225 	} else {
2226 		err = -ENOTCONN;
2227 		other = unix_peer(sk);
2228 		if (!other)
2229 			goto out_err;
2230 	}
2231 
2232 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2233 		goto pipe_err;
2234 
2235 	while (sent < len) {
2236 		size = len - sent;
2237 
2238 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2239 			skb = sock_alloc_send_pskb(sk, 0, 0,
2240 						   msg->msg_flags & MSG_DONTWAIT,
2241 						   &err, 0);
2242 		} else {
2243 			/* Keep two messages in the pipe so it schedules better */
2244 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2245 
2246 			/* allow fallback to order-0 allocations */
2247 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2248 
2249 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2250 
2251 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2252 
2253 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2254 						   msg->msg_flags & MSG_DONTWAIT, &err,
2255 						   get_order(UNIX_SKB_FRAGS_SZ));
2256 		}
2257 		if (!skb)
2258 			goto out_err;
2259 
2260 		/* Only send the fds in the first buffer */
2261 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2262 		if (err < 0) {
2263 			kfree_skb(skb);
2264 			goto out_err;
2265 		}
2266 		fds_sent = true;
2267 
2268 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2269 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2270 						   sk->sk_allocation);
2271 			if (err < 0) {
2272 				kfree_skb(skb);
2273 				goto out_err;
2274 			}
2275 			size = err;
2276 			refcount_add(size, &sk->sk_wmem_alloc);
2277 		} else {
2278 			skb_put(skb, size - data_len);
2279 			skb->data_len = data_len;
2280 			skb->len = size;
2281 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2282 			if (err) {
2283 				kfree_skb(skb);
2284 				goto out_err;
2285 			}
2286 		}
2287 
2288 		unix_state_lock(other);
2289 
2290 		if (sock_flag(other, SOCK_DEAD) ||
2291 		    (other->sk_shutdown & RCV_SHUTDOWN))
2292 			goto pipe_err_free;
2293 
2294 		maybe_add_creds(skb, sock, other);
2295 		scm_stat_add(other, skb);
2296 		skb_queue_tail(&other->sk_receive_queue, skb);
2297 		unix_state_unlock(other);
2298 		other->sk_data_ready(other);
2299 		sent += size;
2300 	}
2301 
2302 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2303 	if (msg->msg_flags & MSG_OOB) {
2304 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2305 		if (err)
2306 			goto out_err;
2307 		sent++;
2308 	}
2309 #endif
2310 
2311 	scm_destroy(&scm);
2312 
2313 	return sent;
2314 
2315 pipe_err_free:
2316 	unix_state_unlock(other);
2317 	kfree_skb(skb);
2318 pipe_err:
2319 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2320 		send_sig(SIGPIPE, current, 0);
2321 	err = -EPIPE;
2322 out_err:
2323 	scm_destroy(&scm);
2324 	return sent ? : err;
2325 }
2326 
2327 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2328 				  size_t len)
2329 {
2330 	int err;
2331 	struct sock *sk = sock->sk;
2332 
2333 	err = sock_error(sk);
2334 	if (err)
2335 		return err;
2336 
2337 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2338 		return -ENOTCONN;
2339 
2340 	if (msg->msg_namelen)
2341 		msg->msg_namelen = 0;
2342 
2343 	return unix_dgram_sendmsg(sock, msg, len);
2344 }
2345 
2346 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2347 				  size_t size, int flags)
2348 {
2349 	struct sock *sk = sock->sk;
2350 
2351 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2352 		return -ENOTCONN;
2353 
2354 	return unix_dgram_recvmsg(sock, msg, size, flags);
2355 }
2356 
2357 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2358 {
2359 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2360 
2361 	if (addr) {
2362 		msg->msg_namelen = addr->len;
2363 		memcpy(msg->msg_name, addr->name, addr->len);
2364 	}
2365 }
2366 
2367 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2368 			 int flags)
2369 {
2370 	struct scm_cookie scm;
2371 	struct socket *sock = sk->sk_socket;
2372 	struct unix_sock *u = unix_sk(sk);
2373 	struct sk_buff *skb, *last;
2374 	long timeo;
2375 	int skip;
2376 	int err;
2377 
2378 	err = -EOPNOTSUPP;
2379 	if (flags&MSG_OOB)
2380 		goto out;
2381 
2382 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2383 
2384 	do {
2385 		mutex_lock(&u->iolock);
2386 
2387 		skip = sk_peek_offset(sk, flags);
2388 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2389 					      &skip, &err, &last);
2390 		if (skb) {
2391 			if (!(flags & MSG_PEEK))
2392 				scm_stat_del(sk, skb);
2393 			break;
2394 		}
2395 
2396 		mutex_unlock(&u->iolock);
2397 
2398 		if (err != -EAGAIN)
2399 			break;
2400 	} while (timeo &&
2401 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2402 					      &err, &timeo, last));
2403 
2404 	if (!skb) { /* implies iolock unlocked */
2405 		unix_state_lock(sk);
2406 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2407 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2408 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2409 			err = 0;
2410 		unix_state_unlock(sk);
2411 		goto out;
2412 	}
2413 
2414 	if (wq_has_sleeper(&u->peer_wait))
2415 		wake_up_interruptible_sync_poll(&u->peer_wait,
2416 						EPOLLOUT | EPOLLWRNORM |
2417 						EPOLLWRBAND);
2418 
2419 	if (msg->msg_name) {
2420 		unix_copy_addr(msg, skb->sk);
2421 
2422 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2423 						      msg->msg_name,
2424 						      &msg->msg_namelen);
2425 	}
2426 
2427 	if (size > skb->len - skip)
2428 		size = skb->len - skip;
2429 	else if (size < skb->len - skip)
2430 		msg->msg_flags |= MSG_TRUNC;
2431 
2432 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2433 	if (err)
2434 		goto out_free;
2435 
2436 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2437 		__sock_recv_timestamp(msg, sk, skb);
2438 
2439 	memset(&scm, 0, sizeof(scm));
2440 
2441 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2442 	unix_set_secdata(&scm, skb);
2443 
2444 	if (!(flags & MSG_PEEK)) {
2445 		if (UNIXCB(skb).fp)
2446 			unix_detach_fds(&scm, skb);
2447 
2448 		sk_peek_offset_bwd(sk, skb->len);
2449 	} else {
2450 		/* It is questionable: on PEEK we could:
2451 		   - not return fds - good, but too simple 8)
2452 		   - return fds, and not return them on read (the old strategy,
2453 		     apparently wrong)
2454 		   - clone fds (chosen for now, as it is the most universal
2455 		     solution)
2456 
2457 		   POSIX 1003.1g does not actually define this clearly
2458 		   at all. POSIX 1003.1g doesn't define a lot of things
2459 		   clearly however!
2460 
2461 		*/
2462 
2463 		sk_peek_offset_fwd(sk, size);
2464 
2465 		if (UNIXCB(skb).fp)
2466 			unix_peek_fds(&scm, skb);
2467 	}
2468 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2469 
2470 	scm_recv_unix(sock, msg, &scm, flags);
2471 
2472 out_free:
2473 	skb_free_datagram(sk, skb);
2474 	mutex_unlock(&u->iolock);
2475 out:
2476 	return err;
2477 }
2478 
2479 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2480 			      int flags)
2481 {
2482 	struct sock *sk = sock->sk;
2483 
2484 #ifdef CONFIG_BPF_SYSCALL
2485 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2486 
2487 	if (prot != &unix_dgram_proto)
2488 		return prot->recvmsg(sk, msg, size, flags, NULL);
2489 #endif
2490 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2491 }
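
/* If the socket has been inserted into a BPF sockmap/sockhash, sk_prot is
 * replaced by the psock proto, so recvmsg() is diverted there instead of
 * __unix_dgram_recvmsg().
 */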
2492 
2493 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2494 {
2495 	struct unix_sock *u = unix_sk(sk);
2496 	struct sk_buff *skb;
2497 	int err;
2498 
2499 	mutex_lock(&u->iolock);
2500 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2501 	mutex_unlock(&u->iolock);
2502 	if (!skb)
2503 		return err;
2504 
2505 	return recv_actor(sk, skb);
2506 }
2507 
2508 /*
2509  *	Sleep until more data has arrived. But check for races.
2510  */
2511 static long unix_stream_data_wait(struct sock *sk, long timeo,
2512 				  struct sk_buff *last, unsigned int last_len,
2513 				  bool freezable)
2514 {
2515 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2516 	struct sk_buff *tail;
2517 	DEFINE_WAIT(wait);
2518 
2519 	unix_state_lock(sk);
2520 
2521 	for (;;) {
2522 		prepare_to_wait(sk_sleep(sk), &wait, state);
2523 
2524 		tail = skb_peek_tail(&sk->sk_receive_queue);
2525 		if (tail != last ||
2526 		    (tail && tail->len != last_len) ||
2527 		    sk->sk_err ||
2528 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2529 		    signal_pending(current) ||
2530 		    !timeo)
2531 			break;
2532 
2533 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2534 		unix_state_unlock(sk);
2535 		timeo = schedule_timeout(timeo);
2536 		unix_state_lock(sk);
2537 
2538 		if (sock_flag(sk, SOCK_DEAD))
2539 			break;
2540 
2541 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2542 	}
2543 
2544 	finish_wait(sk_sleep(sk), &wait);
2545 	unix_state_unlock(sk);
2546 	return timeo;
2547 }
2548 
2549 static unsigned int unix_skb_len(const struct sk_buff *skb)
2550 {
2551 	return skb->len - UNIXCB(skb).consumed;
2552 }
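
/* UNIXCB(skb).consumed counts the bytes of this skb a stream reader has
 * already copied out, so unix_skb_len() is the payload still unread.
 */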
2553 
2554 struct unix_stream_read_state {
2555 	int (*recv_actor)(struct sk_buff *, int, int,
2556 			  struct unix_stream_read_state *);
2557 	struct socket *socket;
2558 	struct msghdr *msg;
2559 	struct pipe_inode_info *pipe;
2560 	size_t size;
2561 	int flags;
2562 	unsigned int splice_flags;
2563 };
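
/* unix_stream_read_generic() drives both recvmsg() and splice_read(); the
 * difference is only the recv_actor callback, which either copies to the
 * user iovec (unix_stream_read_actor) or feeds a pipe
 * (unix_stream_splice_actor).
 */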
2564 
2565 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2566 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2567 {
2568 	struct socket *sock = state->socket;
2569 	struct sock *sk = sock->sk;
2570 	struct unix_sock *u = unix_sk(sk);
2571 	int chunk = 1;
2572 	struct sk_buff *oob_skb;
2573 
2574 	mutex_lock(&u->iolock);
2575 	unix_state_lock(sk);
2576 	spin_lock(&sk->sk_receive_queue.lock);
2577 
2578 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2579 		spin_unlock(&sk->sk_receive_queue.lock);
2580 		unix_state_unlock(sk);
2581 		mutex_unlock(&u->iolock);
2582 		return -EINVAL;
2583 	}
2584 
2585 	oob_skb = u->oob_skb;
2586 
2587 	if (!(state->flags & MSG_PEEK))
2588 		WRITE_ONCE(u->oob_skb, NULL);
2589 	else
2590 		skb_get(oob_skb);
2591 
2592 	spin_unlock(&sk->sk_receive_queue.lock);
2593 	unix_state_unlock(sk);
2594 
2595 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2596 
2597 	if (!(state->flags & MSG_PEEK))
2598 		UNIXCB(oob_skb).consumed += 1;
2599 
2600 	consume_skb(oob_skb);
2601 
2602 	mutex_unlock(&u->iolock);
2603 
2604 	if (chunk < 0)
2605 		return -EFAULT;
2606 
2607 	state->msg->msg_flags |= MSG_OOB;
2608 	return 1;
2609 }
2610 
2611 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2612 				  int flags, int copied)
2613 {
2614 	struct unix_sock *u = unix_sk(sk);
2615 
2616 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2617 		skb_unlink(skb, &sk->sk_receive_queue);
2618 		consume_skb(skb);
2619 		skb = NULL;
2620 	} else {
2621 		struct sk_buff *unlinked_skb = NULL;
2622 
2623 		spin_lock(&sk->sk_receive_queue.lock);
2624 
2625 		if (skb == u->oob_skb) {
2626 			if (copied) {
2627 				skb = NULL;
2628 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2629 				if (!(flags & MSG_PEEK)) {
2630 					WRITE_ONCE(u->oob_skb, NULL);
2631 					consume_skb(skb);
2632 				}
2633 			} else if (flags & MSG_PEEK) {
2634 				skb = NULL;
2635 			} else {
2636 				__skb_unlink(skb, &sk->sk_receive_queue);
2637 				WRITE_ONCE(u->oob_skb, NULL);
2638 				unlinked_skb = skb;
2639 				skb = skb_peek(&sk->sk_receive_queue);
2640 			}
2641 		}
2642 
2643 		spin_unlock(&sk->sk_receive_queue.lock);
2644 
2645 		if (unlinked_skb) {
2646 			WARN_ON_ONCE(skb_unref(unlinked_skb));
2647 			kfree_skb(unlinked_skb);
2648 		}
2649 	}
2650 	return skb;
2651 }
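
/* manage_oob() implements the "mark" semantics for the pending OOB byte:
 * a read that has already copied data stops just before it, while a read
 * starting at it skips the byte (or returns it in line with SO_OOBINLINE),
 * roughly mirroring TCP urgent data.
 */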
2652 #endif
2653 
2654 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2655 {
2656 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2657 		return -ENOTCONN;
2658 
2659 	return unix_read_skb(sk, recv_actor);
2660 }
2661 
2662 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2663 				    bool freezable)
2664 {
2665 	struct scm_cookie scm;
2666 	struct socket *sock = state->socket;
2667 	struct sock *sk = sock->sk;
2668 	struct unix_sock *u = unix_sk(sk);
2669 	int copied = 0;
2670 	int flags = state->flags;
2671 	int noblock = flags & MSG_DONTWAIT;
2672 	bool check_creds = false;
2673 	int target;
2674 	int err = 0;
2675 	long timeo;
2676 	int skip;
2677 	size_t size = state->size;
2678 	unsigned int last_len;
2679 
2680 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2681 		err = -EINVAL;
2682 		goto out;
2683 	}
2684 
2685 	if (unlikely(flags & MSG_OOB)) {
2686 		err = -EOPNOTSUPP;
2687 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2688 		err = unix_stream_recv_urg(state);
2689 #endif
2690 		goto out;
2691 	}
2692 
2693 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2694 	timeo = sock_rcvtimeo(sk, noblock);
2695 
2696 	memset(&scm, 0, sizeof(scm));
2697 
2698 	/* Lock the socket to prevent queue disordering
2699 	 * while we sleep copying data to the message.
2700 	 */
2701 	mutex_lock(&u->iolock);
2702 
2703 	skip = max(sk_peek_offset(sk, flags), 0);
2704 
2705 	do {
2706 		int chunk;
2707 		bool drop_skb;
2708 		struct sk_buff *skb, *last;
2709 
2710 redo:
2711 		unix_state_lock(sk);
2712 		if (sock_flag(sk, SOCK_DEAD)) {
2713 			err = -ECONNRESET;
2714 			goto unlock;
2715 		}
2716 		last = skb = skb_peek(&sk->sk_receive_queue);
2717 		last_len = last ? last->len : 0;
2718 
2719 again:
2720 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2721 		if (skb) {
2722 			skb = manage_oob(skb, sk, flags, copied);
2723 			if (!skb && copied) {
2724 				unix_state_unlock(sk);
2725 				break;
2726 			}
2727 		}
2728 #endif
2729 		if (skb == NULL) {
2730 			if (copied >= target)
2731 				goto unlock;
2732 
2733 			/*
2734 			 *	POSIX 1003.1g mandates this order.
2735 			 */
2736 
2737 			err = sock_error(sk);
2738 			if (err)
2739 				goto unlock;
2740 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2741 				goto unlock;
2742 
2743 			unix_state_unlock(sk);
2744 			if (!timeo) {
2745 				err = -EAGAIN;
2746 				break;
2747 			}
2748 
2749 			mutex_unlock(&u->iolock);
2750 
2751 			timeo = unix_stream_data_wait(sk, timeo, last,
2752 						      last_len, freezable);
2753 
2754 			if (signal_pending(current)) {
2755 				err = sock_intr_errno(timeo);
2756 				scm_destroy(&scm);
2757 				goto out;
2758 			}
2759 
2760 			mutex_lock(&u->iolock);
2761 			goto redo;
2762 unlock:
2763 			unix_state_unlock(sk);
2764 			break;
2765 		}
2766 
2767 		while (skip >= unix_skb_len(skb)) {
2768 			skip -= unix_skb_len(skb);
2769 			last = skb;
2770 			last_len = skb->len;
2771 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2772 			if (!skb)
2773 				goto again;
2774 		}
2775 
2776 		unix_state_unlock(sk);
2777 
2778 		if (check_creds) {
2779 			/* Never glue messages from different writers */
2780 			if (!unix_skb_scm_eq(skb, &scm))
2781 				break;
2782 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2783 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2784 			/* Copy credentials */
2785 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2786 			unix_set_secdata(&scm, skb);
2787 			check_creds = true;
2788 		}
2789 
2790 		/* Copy address just once */
2791 		if (state->msg && state->msg->msg_name) {
2792 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2793 					 state->msg->msg_name);
2794 			unix_copy_addr(state->msg, skb->sk);
2795 
2796 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2797 							      state->msg->msg_name,
2798 							      &state->msg->msg_namelen);
2799 
2800 			sunaddr = NULL;
2801 		}
2802 
2803 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2804 		skb_get(skb);
2805 		chunk = state->recv_actor(skb, skip, chunk, state);
2806 		drop_skb = !unix_skb_len(skb);
2807 		/* skb is only safe to use if !drop_skb */
2808 		consume_skb(skb);
2809 		if (chunk < 0) {
2810 			if (copied == 0)
2811 				copied = -EFAULT;
2812 			break;
2813 		}
2814 		copied += chunk;
2815 		size -= chunk;
2816 
2817 		if (drop_skb) {
2818 			/* the skb was touched by a concurrent reader;
2819 			 * we should not expect anything from this skb
2820 			 * anymore and assume it invalid - we can be
2821 			 * sure it was dropped from the socket queue
2822 			 *
2823 			 * let's report a short read
2824 			 */
2825 			err = 0;
2826 			break;
2827 		}
2828 
2829 		/* Mark read part of skb as used */
2830 		if (!(flags & MSG_PEEK)) {
2831 			UNIXCB(skb).consumed += chunk;
2832 
2833 			sk_peek_offset_bwd(sk, chunk);
2834 
2835 			if (UNIXCB(skb).fp) {
2836 				scm_stat_del(sk, skb);
2837 				unix_detach_fds(&scm, skb);
2838 			}
2839 
2840 			if (unix_skb_len(skb))
2841 				break;
2842 
2843 			skb_unlink(skb, &sk->sk_receive_queue);
2844 			consume_skb(skb);
2845 
2846 			if (scm.fp)
2847 				break;
2848 		} else {
2849 			/* It is questionable, see note in unix_dgram_recvmsg.
2850 			 */
2851 			if (UNIXCB(skb).fp)
2852 				unix_peek_fds(&scm, skb);
2853 
2854 			sk_peek_offset_fwd(sk, chunk);
2855 
2856 			if (UNIXCB(skb).fp)
2857 				break;
2858 
2859 			skip = 0;
2860 			last = skb;
2861 			last_len = skb->len;
2862 			unix_state_lock(sk);
2863 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2864 			if (skb)
2865 				goto again;
2866 			unix_state_unlock(sk);
2867 			break;
2868 		}
2869 	} while (size);
2870 
2871 	mutex_unlock(&u->iolock);
2872 	if (state->msg)
2873 		scm_recv_unix(sock, state->msg, &scm, flags);
2874 	else
2875 		scm_destroy(&scm);
2876 out:
2877 	return copied ? : err;
2878 }
2879 
2880 static int unix_stream_read_actor(struct sk_buff *skb,
2881 				  int skip, int chunk,
2882 				  struct unix_stream_read_state *state)
2883 {
2884 	int ret;
2885 
2886 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2887 				    state->msg, chunk);
2888 	return ret ?: chunk;
2889 }
2890 
2891 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2892 			  size_t size, int flags)
2893 {
2894 	struct unix_stream_read_state state = {
2895 		.recv_actor = unix_stream_read_actor,
2896 		.socket = sk->sk_socket,
2897 		.msg = msg,
2898 		.size = size,
2899 		.flags = flags
2900 	};
2901 
2902 	return unix_stream_read_generic(&state, true);
2903 }
2904 
2905 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2906 			       size_t size, int flags)
2907 {
2908 	struct unix_stream_read_state state = {
2909 		.recv_actor = unix_stream_read_actor,
2910 		.socket = sock,
2911 		.msg = msg,
2912 		.size = size,
2913 		.flags = flags
2914 	};
2915 
2916 #ifdef CONFIG_BPF_SYSCALL
2917 	struct sock *sk = sock->sk;
2918 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2919 
2920 	if (prot != &unix_stream_proto)
2921 		return prot->recvmsg(sk, msg, size, flags, NULL);
2922 #endif
2923 	return unix_stream_read_generic(&state, true);
2924 }
2925 
2926 static int unix_stream_splice_actor(struct sk_buff *skb,
2927 				    int skip, int chunk,
2928 				    struct unix_stream_read_state *state)
2929 {
2930 	return skb_splice_bits(skb, state->socket->sk,
2931 			       UNIXCB(skb).consumed + skip,
2932 			       state->pipe, chunk, state->splice_flags);
2933 }
2934 
2935 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2936 				       struct pipe_inode_info *pipe,
2937 				       size_t size, unsigned int flags)
2938 {
2939 	struct unix_stream_read_state state = {
2940 		.recv_actor = unix_stream_splice_actor,
2941 		.socket = sock,
2942 		.pipe = pipe,
2943 		.size = size,
2944 		.splice_flags = flags,
2945 	};
2946 
2947 	if (unlikely(*ppos))
2948 		return -ESPIPE;
2949 
2950 	if (sock->file->f_flags & O_NONBLOCK ||
2951 	    flags & SPLICE_F_NONBLOCK)
2952 		state.flags = MSG_DONTWAIT;
2953 
2954 	return unix_stream_read_generic(&state, false);
2955 }
2956 
2957 static int unix_shutdown(struct socket *sock, int mode)
2958 {
2959 	struct sock *sk = sock->sk;
2960 	struct sock *other;
2961 
2962 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2963 		return -EINVAL;
2964 	/* This maps:
2965 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2966 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2967 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2968 	 */
2969 	++mode;
2970 
2971 	unix_state_lock(sk);
2972 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2973 	other = unix_peer(sk);
2974 	if (other)
2975 		sock_hold(other);
2976 	unix_state_unlock(sk);
2977 	sk->sk_state_change(sk);
2978 
2979 	if (other &&
2980 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2981 
2982 		int peer_mode = 0;
2983 		const struct proto *prot = READ_ONCE(other->sk_prot);
2984 
2985 		if (prot->unhash)
2986 			prot->unhash(other);
2987 		if (mode&RCV_SHUTDOWN)
2988 			peer_mode |= SEND_SHUTDOWN;
2989 		if (mode&SEND_SHUTDOWN)
2990 			peer_mode |= RCV_SHUTDOWN;
2991 		unix_state_lock(other);
2992 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2993 		unix_state_unlock(other);
2994 		other->sk_state_change(other);
2995 		if (peer_mode == SHUTDOWN_MASK)
2996 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2997 		else if (peer_mode & RCV_SHUTDOWN)
2998 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2999 	}
3000 	if (other)
3001 		sock_put(other);
3002 
3003 	return 0;
3004 }
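
/* Userspace view (sketch): shutdown(fd, SHUT_WR) on a connected stream or
 * seqpacket socket sets SEND_SHUTDOWN here and RCV_SHUTDOWN on the peer,
 * so the peer's read() returns 0 (EOF) once its receive queue drains, and
 * further writes on fd fail with EPIPE.
 */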
3005 
3006 long unix_inq_len(struct sock *sk)
3007 {
3008 	struct sk_buff *skb;
3009 	long amount = 0;
3010 
3011 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3012 		return -EINVAL;
3013 
3014 	spin_lock(&sk->sk_receive_queue.lock);
3015 	if (sk->sk_type == SOCK_STREAM ||
3016 	    sk->sk_type == SOCK_SEQPACKET) {
3017 		skb_queue_walk(&sk->sk_receive_queue, skb)
3018 			amount += unix_skb_len(skb);
3019 	} else {
3020 		skb = skb_peek(&sk->sk_receive_queue);
3021 		if (skb)
3022 			amount = skb->len;
3023 	}
3024 	spin_unlock(&sk->sk_receive_queue.lock);
3025 
3026 	return amount;
3027 }
3028 EXPORT_SYMBOL_GPL(unix_inq_len);
3029 
3030 long unix_outq_len(struct sock *sk)
3031 {
3032 	return sk_wmem_alloc_get(sk);
3033 }
3034 EXPORT_SYMBOL_GPL(unix_outq_len);
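
/* These two back the SIOCINQ/SIOCOUTQ ioctls below: for stream and
 * seqpacket sockets SIOCINQ reports the total unread bytes across all
 * queued skbs, for datagram sockets only the size of the next datagram,
 * while SIOCOUTQ reports bytes sent but not yet consumed by the receiver.
 */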
3035 
3036 static int unix_open_file(struct sock *sk)
3037 {
3038 	struct path path;
3039 	struct file *f;
3040 	int fd;
3041 
3042 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3043 		return -EPERM;
3044 
3045 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3046 		return -ENOENT;
3047 
3048 	path = unix_sk(sk)->path;
3049 	if (!path.dentry)
3050 		return -ENOENT;
3051 
3052 	path_get(&path);
3053 
3054 	fd = get_unused_fd_flags(O_CLOEXEC);
3055 	if (fd < 0)
3056 		goto out;
3057 
3058 	f = dentry_open(&path, O_PATH, current_cred());
3059 	if (IS_ERR(f)) {
3060 		put_unused_fd(fd);
3061 		fd = PTR_ERR(f);
3062 		goto out;
3063 	}
3064 
3065 	fd_install(fd, f);
3066 out:
3067 	path_put(&path);
3068 
3069 	return fd;
3070 }
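
/* Userspace sketch for SIOCUNIXFILE (needs CAP_NET_ADMIN): the ioctl
 * returns a new O_PATH descriptor referring to the filesystem object the
 * socket is bound to, e.g.
 *
 *	int pathfd = ioctl(sockfd, SIOCUNIXFILE, 0);
 *	// fstatat(pathfd, "", &st, AT_EMPTY_PATH) now works on the bound path
 */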
3071 
3072 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3073 {
3074 	struct sock *sk = sock->sk;
3075 	long amount = 0;
3076 	int err;
3077 
3078 	switch (cmd) {
3079 	case SIOCOUTQ:
3080 		amount = unix_outq_len(sk);
3081 		err = put_user(amount, (int __user *)arg);
3082 		break;
3083 	case SIOCINQ:
3084 		amount = unix_inq_len(sk);
3085 		if (amount < 0)
3086 			err = amount;
3087 		else
3088 			err = put_user(amount, (int __user *)arg);
3089 		break;
3090 	case SIOCUNIXFILE:
3091 		err = unix_open_file(sk);
3092 		break;
3093 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3094 	case SIOCATMARK:
3095 		{
3096 			struct sk_buff *skb;
3097 			int answ = 0;
3098 
3099 			skb = skb_peek(&sk->sk_receive_queue);
3100 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3101 				answ = 1;
3102 			err = put_user(answ, (int __user *)arg);
3103 		}
3104 		break;
3105 #endif
3106 	default:
3107 		err = -ENOIOCTLCMD;
3108 		break;
3109 	}
3110 	return err;
3111 }
3112 
3113 #ifdef CONFIG_COMPAT
3114 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3115 {
3116 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3117 }
3118 #endif
3119 
3120 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3121 {
3122 	struct sock *sk = sock->sk;
3123 	unsigned char state;
3124 	__poll_t mask;
3125 	u8 shutdown;
3126 
3127 	sock_poll_wait(file, sock, wait);
3128 	mask = 0;
3129 	shutdown = READ_ONCE(sk->sk_shutdown);
3130 	state = READ_ONCE(sk->sk_state);
3131 
3132 	/* exceptional events? */
3133 	if (READ_ONCE(sk->sk_err))
3134 		mask |= EPOLLERR;
3135 	if (shutdown == SHUTDOWN_MASK)
3136 		mask |= EPOLLHUP;
3137 	if (shutdown & RCV_SHUTDOWN)
3138 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3139 
3140 	/* readable? */
3141 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3142 		mask |= EPOLLIN | EPOLLRDNORM;
3143 	if (sk_is_readable(sk))
3144 		mask |= EPOLLIN | EPOLLRDNORM;
3145 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3146 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3147 		mask |= EPOLLPRI;
3148 #endif
3149 
3150 	/* Connection-based sockets need to check for termination and startup */
3151 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3152 	    state == TCP_CLOSE)
3153 		mask |= EPOLLHUP;
3154 
3155 	/*
3156 	 * We also set writable when the other side has shut down the
3157 	 * connection. This prevents stuck sockets.
3158 	 */
3159 	if (unix_writable(sk, state))
3160 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3161 
3162 	return mask;
3163 }
3164 
3165 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3166 				    poll_table *wait)
3167 {
3168 	struct sock *sk = sock->sk, *other;
3169 	unsigned int writable;
3170 	unsigned char state;
3171 	__poll_t mask;
3172 	u8 shutdown;
3173 
3174 	sock_poll_wait(file, sock, wait);
3175 	mask = 0;
3176 	shutdown = READ_ONCE(sk->sk_shutdown);
3177 	state = READ_ONCE(sk->sk_state);
3178 
3179 	/* exceptional events? */
3180 	if (READ_ONCE(sk->sk_err) ||
3181 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3182 		mask |= EPOLLERR |
3183 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3184 
3185 	if (shutdown & RCV_SHUTDOWN)
3186 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3187 	if (shutdown == SHUTDOWN_MASK)
3188 		mask |= EPOLLHUP;
3189 
3190 	/* readable? */
3191 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3192 		mask |= EPOLLIN | EPOLLRDNORM;
3193 	if (sk_is_readable(sk))
3194 		mask |= EPOLLIN | EPOLLRDNORM;
3195 
3196 	/* Connection-based sockets need to check for termination and startup */
3197 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3198 		mask |= EPOLLHUP;
3199 
3200 	/* No write status requested, avoid expensive OUT tests. */
3201 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3202 		return mask;
3203 
3204 	writable = unix_writable(sk, state);
3205 	if (writable) {
3206 		unix_state_lock(sk);
3207 
3208 		other = unix_peer(sk);
3209 		if (other && unix_peer(other) != sk &&
3210 		    unix_recvq_full_lockless(other) &&
3211 		    unix_dgram_peer_wake_me(sk, other))
3212 			writable = 0;
3213 
3214 		unix_state_unlock(sk);
3215 	}
3216 
3217 	if (writable)
3218 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3219 	else
3220 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3221 
3222 	return mask;
3223 }
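
/* For datagram sockets writability also depends on the connected peer: if
 * the peer's receive queue is full, this socket is reported unwritable and
 * unix_dgram_peer_wake_me() has queued it on the peer's wait queue, so a
 * later EPOLLOUT wakeup fires once the peer drains its queue.
 */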
3224 
3225 #ifdef CONFIG_PROC_FS
3226 
3227 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3228 
3229 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3230 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3231 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
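
/* The seq_file position packs a hash-bucket index in the upper bits and a
 * 1-based offset within that bucket in the lower BUCKET_SPACE bits;
 * get_bucket()/get_offset() simply undo set_bucket_offset().
 */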
3232 
3233 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3234 {
3235 	unsigned long offset = get_offset(*pos);
3236 	unsigned long bucket = get_bucket(*pos);
3237 	unsigned long count = 0;
3238 	struct sock *sk;
3239 
3240 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3241 	     sk; sk = sk_next(sk)) {
3242 		if (++count == offset)
3243 			break;
3244 	}
3245 
3246 	return sk;
3247 }
3248 
3249 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3250 {
3251 	unsigned long bucket = get_bucket(*pos);
3252 	struct net *net = seq_file_net(seq);
3253 	struct sock *sk;
3254 
3255 	while (bucket < UNIX_HASH_SIZE) {
3256 		spin_lock(&net->unx.table.locks[bucket]);
3257 
3258 		sk = unix_from_bucket(seq, pos);
3259 		if (sk)
3260 			return sk;
3261 
3262 		spin_unlock(&net->unx.table.locks[bucket]);
3263 
3264 		*pos = set_bucket_offset(++bucket, 1);
3265 	}
3266 
3267 	return NULL;
3268 }
3269 
3270 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3271 				  loff_t *pos)
3272 {
3273 	unsigned long bucket = get_bucket(*pos);
3274 
3275 	sk = sk_next(sk);
3276 	if (sk)
3277 		return sk;
3278 
3279 
3281 
3282 	*pos = set_bucket_offset(++bucket, 1);
3283 
3284 	return unix_get_first(seq, pos);
3285 }
3286 
3287 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3288 {
3289 	if (!*pos)
3290 		return SEQ_START_TOKEN;
3291 
3292 	return unix_get_first(seq, pos);
3293 }
3294 
3295 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3296 {
3297 	++*pos;
3298 
3299 	if (v == SEQ_START_TOKEN)
3300 		return unix_get_first(seq, pos);
3301 
3302 	return unix_get_next(seq, v, pos);
3303 }
3304 
3305 static void unix_seq_stop(struct seq_file *seq, void *v)
3306 {
3307 	struct sock *sk = v;
3308 
3309 	if (sk)
3310 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3311 }
3312 
3313 static int unix_seq_show(struct seq_file *seq, void *v)
3314 {
3316 	if (v == SEQ_START_TOKEN)
3317 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3318 			 "Inode Path\n");
3319 	else {
3320 		struct sock *s = v;
3321 		struct unix_sock *u = unix_sk(s);
3322 		unix_state_lock(s);
3323 
3324 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3325 			s,
3326 			refcount_read(&s->sk_refcnt),
3327 			0,
3328 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3329 			s->sk_type,
3330 			s->sk_socket ?
3331 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3332 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3333 			sock_i_ino(s));
3334 
3335 		if (u->addr) {	/* under the hash table lock here */
3336 			int i, len;
3337 			seq_putc(seq, ' ');
3338 
3339 			i = 0;
3340 			len = u->addr->len -
3341 				offsetof(struct sockaddr_un, sun_path);
3342 			if (u->addr->name->sun_path[0]) {
3343 				len--;
3344 			} else {
3345 				seq_putc(seq, '@');
3346 				i++;
3347 			}
3348 			for ( ; i < len; i++)
3349 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3350 					 '@');
3351 		}
3352 		unix_state_unlock(s);
3353 		seq_putc(seq, '\n');
3354 	}
3355 
3356 	return 0;
3357 }
3358 
3359 static const struct seq_operations unix_seq_ops = {
3360 	.start  = unix_seq_start,
3361 	.next   = unix_seq_next,
3362 	.stop   = unix_seq_stop,
3363 	.show   = unix_seq_show,
3364 };
3365 
3366 #ifdef CONFIG_BPF_SYSCALL
3367 struct bpf_unix_iter_state {
3368 	struct seq_net_private p;
3369 	unsigned int cur_sk;
3370 	unsigned int end_sk;
3371 	unsigned int max_sk;
3372 	struct sock **batch;
3373 	bool st_bucket_done;
3374 };
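
/* The bpf iterator copies one bucket's sockets into "batch" (taking a
 * reference on each) and then drops the bucket spinlock, so the attached
 * bpf program can run, and even call sleeping helpers such as
 * bpf_setsockopt(), without the lock held.
 */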
3375 
3376 struct bpf_iter__unix {
3377 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3378 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3379 	uid_t uid __aligned(8);
3380 };
3381 
3382 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3383 			      struct unix_sock *unix_sk, uid_t uid)
3384 {
3385 	struct bpf_iter__unix ctx;
3386 
3387 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3388 	ctx.meta = meta;
3389 	ctx.unix_sk = unix_sk;
3390 	ctx.uid = uid;
3391 	return bpf_iter_run_prog(prog, &ctx);
3392 }
3393 
3394 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3396 {
3397 	struct bpf_unix_iter_state *iter = seq->private;
3398 	unsigned int expected = 1;
3399 	struct sock *sk;
3400 
3401 	sock_hold(start_sk);
3402 	iter->batch[iter->end_sk++] = start_sk;
3403 
3404 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3405 		if (iter->end_sk < iter->max_sk) {
3406 			sock_hold(sk);
3407 			iter->batch[iter->end_sk++] = sk;
3408 		}
3409 
3410 		expected++;
3411 	}
3412 
3413 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3414 
3415 	return expected;
3416 }
3417 
3418 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3419 {
3420 	while (iter->cur_sk < iter->end_sk)
3421 		sock_put(iter->batch[iter->cur_sk++]);
3422 }
3423 
3424 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3425 				       unsigned int new_batch_sz)
3426 {
3427 	struct sock **new_batch;
3428 
3429 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3430 			     GFP_USER | __GFP_NOWARN);
3431 	if (!new_batch)
3432 		return -ENOMEM;
3433 
3434 	bpf_iter_unix_put_batch(iter);
3435 	kvfree(iter->batch);
3436 	iter->batch = new_batch;
3437 	iter->max_sk = new_batch_sz;
3438 
3439 	return 0;
3440 }
3441 
3442 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3443 					loff_t *pos)
3444 {
3445 	struct bpf_unix_iter_state *iter = seq->private;
3446 	unsigned int expected;
3447 	bool resized = false;
3448 	struct sock *sk;
3449 
3450 	if (iter->st_bucket_done)
3451 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3452 
3453 again:
3454 	/* Get a new batch */
3455 	iter->cur_sk = 0;
3456 	iter->end_sk = 0;
3457 
3458 	sk = unix_get_first(seq, pos);
3459 	if (!sk)
3460 		return NULL; /* Done */
3461 
3462 	expected = bpf_iter_unix_hold_batch(seq, sk);
3463 
3464 	if (iter->end_sk == expected) {
3465 		iter->st_bucket_done = true;
3466 		return sk;
3467 	}
3468 
3469 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3470 		resized = true;
3471 		goto again;
3472 	}
3473 
3474 	return sk;
3475 }
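
/* If the bucket holds more sockets than the current batch array can take,
 * the array is grown to 3/2 of the expected count and the bucket is
 * re-scanned once; the "resized" flag prevents endless retries.
 */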
3476 
3477 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3478 {
3479 	if (!*pos)
3480 		return SEQ_START_TOKEN;
3481 
3482 	/* bpf iter does not support lseek, so it always
3483 	 * continues from where it was stop()-ped.
3484 	 */
3485 	return bpf_iter_unix_batch(seq, pos);
3486 }
3487 
3488 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3489 {
3490 	struct bpf_unix_iter_state *iter = seq->private;
3491 	struct sock *sk;
3492 
3493 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
3494 	 * done with seq_show(), so advance to the next sk in
3495 	 * the batch.
3496 	 */
3497 	if (iter->cur_sk < iter->end_sk)
3498 		sock_put(iter->batch[iter->cur_sk++]);
3499 
3500 	++*pos;
3501 
3502 	if (iter->cur_sk < iter->end_sk)
3503 		sk = iter->batch[iter->cur_sk];
3504 	else
3505 		sk = bpf_iter_unix_batch(seq, pos);
3506 
3507 	return sk;
3508 }
3509 
3510 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3511 {
3512 	struct bpf_iter_meta meta;
3513 	struct bpf_prog *prog;
3514 	struct sock *sk = v;
3515 	uid_t uid;
3516 	bool slow;
3517 	int ret;
3518 
3519 	if (v == SEQ_START_TOKEN)
3520 		return 0;
3521 
3522 	slow = lock_sock_fast(sk);
3523 
3524 	if (unlikely(sk_unhashed(sk))) {
3525 		ret = SEQ_SKIP;
3526 		goto unlock;
3527 	}
3528 
3529 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3530 	meta.seq = seq;
3531 	prog = bpf_iter_get_info(&meta, false);
3532 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3533 unlock:
3534 	unlock_sock_fast(sk, slow);
3535 	return ret;
3536 }
3537 
3538 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3539 {
3540 	struct bpf_unix_iter_state *iter = seq->private;
3541 	struct bpf_iter_meta meta;
3542 	struct bpf_prog *prog;
3543 
3544 	if (!v) {
3545 		meta.seq = seq;
3546 		prog = bpf_iter_get_info(&meta, true);
3547 		if (prog)
3548 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3549 	}
3550 
3551 	if (iter->cur_sk < iter->end_sk)
3552 		bpf_iter_unix_put_batch(iter);
3553 }
3554 
3555 static const struct seq_operations bpf_iter_unix_seq_ops = {
3556 	.start	= bpf_iter_unix_seq_start,
3557 	.next	= bpf_iter_unix_seq_next,
3558 	.stop	= bpf_iter_unix_seq_stop,
3559 	.show	= bpf_iter_unix_seq_show,
3560 };
3561 #endif
3562 #endif
3563 
3564 static const struct net_proto_family unix_family_ops = {
3565 	.family = PF_UNIX,
3566 	.create = unix_create,
3567 	.owner	= THIS_MODULE,
3568 };
3569 
3570 
3571 static int __net_init unix_net_init(struct net *net)
3572 {
3573 	int i;
3574 
3575 	net->unx.sysctl_max_dgram_qlen = 10;
3576 	if (unix_sysctl_register(net))
3577 		goto out;
3578 
3579 #ifdef CONFIG_PROC_FS
3580 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3581 			     sizeof(struct seq_net_private)))
3582 		goto err_sysctl;
3583 #endif
3584 
3585 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3586 					      sizeof(spinlock_t), GFP_KERNEL);
3587 	if (!net->unx.table.locks)
3588 		goto err_proc;
3589 
3590 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3591 						sizeof(struct hlist_head),
3592 						GFP_KERNEL);
3593 	if (!net->unx.table.buckets)
3594 		goto free_locks;
3595 
3596 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3597 		spin_lock_init(&net->unx.table.locks[i]);
3598 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3599 	}
3600 
3601 	return 0;
3602 
3603 free_locks:
3604 	kvfree(net->unx.table.locks);
3605 err_proc:
3606 #ifdef CONFIG_PROC_FS
3607 	remove_proc_entry("unix", net->proc_net);
3608 err_sysctl:
3609 #endif
3610 	unix_sysctl_unregister(net);
3611 out:
3612 	return -ENOMEM;
3613 }
3614 
3615 static void __net_exit unix_net_exit(struct net *net)
3616 {
3617 	kvfree(net->unx.table.buckets);
3618 	kvfree(net->unx.table.locks);
3619 	unix_sysctl_unregister(net);
3620 	remove_proc_entry("unix", net->proc_net);
3621 }
3622 
3623 static struct pernet_operations unix_net_ops = {
3624 	.init = unix_net_init,
3625 	.exit = unix_net_exit,
3626 };
3627 
3628 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3629 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3630 		     struct unix_sock *unix_sk, uid_t uid)
3631 
3632 #define INIT_BATCH_SZ 16
3633 
3634 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3635 {
3636 	struct bpf_unix_iter_state *iter = priv_data;
3637 	int err;
3638 
3639 	err = bpf_iter_init_seq_net(priv_data, aux);
3640 	if (err)
3641 		return err;
3642 
3643 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3644 	if (err) {
3645 		bpf_iter_fini_seq_net(priv_data);
3646 		return err;
3647 	}
3648 
3649 	return 0;
3650 }
3651 
3652 static void bpf_iter_fini_unix(void *priv_data)
3653 {
3654 	struct bpf_unix_iter_state *iter = priv_data;
3655 
3656 	bpf_iter_fini_seq_net(priv_data);
3657 	kvfree(iter->batch);
3658 }
3659 
3660 static const struct bpf_iter_seq_info unix_seq_info = {
3661 	.seq_ops		= &bpf_iter_unix_seq_ops,
3662 	.init_seq_private	= bpf_iter_init_unix,
3663 	.fini_seq_private	= bpf_iter_fini_unix,
3664 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3665 };
3666 
3667 static const struct bpf_func_proto *
3668 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3669 			     const struct bpf_prog *prog)
3670 {
3671 	switch (func_id) {
3672 	case BPF_FUNC_setsockopt:
3673 		return &bpf_sk_setsockopt_proto;
3674 	case BPF_FUNC_getsockopt:
3675 		return &bpf_sk_getsockopt_proto;
3676 	default:
3677 		return NULL;
3678 	}
3679 }
3680 
3681 static struct bpf_iter_reg unix_reg_info = {
3682 	.target			= "unix",
3683 	.ctx_arg_info_size	= 1,
3684 	.ctx_arg_info		= {
3685 		{ offsetof(struct bpf_iter__unix, unix_sk),
3686 		  PTR_TO_BTF_ID_OR_NULL },
3687 	},
3688 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3689 	.seq_info		= &unix_seq_info,
3690 };
3691 
3692 static void __init bpf_iter_register(void)
3693 {
3694 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3695 	if (bpf_iter_reg_target(&unix_reg_info))
3696 		pr_warn("Warning: could not register bpf iterator unix\n");
3697 }
3698 #endif
3699 
3700 static int __init af_unix_init(void)
3701 {
3702 	int i, rc = -1;
3703 
3704 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3705 
3706 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3707 		spin_lock_init(&bsd_socket_locks[i]);
3708 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3709 	}
3710 
3711 	rc = proto_register(&unix_dgram_proto, 1);
3712 	if (rc != 0) {
3713 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3714 		goto out;
3715 	}
3716 
3717 	rc = proto_register(&unix_stream_proto, 1);
3718 	if (rc != 0) {
3719 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3720 		proto_unregister(&unix_dgram_proto);
3721 		goto out;
3722 	}
3723 
3724 	sock_register(&unix_family_ops);
3725 	register_pernet_subsys(&unix_net_ops);
3726 	unix_bpf_build_proto();
3727 
3728 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3729 	bpf_iter_register();
3730 #endif
3731 
3732 out:
3733 	return rc;
3734 }
3735 
3736 /* Later than subsys_initcall() because we depend on stuff initialised there */
3737 fs_initcall(af_unix_init);
3738