xref: /linux/net/unix/af_unix.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
/* Count of AF_UNIX sockets: bumped in unix_create1(), dropped again in
 * unix_sock_destructor().  unix_create1() fails with -ENFILE once the
 * count exceeds 2 * get_max_files().
 */
static atomic_long_t unix_nr_socks;
/* Global table of sockets linked through sk_bind_node, plus one spinlock
 * per bucket.  unix_find_socket_byinode() searches the bucket selected by
 * unix_bsd_hash() of the inode.
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
/* Hash for an inode-backed socket: the inode number masked with
 * UNIX_HASH_MOD.
 */
static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	if (hash1 == hash2) {
163 		spin_lock(&net->unx.table.locks[hash1]);
164 		return;
165 	}
166 
167 	if (hash1 > hash2)
168 		swap(hash1, hash2);
169 
170 	spin_lock(&net->unx.table.locks[hash1]);
171 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173 
174 static void unix_table_double_unlock(struct net *net,
175 				     unsigned int hash1, unsigned int hash2)
176 {
177 	if (hash1 == hash2) {
178 		spin_unlock(&net->unx.table.locks[hash1]);
179 		return;
180 	}
181 
182 	spin_unlock(&net->unx.table.locks[hash1]);
183 	spin_unlock(&net->unx.table.locks[hash2]);
184 }
185 
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189 	UNIXCB(skb).secid = scm->secid;
190 }
191 
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194 	scm->secid = UNIXCB(skb).secid;
195 }
196 
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199 	return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204 
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207 
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210 	return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213 
/* Non-zero when the peer recorded on @osk is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
218 
/* Non-zero when @osk either has no peer at all or its peer is @sk. */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
223 
224 static inline int unix_recvq_full(const struct sock *sk)
225 {
226 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228 
229 static inline int unix_recvq_full_lockless(const struct sock *sk)
230 {
231 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
232 		READ_ONCE(sk->sk_max_ack_backlog);
233 }
234 
235 struct sock *unix_peer_get(struct sock *s)
236 {
237 	struct sock *peer;
238 
239 	unix_state_lock(s);
240 	peer = unix_peer(s);
241 	if (peer)
242 		sock_hold(peer);
243 	unix_state_unlock(s);
244 	return peer;
245 }
246 EXPORT_SYMBOL_GPL(unix_peer_get);
247 
248 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
249 					     int addr_len)
250 {
251 	struct unix_address *addr;
252 
253 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
254 	if (!addr)
255 		return NULL;
256 
257 	refcount_set(&addr->refcnt, 1);
258 	addr->len = addr_len;
259 	memcpy(addr->name, sunaddr, addr_len);
260 
261 	return addr;
262 }
263 
264 static inline void unix_release_addr(struct unix_address *addr)
265 {
266 	if (refcount_dec_and_test(&addr->refcnt))
267 		kfree(addr);
268 }
269 
/*
 *	Sanity-check a unix socket address handed in by the caller:
 *		- it must be longer than the bare address header, i.e. carry
 *		  at least one byte of sun_path;
 *		- it must not be larger than struct sockaddr_un;
 *		- its family must be AF_UNIX.
 *	Returns 0 when the address passes, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	const size_t header_len = offsetof(struct sockaddr_un, sun_path);
	const size_t max_len = sizeof(*sunaddr);

	if (addr_len <= header_len)
		return -EINVAL;

	if (addr_len > max_len)
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
288 
289 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
290 {
291 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
292 	short offset = offsetof(struct sockaddr_storage, __data);
293 
294 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
295 
296 	/* This may look like an off by one error but it is a bit more
297 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
298 	 * sun_path[108] doesn't as such exist.  However in kernel space
299 	 * we are guaranteed that it is a valid memory location in our
300 	 * kernel address buffer because syscall functions always pass
301 	 * a pointer of struct sockaddr_storage which has a bigger buffer
302 	 * than 108.  Also, we must terminate sun_path for strlen() in
303 	 * getname_kernel().
304 	 */
305 	addr->__data[addr_len - offset] = 0;
306 
307 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
308 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
309 	 * know the actual buffer.
310 	 */
311 	return strlen(addr->__data) + offset + 1;
312 }
313 
/* Unlink @sk from its hash chain via sk_del_node_init().  No locking is
 * done here; unix_remove_socket() wraps this call in the bucket lock.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
318 
319 static void __unix_insert_socket(struct net *net, struct sock *sk)
320 {
321 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
322 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
323 }
324 
/* Give @sk the address @addr and move it to the bucket for @hash.
 *
 * The socket is unlinked from its current chain, the address pointer is
 * published with release semantics, and only then is sk_hash updated and
 * the socket re-inserted.  No locking is done here.
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	/* release store: publish the address pointer */
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}
334 
335 static void unix_remove_socket(struct net *net, struct sock *sk)
336 {
337 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
338 	__unix_remove_socket(sk);
339 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340 }
341 
342 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
343 {
344 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
345 	__unix_insert_socket(net, sk);
346 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
347 }
348 
349 static void unix_insert_bsd_socket(struct sock *sk)
350 {
351 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
352 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
353 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
354 }
355 
356 static void unix_remove_bsd_socket(struct sock *sk)
357 {
358 	if (!hlist_unhashed(&sk->sk_bind_node)) {
359 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
360 		__sk_del_bind_node(sk);
361 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
362 
363 		sk_node_init(&sk->sk_bind_node);
364 	}
365 }
366 
367 static struct sock *__unix_find_socket_byname(struct net *net,
368 					      struct sockaddr_un *sunname,
369 					      int len, unsigned int hash)
370 {
371 	struct sock *s;
372 
373 	sk_for_each(s, &net->unx.table.buckets[hash]) {
374 		struct unix_sock *u = unix_sk(s);
375 
376 		if (u->addr->len == len &&
377 		    !memcmp(u->addr->name, sunname, len))
378 			return s;
379 	}
380 	return NULL;
381 }
382 
383 static inline struct sock *unix_find_socket_byname(struct net *net,
384 						   struct sockaddr_un *sunname,
385 						   int len, unsigned int hash)
386 {
387 	struct sock *s;
388 
389 	spin_lock(&net->unx.table.locks[hash]);
390 	s = __unix_find_socket_byname(net, sunname, len, hash);
391 	if (s)
392 		sock_hold(s);
393 	spin_unlock(&net->unx.table.locks[hash]);
394 	return s;
395 }
396 
397 static struct sock *unix_find_socket_byinode(struct inode *i)
398 {
399 	unsigned int hash = unix_bsd_hash(i);
400 	struct sock *s;
401 
402 	spin_lock(&bsd_socket_locks[hash]);
403 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
404 		struct dentry *dentry = unix_sk(s)->path.dentry;
405 
406 		if (dentry && d_backing_inode(dentry) == i) {
407 			sock_hold(s);
408 			spin_unlock(&bsd_socket_locks[hash]);
409 			return s;
410 		}
411 	}
412 	spin_unlock(&bsd_socket_locks[hash]);
413 	return NULL;
414 }
415 
416 /* Support code for asymmetrically connected dgram sockets
417  *
418  * If a datagram socket is connected to a socket not itself connected
419  * to the first socket (eg, /dev/log), clients may only enqueue more
420  * messages if the present receive queue of the server socket is not
421  * "too large". This means there's a second writeability condition
422  * poll and sendmsg need to test. The dgram recv code will do a wake
423  * up on the peer_wait wait queue of a socket upon reception of a
424  * datagram which needs to be propagated to sleeping would-be writers
425  * since these might not have sent anything so far. This can't be
426  * accomplished via poll_wait because the lifetime of the server
427  * socket might be less than that of its clients if these break their
428  * association with it or if the server socket is closed while clients
429  * are still connected to it and there's no way to inform "a polling
430  * implementation" that it should let go of a certain wait queue
431  *
432  * In order to propagate a wake up, a wait_queue_entry_t of the client
433  * socket is enqueued on the peer_wait queue of the server socket
434  * whose wake function does a wake_up on the ordinary client socket
435  * wait queue. This connection is established whenever a write (or
436  * poll for write) hit the flow control condition and broken when the
437  * association to the server socket is dissolved or after a wake up
438  * was relayed.
439  */
440 
441 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
442 				      void *key)
443 {
444 	struct unix_sock *u;
445 	wait_queue_head_t *u_sleep;
446 
447 	u = container_of(q, struct unix_sock, peer_wake);
448 
449 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
450 			    q);
451 	u->peer_wake.private = NULL;
452 
453 	/* relaying can only happen while the wq still exists */
454 	u_sleep = sk_sleep(&u->sk);
455 	if (u_sleep)
456 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
457 
458 	return 0;
459 }
460 
461 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
462 {
463 	struct unix_sock *u, *u_other;
464 	int rc;
465 
466 	u = unix_sk(sk);
467 	u_other = unix_sk(other);
468 	rc = 0;
469 	spin_lock(&u_other->peer_wait.lock);
470 
471 	if (!u->peer_wake.private) {
472 		u->peer_wake.private = other;
473 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
474 
475 		rc = 1;
476 	}
477 
478 	spin_unlock(&u_other->peer_wait.lock);
479 	return rc;
480 }
481 
482 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
483 					    struct sock *other)
484 {
485 	struct unix_sock *u, *u_other;
486 
487 	u = unix_sk(sk);
488 	u_other = unix_sk(other);
489 	spin_lock(&u_other->peer_wait.lock);
490 
491 	if (u->peer_wake.private == other) {
492 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
493 		u->peer_wake.private = NULL;
494 	}
495 
496 	spin_unlock(&u_other->peer_wait.lock);
497 }
498 
/* Break the wake-up association between @sk and @other, then wake sleepers
 * on @sk's own wait queue with the write-readiness poll bits.
 */
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}
508 
509 /* preconditions:
510  *	- unix_peer(sk) == other
511  *	- association is stable
512  */
513 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
514 {
515 	int connected;
516 
517 	connected = unix_dgram_peer_wake_connect(sk, other);
518 
519 	/* If other is SOCK_DEAD, we want to make sure we signal
520 	 * POLLOUT, such that a subsequent write() can get a
521 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
522 	 * to other and its full, we will hang waiting for POLLOUT.
523 	 */
524 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
525 		return 1;
526 
527 	if (connected)
528 		unix_dgram_peer_wake_disconnect(sk, other);
529 
530 	return 0;
531 }
532 
533 static int unix_writable(const struct sock *sk)
534 {
535 	return sk->sk_state != TCP_LISTEN &&
536 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
537 }
538 
539 static void unix_write_space(struct sock *sk)
540 {
541 	struct socket_wq *wq;
542 
543 	rcu_read_lock();
544 	if (unix_writable(sk)) {
545 		wq = rcu_dereference(sk->sk_wq);
546 		if (skwq_has_sleeper(wq))
547 			wake_up_interruptible_sync_poll(&wq->wait,
548 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
549 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
550 	}
551 	rcu_read_unlock();
552 }
553 
554 /* When dgram socket disconnects (or changes its peer), we clear its receive
555  * queue of packets arrived from previous peer. First, it allows to do
556  * flow control based only on wmem_alloc; second, sk connected to peer
557  * may receive messages only from that peer. */
558 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
559 {
560 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
561 		skb_queue_purge(&sk->sk_receive_queue);
562 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
563 
564 		/* If one link of bidirectional dgram pipe is disconnected,
565 		 * we signal error. Messages are lost. Do not make this,
566 		 * when peer was not connected to us.
567 		 */
568 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
569 			WRITE_ONCE(other->sk_err, ECONNRESET);
570 			sk_error_report(other);
571 		}
572 	}
573 	other->sk_state = TCP_CLOSE;
574 }
575 
576 static void unix_sock_destructor(struct sock *sk)
577 {
578 	struct unix_sock *u = unix_sk(sk);
579 
580 	skb_queue_purge(&sk->sk_receive_queue);
581 
582 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
583 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
584 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
585 	if (!sock_flag(sk, SOCK_DEAD)) {
586 		pr_info("Attempt to release alive unix socket: %p\n", sk);
587 		return;
588 	}
589 
590 	if (u->addr)
591 		unix_release_addr(u->addr);
592 
593 	atomic_long_dec(&unix_nr_socks);
594 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
595 #ifdef UNIX_REFCNT_DEBUG
596 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
597 		atomic_long_read(&unix_nr_socks));
598 #endif
599 }
600 
601 static void unix_release_sock(struct sock *sk, int embrion)
602 {
603 	struct unix_sock *u = unix_sk(sk);
604 	struct sock *skpair;
605 	struct sk_buff *skb;
606 	struct path path;
607 	int state;
608 
609 	unix_remove_socket(sock_net(sk), sk);
610 	unix_remove_bsd_socket(sk);
611 
612 	/* Clear state */
613 	unix_state_lock(sk);
614 	sock_orphan(sk);
615 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
616 	path	     = u->path;
617 	u->path.dentry = NULL;
618 	u->path.mnt = NULL;
619 	state = sk->sk_state;
620 	sk->sk_state = TCP_CLOSE;
621 
622 	skpair = unix_peer(sk);
623 	unix_peer(sk) = NULL;
624 
625 	unix_state_unlock(sk);
626 
627 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
628 	if (u->oob_skb) {
629 		kfree_skb(u->oob_skb);
630 		u->oob_skb = NULL;
631 	}
632 #endif
633 
634 	wake_up_interruptible_all(&u->peer_wait);
635 
636 	if (skpair != NULL) {
637 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
638 			unix_state_lock(skpair);
639 			/* No more writes */
640 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
641 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
642 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
643 			unix_state_unlock(skpair);
644 			skpair->sk_state_change(skpair);
645 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
646 		}
647 
648 		unix_dgram_peer_wake_disconnect(sk, skpair);
649 		sock_put(skpair); /* It may now die */
650 	}
651 
652 	/* Try to flush out this socket. Throw out buffers at least */
653 
654 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
655 		if (state == TCP_LISTEN)
656 			unix_release_sock(skb->sk, 1);
657 		/* passed fds are erased in the kfree_skb hook	      */
658 		UNIXCB(skb).consumed = skb->len;
659 		kfree_skb(skb);
660 	}
661 
662 	if (path.dentry)
663 		path_put(&path);
664 
665 	sock_put(sk);
666 
667 	/* ---- Socket is dead now and most probably destroyed ---- */
668 
669 	/*
670 	 * Fixme: BSD difference: In BSD all sockets connected to us get
671 	 *	  ECONNRESET and we die on the spot. In Linux we behave
672 	 *	  like files and pipes do and wait for the last
673 	 *	  dereference.
674 	 *
675 	 * Can't we simply set sock->err?
676 	 *
677 	 *	  What the above comment does talk about? --ANK(980817)
678 	 */
679 
680 	if (READ_ONCE(unix_tot_inflight))
681 		unix_gc();		/* Garbage collect fds */
682 }
683 
684 static void init_peercred(struct sock *sk)
685 {
686 	const struct cred *old_cred;
687 	struct pid *old_pid;
688 
689 	spin_lock(&sk->sk_peer_lock);
690 	old_pid = sk->sk_peer_pid;
691 	old_cred = sk->sk_peer_cred;
692 	sk->sk_peer_pid  = get_pid(task_tgid(current));
693 	sk->sk_peer_cred = get_current_cred();
694 	spin_unlock(&sk->sk_peer_lock);
695 
696 	put_pid(old_pid);
697 	put_cred(old_cred);
698 }
699 
700 static void copy_peercred(struct sock *sk, struct sock *peersk)
701 {
702 	const struct cred *old_cred;
703 	struct pid *old_pid;
704 
705 	if (sk < peersk) {
706 		spin_lock(&sk->sk_peer_lock);
707 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
708 	} else {
709 		spin_lock(&peersk->sk_peer_lock);
710 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
711 	}
712 	old_pid = sk->sk_peer_pid;
713 	old_cred = sk->sk_peer_cred;
714 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
715 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
716 
717 	spin_unlock(&sk->sk_peer_lock);
718 	spin_unlock(&peersk->sk_peer_lock);
719 
720 	put_pid(old_pid);
721 	put_cred(old_cred);
722 }
723 
724 static int unix_listen(struct socket *sock, int backlog)
725 {
726 	int err;
727 	struct sock *sk = sock->sk;
728 	struct unix_sock *u = unix_sk(sk);
729 
730 	err = -EOPNOTSUPP;
731 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
732 		goto out;	/* Only stream/seqpacket sockets accept */
733 	err = -EINVAL;
734 	if (!u->addr)
735 		goto out;	/* No listens on an unbound socket */
736 	unix_state_lock(sk);
737 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
738 		goto out_unlock;
739 	if (backlog > sk->sk_max_ack_backlog)
740 		wake_up_interruptible_all(&u->peer_wait);
741 	sk->sk_max_ack_backlog	= backlog;
742 	sk->sk_state		= TCP_LISTEN;
743 	/* set credentials so connect can copy them */
744 	init_peercred(sk);
745 	err = 0;
746 
747 out_unlock:
748 	unix_state_unlock(sk);
749 out:
750 	return err;
751 }
752 
/* Forward declarations of the handlers referenced by the proto_ops tables
 * (unix_stream_ops, unix_dgram_ops, unix_seqpacket_ops) below.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
782 
783 #ifdef CONFIG_PROC_FS
784 static int unix_count_nr_fds(struct sock *sk)
785 {
786 	struct sk_buff *skb;
787 	struct unix_sock *u;
788 	int nr_fds = 0;
789 
790 	spin_lock(&sk->sk_receive_queue.lock);
791 	skb = skb_peek(&sk->sk_receive_queue);
792 	while (skb) {
793 		u = unix_sk(skb->sk);
794 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
795 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
796 	}
797 	spin_unlock(&sk->sk_receive_queue.lock);
798 
799 	return nr_fds;
800 }
801 
802 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
803 {
804 	struct sock *sk = sock->sk;
805 	unsigned char s_state;
806 	struct unix_sock *u;
807 	int nr_fds = 0;
808 
809 	if (sk) {
810 		s_state = READ_ONCE(sk->sk_state);
811 		u = unix_sk(sk);
812 
813 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
814 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
815 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
816 		 */
817 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
818 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
819 		else if (s_state == TCP_LISTEN)
820 			nr_fds = unix_count_nr_fds(sk);
821 
822 		seq_printf(m, "scm_fds: %u\n", nr_fds);
823 	}
824 }
825 #else
826 #define unix_show_fdinfo NULL
827 #endif
828 
/* Socket operations for SOCK_STREAM sockets (selected in unix_create()). */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
853 
/* Socket operations for SOCK_DGRAM sockets (selected in unix_create(),
 * which also maps SOCK_RAW onto SOCK_DGRAM).  accept and listen use the
 * sock_no_* handlers.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
877 
/* Socket operations for SOCK_SEQPACKET sockets (selected in unix_create()).
 * Connection handling (connect/accept/listen) is shared with the stream
 * table, poll is shared with the datagram table, and sendmsg/recvmsg have
 * seqpacket-specific handlers.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.set_peek_off =	sk_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
900 
/* ->close() hook of the unix protos; intentionally empty.  It is invoked
 * from unix_release() right before unix_release_sock().
 */
static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}
907 
/* ->unhash() hook of unix_stream_proto; intentionally empty. */
static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}
914 
/* ->bpf_bypass_getsockopt() hook of the unix protos: returns true only for
 * the SOL_SOCKET / SO_PEERPIDFD combination, false for everything else.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level != SOL_SOCKET)
		return false;

	return optname == SO_PEERPIDFD;
}
928 
/* Protocol descriptor handed to sk_alloc() by unix_create1() for every
 * socket type other than SOCK_STREAM.
 */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};
939 
/* Protocol descriptor handed to sk_alloc() by unix_create1() for
 * SOCK_STREAM sockets.  Unlike unix_dgram_proto it also sets ->unhash.
 */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};
951 
952 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
953 {
954 	struct unix_sock *u;
955 	struct sock *sk;
956 	int err;
957 
958 	atomic_long_inc(&unix_nr_socks);
959 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
960 		err = -ENFILE;
961 		goto err;
962 	}
963 
964 	if (type == SOCK_STREAM)
965 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
966 	else /*dgram and  seqpacket */
967 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
968 
969 	if (!sk) {
970 		err = -ENOMEM;
971 		goto err;
972 	}
973 
974 	sock_init_data(sock, sk);
975 
976 	sk->sk_hash		= unix_unbound_hash(sk);
977 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
978 	sk->sk_write_space	= unix_write_space;
979 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
980 	sk->sk_destruct		= unix_sock_destructor;
981 	u = unix_sk(sk);
982 	u->inflight = 0;
983 	u->path.dentry = NULL;
984 	u->path.mnt = NULL;
985 	spin_lock_init(&u->lock);
986 	INIT_LIST_HEAD(&u->link);
987 	mutex_init(&u->iolock); /* single task reading lock */
988 	mutex_init(&u->bindlock); /* single task binding lock */
989 	init_waitqueue_head(&u->peer_wait);
990 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
991 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
992 	unix_insert_unbound_socket(net, sk);
993 
994 	sock_prot_inuse_add(net, sk->sk_prot, 1);
995 
996 	return sk;
997 
998 err:
999 	atomic_long_dec(&unix_nr_socks);
1000 	return ERR_PTR(err);
1001 }
1002 
1003 static int unix_create(struct net *net, struct socket *sock, int protocol,
1004 		       int kern)
1005 {
1006 	struct sock *sk;
1007 
1008 	if (protocol && protocol != PF_UNIX)
1009 		return -EPROTONOSUPPORT;
1010 
1011 	sock->state = SS_UNCONNECTED;
1012 
1013 	switch (sock->type) {
1014 	case SOCK_STREAM:
1015 		sock->ops = &unix_stream_ops;
1016 		break;
1017 		/*
1018 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1019 		 *	nothing uses it.
1020 		 */
1021 	case SOCK_RAW:
1022 		sock->type = SOCK_DGRAM;
1023 		fallthrough;
1024 	case SOCK_DGRAM:
1025 		sock->ops = &unix_dgram_ops;
1026 		break;
1027 	case SOCK_SEQPACKET:
1028 		sock->ops = &unix_seqpacket_ops;
1029 		break;
1030 	default:
1031 		return -ESOCKTNOSUPPORT;
1032 	}
1033 
1034 	sk = unix_create1(net, sock, kern, sock->type);
1035 	if (IS_ERR(sk))
1036 		return PTR_ERR(sk);
1037 
1038 	return 0;
1039 }
1040 
1041 static int unix_release(struct socket *sock)
1042 {
1043 	struct sock *sk = sock->sk;
1044 
1045 	if (!sk)
1046 		return 0;
1047 
1048 	sk->sk_prot->close(sk, 0);
1049 	unix_release_sock(sk, 0);
1050 	sock->sk = NULL;
1051 
1052 	return 0;
1053 }
1054 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * Returns the socket found by unix_find_socket_byinode(), or an ERR_PTR():
 * the error from kern_path() or path_permission(), -ECONNREFUSED when the
 * path is not a socket inode or no socket is found for it, -EPROTOTYPE
 * when the socket's type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *found;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	/* The caller needs write permission on the socket path. */
	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto out_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto out_path;
	}

	found = unix_find_socket_byinode(inode);
	if (!found) {
		err = -ECONNREFUSED;
		goto out_path;
	}

	if (found->sk_type != type) {
		/* Wrong socket type: give back the reference we were handed. */
		sock_put(found);
		err = -EPROTOTYPE;
		goto out_path;
	}

	touch_atime(&path);
	path_put(&path);

	return found;

out_path:
	path_put(&path);
	return ERR_PTR(err);
}
1098 
/*
 * Look up a socket by abstract name in @net.
 *
 * Returns the socket from unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when there is none.  If the socket found has a
 * path dentry, its access time is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash;
	struct sock *found;

	hash = unix_abstract_hash(sunaddr, addr_len, type);

	found = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!found)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(found)->path.dentry)
		touch_atime(&unix_sk(found)->path);

	return found;
}
1117 
1118 static struct sock *unix_find_other(struct net *net,
1119 				    struct sockaddr_un *sunaddr,
1120 				    int addr_len, int type)
1121 {
1122 	struct sock *sk;
1123 
1124 	if (sunaddr->sun_path[0])
1125 		sk = unix_find_bsd(sunaddr, addr_len, type);
1126 	else
1127 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1128 
1129 	return sk;
1130 }
1131 
/*
 * Bind @sk to an automatically chosen abstract address: a leading zero
 * byte (left by kzalloc()) followed by five hex digits.
 *
 * Returns 0 when the socket is bound on exit (including the case where it
 * already had an address), -ENOMEM if the address cannot be allocated,
 * -ENOSPC if every candidate name was found in use, or the error from
 * mutex_lock_interruptible().
 */
1132 static int unix_autobind(struct sock *sk)
1133 {
1134 	unsigned int new_hash, old_hash = sk->sk_hash;
1135 	struct unix_sock *u = unix_sk(sk);
1136 	struct net *net = sock_net(sk);
1137 	struct unix_address *addr;
1138 	u32 lastnum, ordernum;
1139 	int err;
1140 
1141 	err = mutex_lock_interruptible(&u->bindlock);
1142 	if (err)
1143 		return err;
1144 
	/* Already bound: nothing to do, err is still 0 here. */
1145 	if (u->addr)
1146 		goto out;
1147 
1148 	err = -ENOMEM;
1149 	addr = kzalloc(sizeof(*addr) +
1150 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1151 	if (!addr)
1152 		goto out;
1153 
	/* Name length: the leading zero byte plus five hex digits. */
1154 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1155 	addr->name->sun_family = AF_UNIX;
1156 	refcount_set(&addr->refcnt, 1);
1157 
	/*
	 * Start from a random 20-bit value; lastnum marks the point at which
	 * the scan has wrapped all the way around.
	 */
1158 	ordernum = get_random_u32();
1159 	lastnum = ordernum & 0xFFFFF;
1160 retry:
1161 	ordernum = (ordernum + 1) & 0xFFFFF;
1162 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1163 
	/* Probe and rehash with both the old and the candidate bucket locked. */
1164 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1165 	unix_table_double_lock(net, old_hash, new_hash);
1166 
1167 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1168 		unix_table_double_unlock(net, old_hash, new_hash);
1169 
1170 		/* __unix_find_socket_byname() may take long time if many names
1171 		 * are already in use.
1172 		 */
1173 		cond_resched();
1174 
1175 		if (ordernum == lastnum) {
1176 			/* Give up if all names seems to be in use. */
1177 			err = -ENOSPC;
1178 			unix_release_addr(addr);
1179 			goto out;
1180 		}
1181 
1182 		goto retry;
1183 	}
1184 
1185 	__unix_set_addr_hash(net, sk, addr, new_hash);
1186 	unix_table_double_unlock(net, old_hash, new_hash);
1187 	err = 0;
1188 
1189 out:	mutex_unlock(&u->bindlock);
1190 	return err;
1191 }
1192 
/*
 * Bind @sk to a filesystem path: create the socket inode with vfs_mknod()
 * and then publish the address under u->bindlock.
 *
 * Returns 0 on success, -ENOMEM if the address copy cannot be allocated,
 * -EINVAL if the socket already has an address, -EADDRINUSE in place of
 * -EEXIST, and otherwise the error from kern_path_create(), the security
 * hook, vfs_mknod() or mutex_lock_interruptible().
 */
1193 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1194 			 int addr_len)
1195 {
	/* Socket inode mode: the socket inode's mode bits masked by the umask. */
1196 	umode_t mode = S_IFSOCK |
1197 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1198 	unsigned int new_hash, old_hash = sk->sk_hash;
1199 	struct unix_sock *u = unix_sk(sk);
1200 	struct net *net = sock_net(sk);
1201 	struct mnt_idmap *idmap;
1202 	struct unix_address *addr;
1203 	struct dentry *dentry;
1204 	struct path parent;
1205 	int err;
1206 
1207 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1208 	addr = unix_create_addr(sunaddr, addr_len);
1209 	if (!addr)
1210 		return -ENOMEM;
1211 
1212 	/*
1213 	 * Get the parent directory, calculate the hash for last
1214 	 * component.
1215 	 */
1216 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1217 	if (IS_ERR(dentry)) {
1218 		err = PTR_ERR(dentry);
1219 		goto out;
1220 	}
1221 
1222 	/*
1223 	 * All right, let's create it.
1224 	 */
1225 	idmap = mnt_idmap(parent.mnt);
1226 	err = security_path_mknod(&parent, dentry, mode, 0);
1227 	if (!err)
1228 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1229 	if (err)
1230 		goto out_path;
1231 	err = mutex_lock_interruptible(&u->bindlock);
1232 	if (err)
1233 		goto out_unlink;
	/* Already bound: out_unlock turns this into -EINVAL. */
1234 	if (u->addr)
1235 		goto out_unlock;
1236 
	/* Take path references and rehash with both buckets locked. */
1237 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1238 	unix_table_double_lock(net, old_hash, new_hash);
1239 	u->path.mnt = mntget(parent.mnt);
1240 	u->path.dentry = dget(dentry);
1241 	__unix_set_addr_hash(net, sk, addr, new_hash);
1242 	unix_table_double_unlock(net, old_hash, new_hash);
1243 	unix_insert_bsd_socket(sk);
1244 	mutex_unlock(&u->bindlock);
1245 	done_path_create(&parent, dentry);
1246 	return 0;
1247 
1248 out_unlock:
1249 	mutex_unlock(&u->bindlock);
1250 	err = -EINVAL;
1251 out_unlink:
1252 	/* failed after successful mknod?  unlink what we'd created... */
1253 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1254 out_path:
1255 	done_path_create(&parent, dentry);
1256 out:
1257 	unix_release_addr(addr);
	/* An existing path is reported to the caller as "address in use". */
1258 	return err == -EEXIST ? -EADDRINUSE : err;
1259 }
1260 
1261 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1262 			      int addr_len)
1263 {
1264 	unsigned int new_hash, old_hash = sk->sk_hash;
1265 	struct unix_sock *u = unix_sk(sk);
1266 	struct net *net = sock_net(sk);
1267 	struct unix_address *addr;
1268 	int err;
1269 
1270 	addr = unix_create_addr(sunaddr, addr_len);
1271 	if (!addr)
1272 		return -ENOMEM;
1273 
1274 	err = mutex_lock_interruptible(&u->bindlock);
1275 	if (err)
1276 		goto out;
1277 
1278 	if (u->addr) {
1279 		err = -EINVAL;
1280 		goto out_mutex;
1281 	}
1282 
1283 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1284 	unix_table_double_lock(net, old_hash, new_hash);
1285 
1286 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1287 		goto out_spin;
1288 
1289 	__unix_set_addr_hash(net, sk, addr, new_hash);
1290 	unix_table_double_unlock(net, old_hash, new_hash);
1291 	mutex_unlock(&u->bindlock);
1292 	return 0;
1293 
1294 out_spin:
1295 	unix_table_double_unlock(net, old_hash, new_hash);
1296 	err = -EADDRINUSE;
1297 out_mutex:
1298 	mutex_unlock(&u->bindlock);
1299 out:
1300 	unix_release_addr(addr);
1301 	return err;
1302 }
1303 
1304 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1305 {
1306 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1307 	struct sock *sk = sock->sk;
1308 	int err;
1309 
1310 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1311 	    sunaddr->sun_family == AF_UNIX)
1312 		return unix_autobind(sk);
1313 
1314 	err = unix_validate_addr(sunaddr, addr_len);
1315 	if (err)
1316 		return err;
1317 
1318 	if (sunaddr->sun_path[0])
1319 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1320 	else
1321 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1322 
1323 	return err;
1324 }
1325 
1326 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1327 {
1328 	if (unlikely(sk1 == sk2) || !sk2) {
1329 		unix_state_lock(sk1);
1330 		return;
1331 	}
1332 	if (sk1 > sk2)
1333 		swap(sk1, sk2);
1334 
1335 	unix_state_lock(sk1);
1336 	unix_state_lock_nested(sk2, U_LOCK_SECOND);
1337 }
1338 
/*
 * Counterpart of unix_state_double_lock(): always unlocks @sk1, and also
 * @sk2 when it is non-NULL and a different socket.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (sk2 && likely(sk1 != sk2))
		unix_state_unlock(sk2);
}
1348 
/*
 * connect() for datagram-style sockets.
 *
 * With an AF_UNSPEC address the current peer (if any) is dropped and the
 * socket goes back to TCP_CLOSE.  Otherwise the destination is looked up,
 * checked with unix_may_send() and the security hook, and installed as the
 * new peer, replacing any previous one.
 *
 * Returns 0 on success, -EINVAL if @alen does not cover sa_family, -EPERM
 * if unix_may_send() refuses, or the error from address validation, the
 * BPF hook, autobind, the lookup or the security hook.
 */
1349 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1350 			      int alen, int flags)
1351 {
1352 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1353 	struct sock *sk = sock->sk;
1354 	struct sock *other;
1355 	int err;
1356 
1357 	err = -EINVAL;
1358 	if (alen < offsetofend(struct sockaddr, sa_family))
1359 		goto out;
1360 
1361 	if (addr->sa_family != AF_UNSPEC) {
1362 		err = unix_validate_addr(sunaddr, alen);
1363 		if (err)
1364 			goto out;
1365 
1366 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1367 		if (err)
1368 			goto out;
1369 
		/* Credential passing is on and we have no address: autobind. */
1370 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1371 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1372 		    !unix_sk(sk)->addr) {
1373 			err = unix_autobind(sk);
1374 			if (err)
1375 				goto out;
1376 		}
1377 
1378 restart:
1379 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1380 		if (IS_ERR(other)) {
1381 			err = PTR_ERR(other);
1382 			goto out;
1383 		}
1384 
1385 		unix_state_double_lock(sk, other);
1386 
1387 		/* Apparently VFS overslept socket death. Retry. */
1388 		if (sock_flag(other, SOCK_DEAD)) {
1389 			unix_state_double_unlock(sk, other);
1390 			sock_put(other);
1391 			goto restart;
1392 		}
1393 
1394 		err = -EPERM;
1395 		if (!unix_may_send(sk, other))
1396 			goto out_unlock;
1397 
1398 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1399 		if (err)
1400 			goto out_unlock;
1401 
		/* Both ends are marked established while both locks are held. */
1402 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1403 	} else {
1404 		/*
1405 		 *	1003.1g breaking connected state with AF_UNSPEC
1406 		 */
1407 		other = NULL;
		/* With a NULL second socket only sk itself is locked. */
1408 		unix_state_double_lock(sk, other);
1409 	}
1410 
1411 	/*
1412 	 * If it was connected, reconnect.
1413 	 */
1414 	if (unix_peer(sk)) {
1415 		struct sock *old_peer = unix_peer(sk);
1416 
1417 		unix_peer(sk) = other;
1418 		if (!other)
1419 			sk->sk_state = TCP_CLOSE;
1420 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1421 
1422 		unix_state_double_unlock(sk, other);
1423 
		/* Locks are dropped before the old peer is disconnected and released. */
1424 		if (other != old_peer)
1425 			unix_dgram_disconnected(sk, old_peer);
1426 		sock_put(old_peer);
1427 	} else {
1428 		unix_peer(sk) = other;
1429 		unix_state_double_unlock(sk, other);
1430 	}
1431 
1432 	return 0;
1433 
1434 out_unlock:
1435 	unix_state_double_unlock(sk, other);
1436 	sock_put(other);
1437 out:
1438 	return err;
1439 }
1440 
1441 static long unix_wait_for_peer(struct sock *other, long timeo)
1442 	__releases(&unix_sk(other)->lock)
1443 {
1444 	struct unix_sock *u = unix_sk(other);
1445 	int sched;
1446 	DEFINE_WAIT(wait);
1447 
1448 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1449 
1450 	sched = !sock_flag(other, SOCK_DEAD) &&
1451 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1452 		unix_recvq_full_lockless(other);
1453 
1454 	unix_state_unlock(other);
1455 
1456 	if (sched)
1457 		timeo = schedule_timeout(timeo);
1458 
1459 	finish_wait(&u->peer_wait, &wait);
1460 	return timeo;
1461 }
1462 
/*
 * connect() for connection-oriented sockets.
 *
 * A new sock (newsk) and a one-byte skb allocated against it are prepared
 * up front.  On success newsk is peered with @sock's sk and the skb is
 * queued on the listener's receive queue; unix_accept() later takes the
 * new sock from skb->sk (presumably set to newsk by sock_wmalloc() -
 * confirm).
 *
 * Returns 0 on success or a negative errno, including:
 *   -ECONNREFUSED  target is not in TCP_LISTEN or has RCV_SHUTDOWN set
 *   -EAGAIN        listener queue full and the send timeout is zero
 *   -EISCONN       this socket is already TCP_ESTABLISHED
 *   -EINVAL        this socket is in any other state than TCP_CLOSE
 *   -ENOMEM        the skb could not be allocated
 * plus errors from address validation, the BPF hook, autobind,
 * unix_create1(), the lookup, an interrupted wait and the security hook.
 */
1463 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1464 			       int addr_len, int flags)
1465 {
1466 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1467 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1468 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1469 	struct net *net = sock_net(sk);
1470 	struct sk_buff *skb = NULL;
1471 	long timeo;
1472 	int err;
1473 	int st;
1474 
1475 	err = unix_validate_addr(sunaddr, addr_len);
1476 	if (err)
1477 		goto out;
1478 
1479 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1480 	if (err)
1481 		goto out;
1482 
	/* Credential passing is on and we have no address: autobind. */
1483 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1484 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1485 		err = unix_autobind(sk);
1486 		if (err)
1487 			goto out;
1488 	}
1489 
1490 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1491 
1492 	/* First of all allocate resources.
1493 	   If we will make it after state is locked,
1494 	   we will have to recheck all again in any case.
1495 	 */
1496 
1497 	/* create new sock for complete connection */
1498 	newsk = unix_create1(net, NULL, 0, sock->type);
1499 	if (IS_ERR(newsk)) {
1500 		err = PTR_ERR(newsk);
		/* Reset so the common exit path does not release an ERR_PTR. */
1501 		newsk = NULL;
1502 		goto out;
1503 	}
1504 
1505 	err = -ENOMEM;
1506 
1507 	/* Allocate skb for sending to listening sock */
1508 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1509 	if (skb == NULL)
1510 		goto out;
1511 
1512 restart:
1513 	/*  Find listening sock. */
1514 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1515 	if (IS_ERR(other)) {
1516 		err = PTR_ERR(other);
1517 		other = NULL;
1518 		goto out;
1519 	}
1520 
1521 	/* Latch state of peer */
1522 	unix_state_lock(other);
1523 
1524 	/* Apparently VFS overslept socket death. Retry. */
1525 	if (sock_flag(other, SOCK_DEAD)) {
1526 		unix_state_unlock(other);
1527 		sock_put(other);
1528 		goto restart;
1529 	}
1530 
1531 	err = -ECONNREFUSED;
1532 	if (other->sk_state != TCP_LISTEN)
1533 		goto out_unlock;
1534 	if (other->sk_shutdown & RCV_SHUTDOWN)
1535 		goto out_unlock;
1536 
1537 	if (unix_recvq_full(other)) {
1538 		err = -EAGAIN;
1539 		if (!timeo)
1540 			goto out_unlock;
1541 
		/* unix_wait_for_peer() drops other's state lock. */
1542 		timeo = unix_wait_for_peer(other, timeo);
1543 
1544 		err = sock_intr_errno(timeo);
1545 		if (signal_pending(current))
1546 			goto out;
1547 		sock_put(other);
1548 		goto restart;
1549 	}
1550 
1551 	/* Latch our state.
1552 
1553 	   It is tricky place. We need to grab our state lock and cannot
1554 	   drop lock on peer. It is dangerous because deadlock is
1555 	   possible. Connect to self case and simultaneous
1556 	   attempt to connect are eliminated by checking socket
1557 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1558 	   check this before attempt to grab lock.
1559 
1560 	   Well, and we have to recheck the state after socket locked.
1561 	 */
1562 	st = sk->sk_state;
1563 
1564 	switch (st) {
1565 	case TCP_CLOSE:
1566 		/* This is ok... continue with connect */
1567 		break;
1568 	case TCP_ESTABLISHED:
1569 		/* Socket is already connected */
1570 		err = -EISCONN;
1571 		goto out_unlock;
1572 	default:
1573 		err = -EINVAL;
1574 		goto out_unlock;
1575 	}
1576 
1577 	unix_state_lock_nested(sk, U_LOCK_SECOND);
1578 
	/* State changed between the unlocked read and taking the lock: retry. */
1579 	if (sk->sk_state != st) {
1580 		unix_state_unlock(sk);
1581 		unix_state_unlock(other);
1582 		sock_put(other);
1583 		goto restart;
1584 	}
1585 
1586 	err = security_unix_stream_connect(sk, other, newsk);
1587 	if (err) {
1588 		unix_state_unlock(sk);
1589 		goto out_unlock;
1590 	}
1591 
1592 	/* The way is open! Fastly set all the necessary fields... */
1593 
1594 	sock_hold(sk);
1595 	unix_peer(newsk)	= sk;
1596 	newsk->sk_state		= TCP_ESTABLISHED;
1597 	newsk->sk_type		= sk->sk_type;
1598 	init_peercred(newsk);
1599 	newu = unix_sk(newsk);
1600 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1601 	otheru = unix_sk(other);
1602 
1603 	/* copy address information from listening to new sock
1604 	 *
1605 	 * The contents of *(otheru->addr) and otheru->path
1606 	 * are seen fully set up here, since we have found
1607 	 * otheru in hash under its lock.  Insertion into the
1608 	 * hash chain we'd found it in had been done in an
1609 	 * earlier critical area protected by the chain's lock,
1610 	 * the same one where we'd set *(otheru->addr) contents,
1611 	 * as well as otheru->path and otheru->addr itself.
1612 	 *
1613 	 * Using smp_store_release() here to set newu->addr
1614 	 * is enough to make those stores, as well as stores
1615 	 * to newu->path visible to anyone who gets newu->addr
1616 	 * by smp_load_acquire().  IOW, the same warranties
1617 	 * as for unix_sock instances bound in unix_bind() or
1618 	 * in unix_autobind().
1619 	 */
1620 	if (otheru->path.dentry) {
1621 		path_get(&otheru->path);
1622 		newu->path = otheru->path;
1623 	}
1624 	refcount_inc(&otheru->addr->refcnt);
1625 	smp_store_release(&newu->addr, otheru->addr);
1626 
1627 	/* Set credentials */
1628 	copy_peercred(sk, other);
1629 
1630 	sock->state	= SS_CONNECTED;
1631 	sk->sk_state	= TCP_ESTABLISHED;
1632 	sock_hold(newsk);
1633 
1634 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1635 	unix_peer(sk)	= newsk;
1636 
1637 	unix_state_unlock(sk);
1638 
1639 	/* take ten and send info to listening sock */
1640 	spin_lock(&other->sk_receive_queue.lock);
1641 	__skb_queue_tail(&other->sk_receive_queue, skb);
1642 	spin_unlock(&other->sk_receive_queue.lock);
1643 	unix_state_unlock(other);
1644 	other->sk_data_ready(other);
1645 	sock_put(other);
1646 	return 0;
1647 
1648 out_unlock:
1649 	if (other)
1650 		unix_state_unlock(other);
1651 
1652 out:
	/* Common failure exit: skb, newsk and other may each still be NULL. */
1653 	kfree_skb(skb);
1654 	if (newsk)
1655 		unix_release_sock(newsk, 0);
1656 	if (other)
1657 		sock_put(other);
1658 	return err;
1659 }
1660 
1661 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1662 {
1663 	struct sock *ska = socka->sk, *skb = sockb->sk;
1664 
1665 	/* Join our sockets back to back */
1666 	sock_hold(ska);
1667 	sock_hold(skb);
1668 	unix_peer(ska) = skb;
1669 	unix_peer(skb) = ska;
1670 	init_peercred(ska);
1671 	init_peercred(skb);
1672 
1673 	ska->sk_state = TCP_ESTABLISHED;
1674 	skb->sk_state = TCP_ESTABLISHED;
1675 	socka->state  = SS_CONNECTED;
1676 	sockb->state  = SS_CONNECTED;
1677 	return 0;
1678 }
1679 
1680 static void unix_sock_inherit_flags(const struct socket *old,
1681 				    struct socket *new)
1682 {
1683 	if (test_bit(SOCK_PASSCRED, &old->flags))
1684 		set_bit(SOCK_PASSCRED, &new->flags);
1685 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1686 		set_bit(SOCK_PASSPIDFD, &new->flags);
1687 	if (test_bit(SOCK_PASSSEC, &old->flags))
1688 		set_bit(SOCK_PASSSEC, &new->flags);
1689 }
1690 
1691 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1692 		       bool kern)
1693 {
1694 	struct sock *sk = sock->sk;
1695 	struct sock *tsk;
1696 	struct sk_buff *skb;
1697 	int err;
1698 
1699 	err = -EOPNOTSUPP;
1700 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1701 		goto out;
1702 
1703 	err = -EINVAL;
1704 	if (sk->sk_state != TCP_LISTEN)
1705 		goto out;
1706 
1707 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1708 	 * so that no locks are necessary.
1709 	 */
1710 
1711 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1712 				&err);
1713 	if (!skb) {
1714 		/* This means receive shutdown. */
1715 		if (err == 0)
1716 			err = -EINVAL;
1717 		goto out;
1718 	}
1719 
1720 	tsk = skb->sk;
1721 	skb_free_datagram(sk, skb);
1722 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1723 
1724 	/* attach accepted sock to socket */
1725 	unix_state_lock(tsk);
1726 	newsock->state = SS_CONNECTED;
1727 	unix_sock_inherit_flags(sock, newsock);
1728 	sock_graft(tsk, newsock);
1729 	unix_state_unlock(tsk);
1730 	return 0;
1731 
1732 out:
1733 	return err;
1734 }
1735 
1736 
1737 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1738 {
1739 	struct sock *sk = sock->sk;
1740 	struct unix_address *addr;
1741 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1742 	int err = 0;
1743 
1744 	if (peer) {
1745 		sk = unix_peer_get(sk);
1746 
1747 		err = -ENOTCONN;
1748 		if (!sk)
1749 			goto out;
1750 		err = 0;
1751 	} else {
1752 		sock_hold(sk);
1753 	}
1754 
1755 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1756 	if (!addr) {
1757 		sunaddr->sun_family = AF_UNIX;
1758 		sunaddr->sun_path[0] = 0;
1759 		err = offsetof(struct sockaddr_un, sun_path);
1760 	} else {
1761 		err = addr->len;
1762 		memcpy(sunaddr, addr->name, addr->len);
1763 
1764 		if (peer)
1765 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1766 					       CGROUP_UNIX_GETPEERNAME);
1767 		else
1768 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1769 					       CGROUP_UNIX_GETSOCKNAME);
1770 	}
1771 	sock_put(sk);
1772 out:
1773 	return err;
1774 }
1775 
1776 /* The "user->unix_inflight" variable is protected by the garbage
1777  * collection lock, and we just read it locklessly here. If you go
1778  * over the limit, there might be a tiny race in actually noticing
1779  * it across threads. Tough.
1780  */
1781 static inline bool too_many_unix_fds(struct task_struct *p)
1782 {
1783 	struct user_struct *user = current_user();
1784 
1785 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1786 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1787 	return false;
1788 }
1789 
1790 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1791 {
1792 	int i;
1793 
1794 	if (too_many_unix_fds(current))
1795 		return -ETOOMANYREFS;
1796 
1797 	/* Need to duplicate file references for the sake of garbage
1798 	 * collection.  Otherwise a socket in the fps might become a
1799 	 * candidate for GC while the skb is not yet queued.
1800 	 */
1801 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1802 	if (!UNIXCB(skb).fp)
1803 		return -ENOMEM;
1804 
1805 	for (i = scm->fp->count - 1; i >= 0; i--)
1806 		unix_inflight(scm->fp->user, scm->fp->fp[i]);
1807 
1808 	return 0;
1809 }
1810 
1811 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1812 {
1813 	int i;
1814 
1815 	scm->fp = UNIXCB(skb).fp;
1816 	UNIXCB(skb).fp = NULL;
1817 
1818 	for (i = scm->fp->count - 1; i >= 0; i--)
1819 		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
1820 }
1821 
/*
 * Give @scm a duplicate of the fd list carried by @skb (the skb keeps its
 * own list), then take and release unix_gc_lock once as a barrier against
 * the garbage collector; the reasoning is spelled out in the body.
 */
1822 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1823 {
1824 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1825 
1826 	/*
1827 	 * Garbage collection of unix sockets starts by selecting a set of
1828 	 * candidate sockets which have reference only from being in flight
1829 	 * (total_refs == inflight_refs).  This condition is checked once during
1830 	 * the candidate collection phase, and candidates are marked as such, so
1831 	 * that non-candidates can later be ignored.  While inflight_refs is
1832 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1833 	 * is an instantaneous decision.
1834 	 *
1835 	 * Once a candidate, however, the socket must not be reinstalled into a
1836 	 * file descriptor while the garbage collection is in progress.
1837 	 *
1838 	 * If the above conditions are met, then the directed graph of
1839 	 * candidates (*) does not change while unix_gc_lock is held.
1840 	 *
1841 	 * Any operation that changes the file count through file descriptors
1842 	 * (dup, close, sendmsg) does not change the graph since candidates are
1843 	 * not installed in fds.
1844 	 *
1845 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1846 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1847 	 * serialized with garbage collection.
1848 	 *
1849 	 * MSG_PEEK is special in that it does not change the inflight count,
1850 	 * yet does install the socket into an fd.  The following lock/unlock
1851 	 * pair is to ensure serialization with garbage collection.  It must be
1852 	 * done between incrementing the file count and installing the file into
1853 	 * an fd.
1854 	 *
1855 	 * If garbage collection starts after the barrier provided by the
1856 	 * lock/unlock, then it will see the elevated refcount and not mark this
1857 	 * as a candidate.  If a garbage collection is already in progress
1858 	 * before the file count was incremented, then the lock/unlock pair will
1859 	 * ensure that garbage collection is finished before progressing to
1860 	 * installing the fd.
1861 	 *
1862 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1863 	 * which is on the queue of listening socket A.
1864 	 */
1865 	spin_lock(&unix_gc_lock);
1866 	spin_unlock(&unix_gc_lock);
1867 }
1868 
1869 static void unix_destruct_scm(struct sk_buff *skb)
1870 {
1871 	struct scm_cookie scm;
1872 
1873 	memset(&scm, 0, sizeof(scm));
1874 	scm.pid  = UNIXCB(skb).pid;
1875 	if (UNIXCB(skb).fp)
1876 		unix_detach_fds(&scm, skb);
1877 
1878 	/* Alas, it calls VFS */
1879 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1880 	scm_destroy(&scm);
1881 	sock_wfree(skb);
1882 }
1883 
1884 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1885 {
1886 	int err = 0;
1887 
1888 	UNIXCB(skb).pid  = get_pid(scm->pid);
1889 	UNIXCB(skb).uid = scm->creds.uid;
1890 	UNIXCB(skb).gid = scm->creds.gid;
1891 	UNIXCB(skb).fp = NULL;
1892 	unix_get_secdata(scm, skb);
1893 	if (scm->fp && send_fds)
1894 		err = unix_attach_fds(scm, skb);
1895 
1896 	skb->destructor = unix_destruct_scm;
1897 	return err;
1898 }
1899 
1900 static bool unix_passcred_enabled(const struct socket *sock,
1901 				  const struct sock *other)
1902 {
1903 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1904 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1905 	       !other->sk_socket ||
1906 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1907 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1908 }
1909 
1910 /*
1911  * Some apps rely on write() giving SCM_CREDENTIALS
1912  * We include credentials if source or destination socket
1913  * asserted SOCK_PASSCRED.
1914  */
1915 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1916 			    const struct sock *other)
1917 {
1918 	if (UNIXCB(skb).pid)
1919 		return;
1920 	if (unix_passcred_enabled(sock, other)) {
1921 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1922 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1923 	}
1924 }
1925 
1926 static bool unix_skb_scm_eq(struct sk_buff *skb,
1927 			    struct scm_cookie *scm)
1928 {
1929 	return UNIXCB(skb).pid == scm->pid &&
1930 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1931 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1932 	       unix_secdata_eq(scm, skb);
1933 }
1934 
1935 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1936 {
1937 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1938 	struct unix_sock *u = unix_sk(sk);
1939 
1940 	if (unlikely(fp && fp->count))
1941 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1942 }
1943 
1944 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1945 {
1946 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1947 	struct unix_sock *u = unix_sk(sk);
1948 
1949 	if (unlikely(fp && fp->count))
1950 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1951 }
1952 
1953 /*
1954  *	Send AF_UNIX data.
 *
 *	sendmsg() for datagram-style sockets.  The destination is the
 *	address in @msg when one is given, otherwise the connected peer.
 *
 *	Returns @len on success (also when sk_filter() rejects the packet,
 *	which is dropped silently), or a negative errno, including:
 *	  -EOPNOTSUPP  MSG_OOB was requested
 *	  -ENOTCONN    no address given and no peer
 *	  -EMSGSIZE    @len exceeds sk_sndbuf - 32
 *	  -EPERM       unix_may_send() refused
 *	  -EPIPE       receiver has RCV_SHUTDOWN, or a SOCK_SEQPACKET
 *	               destination was found dead
 *	  -ECONNREFUSED  the connected peer was found dead
 *	  -ECONNRESET  peer vanished and no address to look up again
 *	  -EAGAIN      receiver queue full and the send timeout is zero
1955  */
1956 
1957 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1958 			      size_t len)
1959 {
1960 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1961 	struct sock *sk = sock->sk, *other = NULL;
1962 	struct unix_sock *u = unix_sk(sk);
1963 	struct scm_cookie scm;
1964 	struct sk_buff *skb;
1965 	int data_len = 0;
1966 	int sk_locked;
1967 	long timeo;
1968 	int err;
1969 
1970 	err = scm_send(sock, msg, &scm, false);
1971 	if (err < 0)
1972 		return err;
1973 
1974 	wait_for_unix_gc(scm.fp);
1975 
1976 	err = -EOPNOTSUPP;
1977 	if (msg->msg_flags&MSG_OOB)
1978 		goto out;
1979 
1980 	if (msg->msg_namelen) {
		/* Explicit destination: validate it and run the BPF hook. */
1981 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1982 		if (err)
1983 			goto out;
1984 
1985 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1986 							    msg->msg_name,
1987 							    &msg->msg_namelen,
1988 							    NULL);
1989 		if (err)
1990 			goto out;
1991 	} else {
		/* No address: use the connected peer, taking a reference. */
1992 		sunaddr = NULL;
1993 		err = -ENOTCONN;
1994 		other = unix_peer_get(sk);
1995 		if (!other)
1996 			goto out;
1997 	}
1998 
	/* Credential passing is on and we have no address: autobind. */
1999 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2000 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
2001 		err = unix_autobind(sk);
2002 		if (err)
2003 			goto out;
2004 	}
2005 
2006 	err = -EMSGSIZE;
2007 	if (len > sk->sk_sndbuf - 32)
2008 		goto out;
2009 
	/*
	 * Beyond SKB_MAX_ALLOC the excess is carried as paged data, capped
	 * at MAX_SKB_FRAGS pages and rounded up to whole pages.
	 */
2010 	if (len > SKB_MAX_ALLOC) {
2011 		data_len = min_t(size_t,
2012 				 len - SKB_MAX_ALLOC,
2013 				 MAX_SKB_FRAGS * PAGE_SIZE);
2014 		data_len = PAGE_ALIGN(data_len);
2015 
2016 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2017 	}
2018 
2019 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2020 				   msg->msg_flags & MSG_DONTWAIT, &err,
2021 				   PAGE_ALLOC_COSTLY_ORDER);
2022 	if (skb == NULL)
2023 		goto out;
2024 
2025 	err = unix_scm_to_skb(&scm, skb, true);
2026 	if (err < 0)
2027 		goto out_free;
2028 
2029 	skb_put(skb, len - data_len);
2030 	skb->data_len = data_len;
2031 	skb->len = len;
2032 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2033 	if (err)
2034 		goto out_free;
2035 
2036 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2037 
2038 restart:
	/* No peer reference held here: look the destination up by address. */
2039 	if (!other) {
2040 		err = -ECONNRESET;
2041 		if (sunaddr == NULL)
2042 			goto out_free;
2043 
2044 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2045 					sk->sk_type);
2046 		if (IS_ERR(other)) {
2047 			err = PTR_ERR(other);
2048 			other = NULL;
2049 			goto out_free;
2050 		}
2051 	}
2052 
2053 	if (sk_filter(other, skb) < 0) {
2054 		/* Toss the packet but do not return any error to the sender */
2055 		err = len;
2056 		goto out_free;
2057 	}
2058 
	/* sk_locked records whether sk's state lock is held as well. */
2059 	sk_locked = 0;
2060 	unix_state_lock(other);
2061 restart_locked:
2062 	err = -EPERM;
2063 	if (!unix_may_send(sk, other))
2064 		goto out_unlock;
2065 
2066 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2067 		/*
2068 		 *	Check with 1003.1g - what should
2069 		 *	datagram error
2070 		 */
2071 		unix_state_unlock(other);
2072 		sock_put(other);
2073 
2074 		if (!sk_locked)
2075 			unix_state_lock(sk);
2076 
2077 		err = 0;
2078 		if (sk->sk_type == SOCK_SEQPACKET) {
2079 			/* We are here only when racing with unix_release_sock()
2080 			 * is clearing @other. Never change state to TCP_CLOSE
2081 			 * unlike SOCK_DGRAM wants.
2082 			 */
2083 			unix_state_unlock(sk);
2084 			err = -EPIPE;
2085 		} else if (unix_peer(sk) == other) {
			/* The dead socket was our connected peer: disconnect. */
2086 			unix_peer(sk) = NULL;
2087 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2088 
2089 			sk->sk_state = TCP_CLOSE;
2090 			unix_state_unlock(sk);
2091 
2092 			unix_dgram_disconnected(sk, other);
2093 			sock_put(other);
2094 			err = -ECONNREFUSED;
2095 		} else {
2096 			unix_state_unlock(sk);
2097 		}
2098 
		/* Without an error so far, retry the lookup from the address. */
2099 		other = NULL;
2100 		if (err)
2101 			goto out_free;
2102 		goto restart;
2103 	}
2104 
2105 	err = -EPIPE;
2106 	if (other->sk_shutdown & RCV_SHUTDOWN)
2107 		goto out_unlock;
2108 
2109 	if (sk->sk_type != SOCK_SEQPACKET) {
2110 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2111 		if (err)
2112 			goto out_unlock;
2113 	}
2114 
2115 	/* other == sk && unix_peer(other) != sk if
2116 	 * - unix_peer(sk) == NULL, destination address bound to sk
2117 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2118 	 */
2119 	if (other != sk &&
2120 	    unlikely(unix_peer(other) != sk &&
2121 	    unix_recvq_full_lockless(other))) {
2122 		if (timeo) {
			/* unix_wait_for_peer() drops other's state lock. */
2123 			timeo = unix_wait_for_peer(other, timeo);
2124 
2125 			err = sock_intr_errno(timeo);
2126 			if (signal_pending(current))
2127 				goto out_free;
2128 
2129 			goto restart;
2130 		}
2131 
		/* Non-blocking: retake the locks so that sk is locked too. */
2132 		if (!sk_locked) {
2133 			unix_state_unlock(other);
2134 			unix_state_double_lock(sk, other);
2135 		}
2136 
2137 		if (unix_peer(sk) != other ||
2138 		    unix_dgram_peer_wake_me(sk, other)) {
2139 			err = -EAGAIN;
2140 			sk_locked = 1;
2141 			goto out_unlock;
2142 		}
2143 
		/* Redo the checks now that both locks are held. */
2144 		if (!sk_locked) {
2145 			sk_locked = 1;
2146 			goto restart_locked;
2147 		}
2148 	}
2149 
2150 	if (unlikely(sk_locked))
2151 		unix_state_unlock(sk);
2152 
2153 	if (sock_flag(other, SOCK_RCVTSTAMP))
2154 		__net_timestamp(skb);
2155 	maybe_add_creds(skb, sock, other);
2156 	scm_stat_add(other, skb);
2157 	skb_queue_tail(&other->sk_receive_queue, skb);
2158 	unix_state_unlock(other);
2159 	other->sk_data_ready(other);
2160 	sock_put(other);
2161 	scm_destroy(&scm);
2162 	return len;
2163 
2164 out_unlock:
2165 	if (sk_locked)
2166 		unix_state_unlock(sk);
2167 	unix_state_unlock(other);
2168 out_free:
2169 	kfree_skb(skb);
2170 out:
2171 	if (other)
2172 		sock_put(other);
2173 	scm_destroy(&scm);
2174 	return err;
2175 }
2176 
2177 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2178  * bytes, and a minimum of a full page.
2179  */
/* PAGE_SIZE << get_order(32768) expresses that limit as a whole number of
 * pages. unix_stream_sendmsg() adds it to SKB_MAX_HEAD(0) to bound the size
 * of each skb and passes get_order() of it as the last argument of
 * sock_alloc_send_pskb().
 */
2180 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2181 
2182 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * queue_oob - queue the single out-of-band byte of a MSG_OOB stream send.
 * @sock:     sending socket
 * @msg:      message whose iterator supplies the byte
 * @other:    receiving peer socket
 * @scm:      control data to attach to the skb
 * @fds_sent: true if an earlier skb of this send already carried the fds
 *
 * Returns 0 on success. Otherwise returns the error reported by the skb
 * allocation, by unix_scm_to_skb() or by the copy from @msg, or -EPIPE if
 * @other is dead or has RCV_SHUTDOWN set.
 */
2183 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2184 		     struct scm_cookie *scm, bool fds_sent)
2185 {
2186 	struct unix_sock *ousk = unix_sk(other);
2187 	struct sk_buff *skb;
2188 	int err = 0;
2189 
	/* One byte of payload, charged to the sending socket. */
2190 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2191 
2192 	if (!skb)
2193 		return err;
2194 
2195 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2196 	if (err < 0) {
2197 		kfree_skb(skb);
2198 		return err;
2199 	}
2200 	skb_put(skb, 1);
2201 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2202 
2203 	if (err) {
2204 		kfree_skb(skb);
2205 		return err;
2206 	}
2207 
2208 	unix_state_lock(other);
2209 
2210 	if (sock_flag(other, SOCK_DEAD) ||
2211 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2212 		unix_state_unlock(other);
2213 		kfree_skb(skb);
2214 		return -EPIPE;
2215 	}
2216 
2217 	maybe_add_creds(skb, sock, other);
	/* Second reference: the skb is reachable both from the receive
	 * queue and through ousk->oob_skb.
	 */
2218 	skb_get(skb);
2219 
	/* A newer OOB byte replaces the previous one; release the
	 * reference that went with the old pointer.
	 */
2220 	if (ousk->oob_skb)
2221 		consume_skb(ousk->oob_skb);
2222 
2223 	WRITE_ONCE(ousk->oob_skb, skb);
2224 
2225 	scm_stat_add(other, skb);
2226 	skb_queue_tail(&other->sk_receive_queue, skb);
2227 	sk_send_sigurg(other);
2228 	unix_state_unlock(other);
2229 	other->sk_data_ready(other);
2230 
	/* err is 0 here: the copy above succeeded. */
2231 	return err;
2232 }
2233 #endif
2234 
2235 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2236 			       size_t len)
2237 {
2238 	struct sock *sk = sock->sk;
2239 	struct sock *other = NULL;
2240 	int err, size;
2241 	struct sk_buff *skb;
2242 	int sent = 0;
2243 	struct scm_cookie scm;
2244 	bool fds_sent = false;
2245 	int data_len;
2246 
2247 	err = scm_send(sock, msg, &scm, false);
2248 	if (err < 0)
2249 		return err;
2250 
2251 	wait_for_unix_gc(scm.fp);
2252 
2253 	err = -EOPNOTSUPP;
2254 	if (msg->msg_flags & MSG_OOB) {
2255 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2256 		if (len)
2257 			len--;
2258 		else
2259 #endif
2260 			goto out_err;
2261 	}
2262 
2263 	if (msg->msg_namelen) {
2264 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2265 		goto out_err;
2266 	} else {
2267 		err = -ENOTCONN;
2268 		other = unix_peer(sk);
2269 		if (!other)
2270 			goto out_err;
2271 	}
2272 
2273 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2274 		goto pipe_err;
2275 
2276 	while (sent < len) {
2277 		size = len - sent;
2278 
2279 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2280 			skb = sock_alloc_send_pskb(sk, 0, 0,
2281 						   msg->msg_flags & MSG_DONTWAIT,
2282 						   &err, 0);
2283 		} else {
2284 			/* Keep two messages in the pipe so it schedules better */
2285 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2286 
2287 			/* allow fallback to order-0 allocations */
2288 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2289 
2290 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2291 
2292 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2293 
2294 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2295 						   msg->msg_flags & MSG_DONTWAIT, &err,
2296 						   get_order(UNIX_SKB_FRAGS_SZ));
2297 		}
2298 		if (!skb)
2299 			goto out_err;
2300 
2301 		/* Only send the fds in the first buffer */
2302 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2303 		if (err < 0) {
2304 			kfree_skb(skb);
2305 			goto out_err;
2306 		}
2307 		fds_sent = true;
2308 
2309 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2310 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2311 						   sk->sk_allocation);
2312 			if (err < 0) {
2313 				kfree_skb(skb);
2314 				goto out_err;
2315 			}
2316 			size = err;
2317 			refcount_add(size, &sk->sk_wmem_alloc);
2318 		} else {
2319 			skb_put(skb, size - data_len);
2320 			skb->data_len = data_len;
2321 			skb->len = size;
2322 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2323 			if (err) {
2324 				kfree_skb(skb);
2325 				goto out_err;
2326 			}
2327 		}
2328 
2329 		unix_state_lock(other);
2330 
2331 		if (sock_flag(other, SOCK_DEAD) ||
2332 		    (other->sk_shutdown & RCV_SHUTDOWN))
2333 			goto pipe_err_free;
2334 
2335 		maybe_add_creds(skb, sock, other);
2336 		scm_stat_add(other, skb);
2337 		skb_queue_tail(&other->sk_receive_queue, skb);
2338 		unix_state_unlock(other);
2339 		other->sk_data_ready(other);
2340 		sent += size;
2341 	}
2342 
2343 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2344 	if (msg->msg_flags & MSG_OOB) {
2345 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2346 		if (err)
2347 			goto out_err;
2348 		sent++;
2349 	}
2350 #endif
2351 
2352 	scm_destroy(&scm);
2353 
2354 	return sent;
2355 
2356 pipe_err_free:
2357 	unix_state_unlock(other);
2358 	kfree_skb(skb);
2359 pipe_err:
2360 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2361 		send_sig(SIGPIPE, current, 0);
2362 	err = -EPIPE;
2363 out_err:
2364 	scm_destroy(&scm);
2365 	return sent ? : err;
2366 }
2367 
2368 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2369 				  size_t len)
2370 {
2371 	int err;
2372 	struct sock *sk = sock->sk;
2373 
2374 	err = sock_error(sk);
2375 	if (err)
2376 		return err;
2377 
2378 	if (sk->sk_state != TCP_ESTABLISHED)
2379 		return -ENOTCONN;
2380 
2381 	if (msg->msg_namelen)
2382 		msg->msg_namelen = 0;
2383 
2384 	return unix_dgram_sendmsg(sock, msg, len);
2385 }
2386 
2387 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2388 				  size_t size, int flags)
2389 {
2390 	struct sock *sk = sock->sk;
2391 
2392 	if (sk->sk_state != TCP_ESTABLISHED)
2393 		return -ENOTCONN;
2394 
2395 	return unix_dgram_recvmsg(sock, msg, size, flags);
2396 }
2397 
2398 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2399 {
2400 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2401 
2402 	if (addr) {
2403 		msg->msg_namelen = addr->len;
2404 		memcpy(msg->msg_name, addr->name, addr->len);
2405 	}
2406 }
2407 
/*
 * __unix_dgram_recvmsg - receive (or peek at) one datagram queued on @sk.
 * @sk:    receiving socket
 * @msg:   destination; if msg_name is set it receives the sender's address
 * @size:  capacity of the destination in bytes
 * @flags: MSG_* flags; MSG_OOB is rejected with -EOPNOTSUPP
 *
 * Returns the number of bytes copied, or the full remaining datagram length
 * when MSG_TRUNC is set in @flags. Returns a negative errno on failure, and
 * 0 for a SOCK_SEQPACKET socket that would have returned -EAGAIN but has
 * RCV_SHUTDOWN set.
 */
2408 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2409 			 int flags)
2410 {
2411 	struct scm_cookie scm;
2412 	struct socket *sock = sk->sk_socket;
2413 	struct unix_sock *u = unix_sk(sk);
2414 	struct sk_buff *skb, *last;
2415 	long timeo;
2416 	int skip;
2417 	int err;
2418 
2419 	err = -EOPNOTSUPP;
2420 	if (flags&MSG_OOB)
2421 		goto out;
2422 
2423 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2424 
	/* Try to take a datagram under iolock. On success the loop is left
	 * with iolock still held; otherwise the mutex is dropped before
	 * sleeping for more packets.
	 */
2425 	do {
2426 		mutex_lock(&u->iolock);
2427 
2428 		skip = sk_peek_offset(sk, flags);
2429 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2430 					      &skip, &err, &last);
2431 		if (skb) {
2432 			if (!(flags & MSG_PEEK))
2433 				scm_stat_del(sk, skb);
2434 			break;
2435 		}
2436 
2437 		mutex_unlock(&u->iolock);
2438 
2439 		if (err != -EAGAIN)
2440 			break;
2441 	} while (timeo &&
2442 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2443 					      &err, &timeo, last));
2444 
2445 	if (!skb) { /* implies iolock unlocked */
2446 		unix_state_lock(sk);
2447 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2448 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2449 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2450 			err = 0;
2451 		unix_state_unlock(sk);
2452 		goto out;
2453 	}
2454 
	/* Notify anyone sleeping on peer_wait with write-readiness events. */
2455 	if (wq_has_sleeper(&u->peer_wait))
2456 		wake_up_interruptible_sync_poll(&u->peer_wait,
2457 						EPOLLOUT | EPOLLWRNORM |
2458 						EPOLLWRBAND);
2459 
2460 	if (msg->msg_name) {
2461 		unix_copy_addr(msg, skb->sk);
2462 
2463 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2464 						      msg->msg_name,
2465 						      &msg->msg_namelen);
2466 	}
2467 
	/* Clamp to what is left of the datagram past the peek offset; a
	 * smaller buffer is reported to the caller via MSG_TRUNC.
	 */
2468 	if (size > skb->len - skip)
2469 		size = skb->len - skip;
2470 	else if (size < skb->len - skip)
2471 		msg->msg_flags |= MSG_TRUNC;
2472 
2473 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2474 	if (err)
2475 		goto out_free;
2476 
2477 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2478 		__sock_recv_timestamp(msg, sk, skb);
2479 
2480 	memset(&scm, 0, sizeof(scm));
2481 
2482 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2483 	unix_set_secdata(&scm, skb);
2484 
2485 	if (!(flags & MSG_PEEK)) {
2486 		if (UNIXCB(skb).fp)
2487 			unix_detach_fds(&scm, skb);
2488 
2489 		sk_peek_offset_bwd(sk, skb->len);
2490 	} else {
2491 		/* It is questionable: on PEEK we could:
2492 		   - do not return fds - good, but too simple 8)
2493 		   - return fds, and do not return them on read (old strategy,
2494 		     apparently wrong)
2495 		   - clone fds (I chose it for now, it is the most universal
2496 		     solution)
2497 
2498 		   POSIX 1003.1g does not actually define this clearly
2499 		   at all. POSIX 1003.1g doesn't define a lot of things
2500 		   clearly however!
2501 
2502 		*/
2503 
2504 		sk_peek_offset_fwd(sk, size);
2505 
2506 		if (UNIXCB(skb).fp)
2507 			unix_peek_fds(&scm, skb);
2508 	}
	/* With MSG_TRUNC in @flags report the real length, not the copied one. */
2509 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2510 
2511 	scm_recv_unix(sock, msg, &scm, flags);
2512 
2513 out_free:
2514 	skb_free_datagram(sk, skb);
2515 	mutex_unlock(&u->iolock);
2516 out:
2517 	return err;
2518 }
2519 
2520 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2521 			      int flags)
2522 {
2523 	struct sock *sk = sock->sk;
2524 
2525 #ifdef CONFIG_BPF_SYSCALL
2526 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2527 
2528 	if (prot != &unix_dgram_proto)
2529 		return prot->recvmsg(sk, msg, size, flags, NULL);
2530 #endif
2531 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2532 }
2533 
2534 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2535 {
2536 	struct unix_sock *u = unix_sk(sk);
2537 	struct sk_buff *skb;
2538 	int err;
2539 
2540 	mutex_lock(&u->iolock);
2541 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2542 	mutex_unlock(&u->iolock);
2543 	if (!skb)
2544 		return err;
2545 
2546 	return recv_actor(sk, skb);
2547 }
2548 
2549 /*
2550  *	Sleep until more data has arrived. But check for races..
2551  */
/*
 * @sk:        socket whose receive queue is watched
 * @timeo:     remaining timeout, as taken by schedule_timeout()
 * @last:      tail skb the caller last saw (NULL if the queue was empty)
 * @last_len:  length of @last when the caller looked at it
 * @freezable: if true, TASK_FREEZABLE is added to the sleep state
 *
 * Returns the timeout that is left when the wait ends.
 */
2552 static long unix_stream_data_wait(struct sock *sk, long timeo,
2553 				  struct sk_buff *last, unsigned int last_len,
2554 				  bool freezable)
2555 {
2556 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2557 	struct sk_buff *tail;
2558 	DEFINE_WAIT(wait);
2559 
2560 	unix_state_lock(sk);
2561 
2562 	for (;;) {
2563 		prepare_to_wait(sk_sleep(sk), &wait, state);
2564 
		/* Stop once the tail skb changed or changed length, an error
		 * or RCV_SHUTDOWN is set, a signal is pending, or the
		 * timeout is used up.
		 */
2565 		tail = skb_peek_tail(&sk->sk_receive_queue);
2566 		if (tail != last ||
2567 		    (tail && tail->len != last_len) ||
2568 		    sk->sk_err ||
2569 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2570 		    signal_pending(current) ||
2571 		    !timeo)
2572 			break;
2573 
		/* The state lock is not held across the sleep. */
2574 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2575 		unix_state_unlock(sk);
2576 		timeo = schedule_timeout(timeo);
2577 		unix_state_lock(sk);
2578 
2579 		if (sock_flag(sk, SOCK_DEAD))
2580 			break;
2581 
2582 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2583 	}
2584 
2585 	finish_wait(sk_sleep(sk), &wait);
2586 	unix_state_unlock(sk);
2587 	return timeo;
2588 }
2589 
2590 static unsigned int unix_skb_len(const struct sk_buff *skb)
2591 {
2592 	return skb->len - UNIXCB(skb).consumed;
2593 }
2594 
/*
 * Parameters of one stream read, shared by the recvmsg and splice paths and
 * consumed by unix_stream_read_generic().
 */
2595 struct unix_stream_read_state {
	/* Moves up to "chunk" bytes of the skb, starting "skip" bytes past
	 * its consumed offset; returns the byte count or a negative value.
	 */
2596 	int (*recv_actor)(struct sk_buff *, int, int,
2597 			  struct unix_stream_read_state *);
	/* Socket being read from. */
2598 	struct socket *socket;
	/* Destination for recvmsg; left unset by the splice path. */
2599 	struct msghdr *msg;
	/* Destination pipe, used by the splice actor. */
2600 	struct pipe_inode_info *pipe;
	/* Maximum number of bytes to read. */
2601 	size_t size;
	/* MSG_* flags for this read. */
2602 	int flags;
	/* Flags forwarded to skb_splice_bits() by the splice actor. */
2603 	unsigned int splice_flags;
2604 };
2605 
2606 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * unix_stream_recv_urg - serve a MSG_OOB read from the pending OOB skb.
 * @state: read parameters; state->recv_actor receives the one OOB byte
 *
 * Returns 1 on success (MSG_OOB is set in state->msg->msg_flags), -EINVAL
 * if SOCK_URGINLINE is set or no OOB skb is pending, and -EFAULT if the
 * actor reports a failure.
 */
2607 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2608 {
2609 	struct socket *sock = state->socket;
2610 	struct sock *sk = sock->sk;
2611 	struct unix_sock *u = unix_sk(sk);
2612 	int chunk = 1;
2613 	struct sk_buff *oob_skb;
2614 
2615 	mutex_lock(&u->iolock);
2616 	unix_state_lock(sk);
2617 
2618 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2619 		unix_state_unlock(sk);
2620 		mutex_unlock(&u->iolock);
2621 		return -EINVAL;
2622 	}
2623 
2624 	oob_skb = u->oob_skb;
2625 
	/* A real read takes over the reference that went with u->oob_skb;
	 * a peek takes a temporary one. Either is dropped by consume_skb()
	 * below.
	 */
2626 	if (!(state->flags & MSG_PEEK))
2627 		WRITE_ONCE(u->oob_skb, NULL);
2628 	else
2629 		skb_get(oob_skb);
2630 	unix_state_unlock(sk);
2631 
2632 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2633 
	/* Mark the byte as read so unix_skb_len() no longer counts it. */
2634 	if (!(state->flags & MSG_PEEK))
2635 		UNIXCB(oob_skb).consumed += 1;
2636 
2637 	consume_skb(oob_skb);
2638 
2639 	mutex_unlock(&u->iolock);
2640 
2641 	if (chunk < 0)
2642 		return -EFAULT;
2643 
2644 	state->msg->msg_flags |= MSG_OOB;
2645 	return 1;
2646 }
2647 
/*
 * manage_oob - decide how an ordinary (non-MSG_OOB) read treats @skb.
 * @skb:    skb at the head of @sk's receive queue
 * @sk:     receiving socket
 * @flags:  MSG_* flags of the read
 * @copied: bytes already copied by this read
 *
 * Called by unix_stream_read_generic() with the socket's state lock held.
 * Returns the skb the caller should read from, or NULL. On NULL the caller
 * ends the read if @copied is non-zero and otherwise looks at the queue
 * again.
 */
2648 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2649 				  int flags, int copied)
2650 {
2651 	struct unix_sock *u = unix_sk(sk);
2652 
	/* Nothing left to read in this skb: remove it unless peeking. */
2653 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2654 		skb_unlink(skb, &sk->sk_receive_queue);
2655 		consume_skb(skb);
2656 		skb = NULL;
2657 	} else {
2658 		if (skb == u->oob_skb) {
2659 			if (copied) {
				/* Data was already returned; stop before the OOB byte. */
2660 				skb = NULL;
2661 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				/* Inline mode: the skb is returned and read as
				 * normal data; only the OOB pointer and its
				 * reference are released here.
				 */
2662 				if (!(flags & MSG_PEEK)) {
2663 					WRITE_ONCE(u->oob_skb, NULL);
2664 					consume_skb(skb);
2665 				}
2666 			} else if (!(flags & MSG_PEEK)) {
				/* Take the OOB skb off the queue and carry on
				 * with the new head. u->oob_skb is not changed
				 * on this path.
				 */
2667 				skb_unlink(skb, &sk->sk_receive_queue);
2668 				consume_skb(skb);
2669 				skb = skb_peek(&sk->sk_receive_queue);
2670 			}
2671 		}
2672 	}
2673 	return skb;
2674 }
2675 #endif
2676 
2677 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2678 {
2679 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2680 		return -ENOTCONN;
2681 
2682 	return unix_read_skb(sk, recv_actor);
2683 }
2684 
2685 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2686 				    bool freezable)
2687 {
2688 	struct scm_cookie scm;
2689 	struct socket *sock = state->socket;
2690 	struct sock *sk = sock->sk;
2691 	struct unix_sock *u = unix_sk(sk);
2692 	int copied = 0;
2693 	int flags = state->flags;
2694 	int noblock = flags & MSG_DONTWAIT;
2695 	bool check_creds = false;
2696 	int target;
2697 	int err = 0;
2698 	long timeo;
2699 	int skip;
2700 	size_t size = state->size;
2701 	unsigned int last_len;
2702 
2703 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2704 		err = -EINVAL;
2705 		goto out;
2706 	}
2707 
2708 	if (unlikely(flags & MSG_OOB)) {
2709 		err = -EOPNOTSUPP;
2710 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2711 		err = unix_stream_recv_urg(state);
2712 #endif
2713 		goto out;
2714 	}
2715 
2716 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2717 	timeo = sock_rcvtimeo(sk, noblock);
2718 
2719 	memset(&scm, 0, sizeof(scm));
2720 
2721 	/* Lock the socket to prevent queue disordering
2722 	 * while sleeps in memcpy_tomsg
2723 	 */
2724 	mutex_lock(&u->iolock);
2725 
2726 	skip = max(sk_peek_offset(sk, flags), 0);
2727 
2728 	do {
2729 		int chunk;
2730 		bool drop_skb;
2731 		struct sk_buff *skb, *last;
2732 
2733 redo:
2734 		unix_state_lock(sk);
2735 		if (sock_flag(sk, SOCK_DEAD)) {
2736 			err = -ECONNRESET;
2737 			goto unlock;
2738 		}
2739 		last = skb = skb_peek(&sk->sk_receive_queue);
2740 		last_len = last ? last->len : 0;
2741 
2742 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2743 		if (skb) {
2744 			skb = manage_oob(skb, sk, flags, copied);
2745 			if (!skb) {
2746 				unix_state_unlock(sk);
2747 				if (copied)
2748 					break;
2749 				goto redo;
2750 			}
2751 		}
2752 #endif
2753 again:
2754 		if (skb == NULL) {
2755 			if (copied >= target)
2756 				goto unlock;
2757 
2758 			/*
2759 			 *	POSIX 1003.1g mandates this order.
2760 			 */
2761 
2762 			err = sock_error(sk);
2763 			if (err)
2764 				goto unlock;
2765 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2766 				goto unlock;
2767 
2768 			unix_state_unlock(sk);
2769 			if (!timeo) {
2770 				err = -EAGAIN;
2771 				break;
2772 			}
2773 
2774 			mutex_unlock(&u->iolock);
2775 
2776 			timeo = unix_stream_data_wait(sk, timeo, last,
2777 						      last_len, freezable);
2778 
2779 			if (signal_pending(current)) {
2780 				err = sock_intr_errno(timeo);
2781 				scm_destroy(&scm);
2782 				goto out;
2783 			}
2784 
2785 			mutex_lock(&u->iolock);
2786 			goto redo;
2787 unlock:
2788 			unix_state_unlock(sk);
2789 			break;
2790 		}
2791 
2792 		while (skip >= unix_skb_len(skb)) {
2793 			skip -= unix_skb_len(skb);
2794 			last = skb;
2795 			last_len = skb->len;
2796 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2797 			if (!skb)
2798 				goto again;
2799 		}
2800 
2801 		unix_state_unlock(sk);
2802 
2803 		if (check_creds) {
2804 			/* Never glue messages from different writers */
2805 			if (!unix_skb_scm_eq(skb, &scm))
2806 				break;
2807 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2808 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2809 			/* Copy credentials */
2810 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2811 			unix_set_secdata(&scm, skb);
2812 			check_creds = true;
2813 		}
2814 
2815 		/* Copy address just once */
2816 		if (state->msg && state->msg->msg_name) {
2817 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2818 					 state->msg->msg_name);
2819 			unix_copy_addr(state->msg, skb->sk);
2820 
2821 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2822 							      state->msg->msg_name,
2823 							      &state->msg->msg_namelen);
2824 
2825 			sunaddr = NULL;
2826 		}
2827 
2828 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2829 		skb_get(skb);
2830 		chunk = state->recv_actor(skb, skip, chunk, state);
2831 		drop_skb = !unix_skb_len(skb);
2832 		/* skb is only safe to use if !drop_skb */
2833 		consume_skb(skb);
2834 		if (chunk < 0) {
2835 			if (copied == 0)
2836 				copied = -EFAULT;
2837 			break;
2838 		}
2839 		copied += chunk;
2840 		size -= chunk;
2841 
2842 		if (drop_skb) {
2843 			/* the skb was touched by a concurrent reader;
2844 			 * we should not expect anything from this skb
2845 			 * anymore and assume it invalid - we can be
2846 			 * sure it was dropped from the socket queue
2847 			 *
2848 			 * let's report a short read
2849 			 */
2850 			err = 0;
2851 			break;
2852 		}
2853 
2854 		/* Mark read part of skb as used */
2855 		if (!(flags & MSG_PEEK)) {
2856 			UNIXCB(skb).consumed += chunk;
2857 
2858 			sk_peek_offset_bwd(sk, chunk);
2859 
2860 			if (UNIXCB(skb).fp) {
2861 				scm_stat_del(sk, skb);
2862 				unix_detach_fds(&scm, skb);
2863 			}
2864 
2865 			if (unix_skb_len(skb))
2866 				break;
2867 
2868 			skb_unlink(skb, &sk->sk_receive_queue);
2869 			consume_skb(skb);
2870 
2871 			if (scm.fp)
2872 				break;
2873 		} else {
2874 			/* It is questionable, see note in unix_dgram_recvmsg.
2875 			 */
2876 			if (UNIXCB(skb).fp)
2877 				unix_peek_fds(&scm, skb);
2878 
2879 			sk_peek_offset_fwd(sk, chunk);
2880 
2881 			if (UNIXCB(skb).fp)
2882 				break;
2883 
2884 			skip = 0;
2885 			last = skb;
2886 			last_len = skb->len;
2887 			unix_state_lock(sk);
2888 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2889 			if (skb)
2890 				goto again;
2891 			unix_state_unlock(sk);
2892 			break;
2893 		}
2894 	} while (size);
2895 
2896 	mutex_unlock(&u->iolock);
2897 	if (state->msg)
2898 		scm_recv_unix(sock, state->msg, &scm, flags);
2899 	else
2900 		scm_destroy(&scm);
2901 out:
2902 	return copied ? : err;
2903 }
2904 
2905 static int unix_stream_read_actor(struct sk_buff *skb,
2906 				  int skip, int chunk,
2907 				  struct unix_stream_read_state *state)
2908 {
2909 	int ret;
2910 
2911 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2912 				    state->msg, chunk);
2913 	return ret ?: chunk;
2914 }
2915 
2916 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2917 			  size_t size, int flags)
2918 {
2919 	struct unix_stream_read_state state = {
2920 		.recv_actor = unix_stream_read_actor,
2921 		.socket = sk->sk_socket,
2922 		.msg = msg,
2923 		.size = size,
2924 		.flags = flags
2925 	};
2926 
2927 	return unix_stream_read_generic(&state, true);
2928 }
2929 
2930 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2931 			       size_t size, int flags)
2932 {
2933 	struct unix_stream_read_state state = {
2934 		.recv_actor = unix_stream_read_actor,
2935 		.socket = sock,
2936 		.msg = msg,
2937 		.size = size,
2938 		.flags = flags
2939 	};
2940 
2941 #ifdef CONFIG_BPF_SYSCALL
2942 	struct sock *sk = sock->sk;
2943 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2944 
2945 	if (prot != &unix_stream_proto)
2946 		return prot->recvmsg(sk, msg, size, flags, NULL);
2947 #endif
2948 	return unix_stream_read_generic(&state, true);
2949 }
2950 
2951 static int unix_stream_splice_actor(struct sk_buff *skb,
2952 				    int skip, int chunk,
2953 				    struct unix_stream_read_state *state)
2954 {
2955 	return skb_splice_bits(skb, state->socket->sk,
2956 			       UNIXCB(skb).consumed + skip,
2957 			       state->pipe, chunk, state->splice_flags);
2958 }
2959 
2960 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2961 				       struct pipe_inode_info *pipe,
2962 				       size_t size, unsigned int flags)
2963 {
2964 	struct unix_stream_read_state state = {
2965 		.recv_actor = unix_stream_splice_actor,
2966 		.socket = sock,
2967 		.pipe = pipe,
2968 		.size = size,
2969 		.splice_flags = flags,
2970 	};
2971 
2972 	if (unlikely(*ppos))
2973 		return -ESPIPE;
2974 
2975 	if (sock->file->f_flags & O_NONBLOCK ||
2976 	    flags & SPLICE_F_NONBLOCK)
2977 		state.flags = MSG_DONTWAIT;
2978 
2979 	return unix_stream_read_generic(&state, false);
2980 }
2981 
/*
 * unix_shutdown - shutdown() for unix sockets.
 * @sock: socket being shut down
 * @mode: SHUT_RD, SHUT_WR or SHUT_RDWR
 *
 * Sets the requested shutdown bits on @sock. For SOCK_STREAM and
 * SOCK_SEQPACKET sockets with a peer, the mirrored bits are set on the peer
 * as well and the peer is notified.
 *
 * Returns 0, or -EINVAL if @mode is out of range.
 */
2982 static int unix_shutdown(struct socket *sock, int mode)
2983 {
2984 	struct sock *sk = sock->sk;
2985 	struct sock *other;
2986 
2987 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2988 		return -EINVAL;
2989 	/* This maps:
2990 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2991 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2992 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2993 	 */
2994 	++mode;
2995 
2996 	unix_state_lock(sk);
2997 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2998 	other = unix_peer(sk);
	/* Hold the peer so it can be used after the lock is dropped. */
2999 	if (other)
3000 		sock_hold(other);
3001 	unix_state_unlock(sk);
3002 	sk->sk_state_change(sk);
3003 
3004 	if (other &&
3005 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3006 
3007 		int peer_mode = 0;
3008 		const struct proto *prot = READ_ONCE(other->sk_prot);
3009 
3010 		if (prot->unhash)
3011 			prot->unhash(other);
		/* Our receive side is the peer's send side and vice versa. */
3012 		if (mode&RCV_SHUTDOWN)
3013 			peer_mode |= SEND_SHUTDOWN;
3014 		if (mode&SEND_SHUTDOWN)
3015 			peer_mode |= RCV_SHUTDOWN;
3016 		unix_state_lock(other);
3017 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3018 		unix_state_unlock(other);
3019 		other->sk_state_change(other);
3020 		if (peer_mode == SHUTDOWN_MASK)
3021 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3022 		else if (peer_mode & RCV_SHUTDOWN)
3023 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3024 	}
	/* Drop the reference taken under the state lock above. */
3025 	if (other)
3026 		sock_put(other);
3027 
3028 	return 0;
3029 }
3030 
3031 long unix_inq_len(struct sock *sk)
3032 {
3033 	struct sk_buff *skb;
3034 	long amount = 0;
3035 
3036 	if (sk->sk_state == TCP_LISTEN)
3037 		return -EINVAL;
3038 
3039 	spin_lock(&sk->sk_receive_queue.lock);
3040 	if (sk->sk_type == SOCK_STREAM ||
3041 	    sk->sk_type == SOCK_SEQPACKET) {
3042 		skb_queue_walk(&sk->sk_receive_queue, skb)
3043 			amount += unix_skb_len(skb);
3044 	} else {
3045 		skb = skb_peek(&sk->sk_receive_queue);
3046 		if (skb)
3047 			amount = skb->len;
3048 	}
3049 	spin_unlock(&sk->sk_receive_queue.lock);
3050 
3051 	return amount;
3052 }
3053 EXPORT_SYMBOL_GPL(unix_inq_len);
3054 
/* Report @sk's send-side memory usage as given by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3060 
3061 static int unix_open_file(struct sock *sk)
3062 {
3063 	struct path path;
3064 	struct file *f;
3065 	int fd;
3066 
3067 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3068 		return -EPERM;
3069 
3070 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3071 		return -ENOENT;
3072 
3073 	path = unix_sk(sk)->path;
3074 	if (!path.dentry)
3075 		return -ENOENT;
3076 
3077 	path_get(&path);
3078 
3079 	fd = get_unused_fd_flags(O_CLOEXEC);
3080 	if (fd < 0)
3081 		goto out;
3082 
3083 	f = dentry_open(&path, O_PATH, current_cred());
3084 	if (IS_ERR(f)) {
3085 		put_unused_fd(fd);
3086 		fd = PTR_ERR(f);
3087 		goto out;
3088 	}
3089 
3090 	fd_install(fd, f);
3091 out:
3092 	path_put(&path);
3093 
3094 	return fd;
3095 }
3096 
3097 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3098 {
3099 	struct sock *sk = sock->sk;
3100 	long amount = 0;
3101 	int err;
3102 
3103 	switch (cmd) {
3104 	case SIOCOUTQ:
3105 		amount = unix_outq_len(sk);
3106 		err = put_user(amount, (int __user *)arg);
3107 		break;
3108 	case SIOCINQ:
3109 		amount = unix_inq_len(sk);
3110 		if (amount < 0)
3111 			err = amount;
3112 		else
3113 			err = put_user(amount, (int __user *)arg);
3114 		break;
3115 	case SIOCUNIXFILE:
3116 		err = unix_open_file(sk);
3117 		break;
3118 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3119 	case SIOCATMARK:
3120 		{
3121 			struct sk_buff *skb;
3122 			int answ = 0;
3123 
3124 			skb = skb_peek(&sk->sk_receive_queue);
3125 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3126 				answ = 1;
3127 			err = put_user(answ, (int __user *)arg);
3128 		}
3129 		break;
3130 #endif
3131 	default:
3132 		err = -ENOIOCTLCMD;
3133 		break;
3134 	}
3135 	return err;
3136 }
3137 
3138 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert the argument with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3143 #endif
3144 
3145 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3146 {
3147 	struct sock *sk = sock->sk;
3148 	__poll_t mask;
3149 	u8 shutdown;
3150 
3151 	sock_poll_wait(file, sock, wait);
3152 	mask = 0;
3153 	shutdown = READ_ONCE(sk->sk_shutdown);
3154 
3155 	/* exceptional events? */
3156 	if (READ_ONCE(sk->sk_err))
3157 		mask |= EPOLLERR;
3158 	if (shutdown == SHUTDOWN_MASK)
3159 		mask |= EPOLLHUP;
3160 	if (shutdown & RCV_SHUTDOWN)
3161 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3162 
3163 	/* readable? */
3164 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3165 		mask |= EPOLLIN | EPOLLRDNORM;
3166 	if (sk_is_readable(sk))
3167 		mask |= EPOLLIN | EPOLLRDNORM;
3168 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3169 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3170 		mask |= EPOLLPRI;
3171 #endif
3172 
3173 	/* Connection-based need to check for termination and startup */
3174 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3175 	    sk->sk_state == TCP_CLOSE)
3176 		mask |= EPOLLHUP;
3177 
3178 	/*
3179 	 * we set writable also when the other side has shut down the
3180 	 * connection. This prevents stuck sockets.
3181 	 */
3182 	if (unix_writable(sk))
3183 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3184 
3185 	return mask;
3186 }
3187 
3188 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3189 				    poll_table *wait)
3190 {
3191 	struct sock *sk = sock->sk, *other;
3192 	unsigned int writable;
3193 	__poll_t mask;
3194 	u8 shutdown;
3195 
3196 	sock_poll_wait(file, sock, wait);
3197 	mask = 0;
3198 	shutdown = READ_ONCE(sk->sk_shutdown);
3199 
3200 	/* exceptional events? */
3201 	if (READ_ONCE(sk->sk_err) ||
3202 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3203 		mask |= EPOLLERR |
3204 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3205 
3206 	if (shutdown & RCV_SHUTDOWN)
3207 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3208 	if (shutdown == SHUTDOWN_MASK)
3209 		mask |= EPOLLHUP;
3210 
3211 	/* readable? */
3212 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3213 		mask |= EPOLLIN | EPOLLRDNORM;
3214 	if (sk_is_readable(sk))
3215 		mask |= EPOLLIN | EPOLLRDNORM;
3216 
3217 	/* Connection-based need to check for termination and startup */
3218 	if (sk->sk_type == SOCK_SEQPACKET) {
3219 		if (sk->sk_state == TCP_CLOSE)
3220 			mask |= EPOLLHUP;
3221 		/* connection hasn't started yet? */
3222 		if (sk->sk_state == TCP_SYN_SENT)
3223 			return mask;
3224 	}
3225 
3226 	/* No write status requested, avoid expensive OUT tests. */
3227 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3228 		return mask;
3229 
3230 	writable = unix_writable(sk);
3231 	if (writable) {
3232 		unix_state_lock(sk);
3233 
3234 		other = unix_peer(sk);
3235 		if (other && unix_peer(other) != sk &&
3236 		    unix_recvq_full_lockless(other) &&
3237 		    unix_dgram_peer_wake_me(sk, other))
3238 			writable = 0;
3239 
3240 		unix_state_unlock(sk);
3241 	}
3242 
3243 	if (writable)
3244 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3245 	else
3246 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3247 
3248 	return mask;
3249 }
3250 
3251 #ifdef CONFIG_PROC_FS
3252 
/*
 * The seq_file position packs two values into one unsigned long: the hash
 * bucket index in the bits above BUCKET_SPACE and, in the low BUCKET_SPACE
 * bits, the offset of the socket within that bucket. Offsets count from 1
 * (see unix_from_bucket()); position 0 is the header line.
 */
3253 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3254 
/* Split a position into its bucket and offset, or build one from them. */
3255 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3256 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3257 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3258 
3259 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3260 {
3261 	unsigned long offset = get_offset(*pos);
3262 	unsigned long bucket = get_bucket(*pos);
3263 	unsigned long count = 0;
3264 	struct sock *sk;
3265 
3266 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3267 	     sk; sk = sk_next(sk)) {
3268 		if (++count == offset)
3269 			break;
3270 	}
3271 
3272 	return sk;
3273 }
3274 
3275 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3276 {
3277 	unsigned long bucket = get_bucket(*pos);
3278 	struct net *net = seq_file_net(seq);
3279 	struct sock *sk;
3280 
3281 	while (bucket < UNIX_HASH_SIZE) {
3282 		spin_lock(&net->unx.table.locks[bucket]);
3283 
3284 		sk = unix_from_bucket(seq, pos);
3285 		if (sk)
3286 			return sk;
3287 
3288 		spin_unlock(&net->unx.table.locks[bucket]);
3289 
3290 		*pos = set_bucket_offset(++bucket, 1);
3291 	}
3292 
3293 	return NULL;
3294 }
3295 
3296 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3297 				  loff_t *pos)
3298 {
3299 	unsigned long bucket = get_bucket(*pos);
3300 
3301 	sk = sk_next(sk);
3302 	if (sk)
3303 		return sk;
3304 
3305 
3306 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3307 
3308 	*pos = set_bucket_offset(++bucket, 1);
3309 
3310 	return unix_get_first(seq, pos);
3311 }
3312 
3313 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3314 {
3315 	if (!*pos)
3316 		return SEQ_START_TOKEN;
3317 
3318 	return unix_get_first(seq, pos);
3319 }
3320 
3321 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3322 {
3323 	++*pos;
3324 
3325 	if (v == SEQ_START_TOKEN)
3326 		return unix_get_first(seq, pos);
3327 
3328 	return unix_get_next(seq, v, pos);
3329 }
3330 
3331 static void unix_seq_stop(struct seq_file *seq, void *v)
3332 {
3333 	struct sock *sk = v;
3334 
3335 	if (sk)
3336 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3337 }
3338 
3339 static int unix_seq_show(struct seq_file *seq, void *v)
3340 {
3341 
3342 	if (v == SEQ_START_TOKEN)
3343 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3344 			 "Inode Path\n");
3345 	else {
3346 		struct sock *s = v;
3347 		struct unix_sock *u = unix_sk(s);
3348 		unix_state_lock(s);
3349 
3350 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3351 			s,
3352 			refcount_read(&s->sk_refcnt),
3353 			0,
3354 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3355 			s->sk_type,
3356 			s->sk_socket ?
3357 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3358 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3359 			sock_i_ino(s));
3360 
3361 		if (u->addr) {	// under a hash table lock here
3362 			int i, len;
3363 			seq_putc(seq, ' ');
3364 
3365 			i = 0;
3366 			len = u->addr->len -
3367 				offsetof(struct sockaddr_un, sun_path);
3368 			if (u->addr->name->sun_path[0]) {
3369 				len--;
3370 			} else {
3371 				seq_putc(seq, '@');
3372 				i++;
3373 			}
3374 			for ( ; i < len; i++)
3375 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3376 					 '@');
3377 		}
3378 		unix_state_unlock(s);
3379 		seq_putc(seq, '\n');
3380 	}
3381 
3382 	return 0;
3383 }
3384 
/* seq_file callbacks for the "unix" proc entry created in unix_net_init(). */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3391 
3392 #ifdef CONFIG_BPF_SYSCALL
/*
 * Private state of one BPF unix socket iterator.  Sockets are collected
 * bucket by bucket into @batch, each with a reference held.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;	/* NOTE(review): presumably must stay first for seq_file_net() - confirm */
	unsigned int cur_sk;		/* index in batch[] of the socket currently being shown */
	unsigned int end_sk;		/* one past the last socket stored in batch[] */
	unsigned int max_sk;		/* number of entries batch[] can hold */
	struct sock **batch;		/* held sockets gathered from one bucket */
	bool st_bucket_done;		/* set when every socket counted in a bucket fit in batch[] */
};
3401 
/* Context handed to the BPF program; filled in by unix_prog_seq_show(). */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);	/* iterator metadata */
	__bpf_md_ptr(struct unix_sock *, unix_sk);	/* socket being visited, NULL on the final call from stop() */
	uid_t uid __aligned(8);				/* uid reported for the socket */
};
3407 
3408 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3409 			      struct unix_sock *unix_sk, uid_t uid)
3410 {
3411 	struct bpf_iter__unix ctx;
3412 
3413 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3414 	ctx.meta = meta;
3415 	ctx.unix_sk = unix_sk;
3416 	ctx.uid = uid;
3417 	return bpf_iter_run_prog(prog, &ctx);
3418 }
3419 
3420 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3421 
3422 {
3423 	struct bpf_unix_iter_state *iter = seq->private;
3424 	unsigned int expected = 1;
3425 	struct sock *sk;
3426 
3427 	sock_hold(start_sk);
3428 	iter->batch[iter->end_sk++] = start_sk;
3429 
3430 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3431 		if (iter->end_sk < iter->max_sk) {
3432 			sock_hold(sk);
3433 			iter->batch[iter->end_sk++] = sk;
3434 		}
3435 
3436 		expected++;
3437 	}
3438 
3439 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3440 
3441 	return expected;
3442 }
3443 
3444 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3445 {
3446 	while (iter->cur_sk < iter->end_sk)
3447 		sock_put(iter->batch[iter->cur_sk++]);
3448 }
3449 
3450 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3451 				       unsigned int new_batch_sz)
3452 {
3453 	struct sock **new_batch;
3454 
3455 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3456 			     GFP_USER | __GFP_NOWARN);
3457 	if (!new_batch)
3458 		return -ENOMEM;
3459 
3460 	bpf_iter_unix_put_batch(iter);
3461 	kvfree(iter->batch);
3462 	iter->batch = new_batch;
3463 	iter->max_sk = new_batch_sz;
3464 
3465 	return 0;
3466 }
3467 
3468 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3469 					loff_t *pos)
3470 {
3471 	struct bpf_unix_iter_state *iter = seq->private;
3472 	unsigned int expected;
3473 	bool resized = false;
3474 	struct sock *sk;
3475 
3476 	if (iter->st_bucket_done)
3477 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3478 
3479 again:
3480 	/* Get a new batch */
3481 	iter->cur_sk = 0;
3482 	iter->end_sk = 0;
3483 
3484 	sk = unix_get_first(seq, pos);
3485 	if (!sk)
3486 		return NULL; /* Done */
3487 
3488 	expected = bpf_iter_unix_hold_batch(seq, sk);
3489 
3490 	if (iter->end_sk == expected) {
3491 		iter->st_bucket_done = true;
3492 		return sk;
3493 	}
3494 
3495 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3496 		resized = true;
3497 		goto again;
3498 	}
3499 
3500 	return sk;
3501 }
3502 
3503 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3504 {
3505 	if (!*pos)
3506 		return SEQ_START_TOKEN;
3507 
3508 	/* bpf iter does not support lseek, so it always
3509 	 * continue from where it was stop()-ped.
3510 	 */
3511 	return bpf_iter_unix_batch(seq, pos);
3512 }
3513 
3514 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3515 {
3516 	struct bpf_unix_iter_state *iter = seq->private;
3517 	struct sock *sk;
3518 
3519 	/* Whenever seq_next() is called, the iter->cur_sk is
3520 	 * done with seq_show(), so advance to the next sk in
3521 	 * the batch.
3522 	 */
3523 	if (iter->cur_sk < iter->end_sk)
3524 		sock_put(iter->batch[iter->cur_sk++]);
3525 
3526 	++*pos;
3527 
3528 	if (iter->cur_sk < iter->end_sk)
3529 		sk = iter->batch[iter->cur_sk];
3530 	else
3531 		sk = bpf_iter_unix_batch(seq, pos);
3532 
3533 	return sk;
3534 }
3535 
3536 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3537 {
3538 	struct bpf_iter_meta meta;
3539 	struct bpf_prog *prog;
3540 	struct sock *sk = v;
3541 	uid_t uid;
3542 	bool slow;
3543 	int ret;
3544 
3545 	if (v == SEQ_START_TOKEN)
3546 		return 0;
3547 
3548 	slow = lock_sock_fast(sk);
3549 
3550 	if (unlikely(sk_unhashed(sk))) {
3551 		ret = SEQ_SKIP;
3552 		goto unlock;
3553 	}
3554 
3555 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3556 	meta.seq = seq;
3557 	prog = bpf_iter_get_info(&meta, false);
3558 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3559 unlock:
3560 	unlock_sock_fast(sk, slow);
3561 	return ret;
3562 }
3563 
3564 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3565 {
3566 	struct bpf_unix_iter_state *iter = seq->private;
3567 	struct bpf_iter_meta meta;
3568 	struct bpf_prog *prog;
3569 
3570 	if (!v) {
3571 		meta.seq = seq;
3572 		prog = bpf_iter_get_info(&meta, true);
3573 		if (prog)
3574 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3575 	}
3576 
3577 	if (iter->cur_sk < iter->end_sk)
3578 		bpf_iter_unix_put_batch(iter);
3579 }
3580 
/* seq_file callbacks used by the BPF unix socket iterator (see unix_seq_info). */
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
3587 #endif
3588 #endif
3589 
/* PF_UNIX protocol family descriptor, passed to sock_register() in af_unix_init(). */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
3595 
3596 
/*
 * Per-network-namespace setup: set the default datagram queue length,
 * register the sysctls, create the "unix" proc entry (CONFIG_PROC_FS only)
 * and allocate and initialise the namespace's socket hash table (one
 * spinlock and one hlist head per bucket).
 *
 * Returns 0 on success.  Any failure undoes the steps already completed
 * and returns -ENOMEM.
 */
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

	/* Error unwinding, in reverse order of the setup above. */
free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
3640 
/*
 * Per-network-namespace teardown: free the socket hash table (buckets and
 * locks), unregister the sysctls and remove the "unix" proc entry.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3648 
/* Per-namespace init/exit hooks, registered from af_unix_init(). */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
3653 
3654 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Declare the "unix" BPF iterator target; the arguments mirror the fields
 * of struct bpf_iter__unix.
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Number of batch entries allocated when an iterator is set up. */
#define INIT_BATCH_SZ 16
3659 
3660 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3661 {
3662 	struct bpf_unix_iter_state *iter = priv_data;
3663 	int err;
3664 
3665 	err = bpf_iter_init_seq_net(priv_data, aux);
3666 	if (err)
3667 		return err;
3668 
3669 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3670 	if (err) {
3671 		bpf_iter_fini_seq_net(priv_data);
3672 		return err;
3673 	}
3674 
3675 	return 0;
3676 }
3677 
/*
 * Release the private data of a BPF unix iterator: tear down the seq_net
 * part and free the batch array.  No socket references are dropped here.
 */
static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3685 
/*
 * seq_file description of the BPF unix iterator: its callbacks, the
 * init/fini hooks for the private data and the size of that data.
 */
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};
3692 
3693 static const struct bpf_func_proto *
3694 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3695 			     const struct bpf_prog *prog)
3696 {
3697 	switch (func_id) {
3698 	case BPF_FUNC_setsockopt:
3699 		return &bpf_sk_setsockopt_proto;
3700 	case BPF_FUNC_getsockopt:
3701 		return &bpf_sk_getsockopt_proto;
3702 	default:
3703 		return NULL;
3704 	}
3705 }
3706 
/*
 * Registration record of the "unix" BPF iterator target.  The btf_id of
 * the single context argument (the unix_sk pointer, which may be NULL) is
 * filled in by bpf_iter_register() before registration.
 */
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto         = bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};
3717 
3718 static void __init bpf_iter_register(void)
3719 {
3720 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3721 	if (bpf_iter_reg_target(&unix_reg_info))
3722 		pr_warn("Warning: could not register bpf iterator unix\n");
3723 }
3724 #endif
3725 
3726 static int __init af_unix_init(void)
3727 {
3728 	int i, rc = -1;
3729 
3730 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3731 
3732 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3733 		spin_lock_init(&bsd_socket_locks[i]);
3734 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3735 	}
3736 
3737 	rc = proto_register(&unix_dgram_proto, 1);
3738 	if (rc != 0) {
3739 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3740 		goto out;
3741 	}
3742 
3743 	rc = proto_register(&unix_stream_proto, 1);
3744 	if (rc != 0) {
3745 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3746 		proto_unregister(&unix_dgram_proto);
3747 		goto out;
3748 	}
3749 
3750 	sock_register(&unix_family_ops);
3751 	register_pernet_subsys(&unix_net_ops);
3752 	unix_bpf_build_proto();
3753 
3754 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3755 	bpf_iter_register();
3756 #endif
3757 
3758 out:
3759 	return rc;
3760 }
3761 
3762 /* Later than subsys_initcall() because we depend on stuff initialised there */
3763 fs_initcall(af_unix_init);
3764