xref: /linux/net/unix/af_unix.c (revision c2dfe29f30d8850af324449f416491b171af19aa)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					is been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 #include "scm.h"
122 
/* Count of AF_UNIX sockets: bumped in unix_create1(), which refuses new
 * sockets once the count exceeds 2 * get_max_files(), and dropped again in
 * unix_sock_destructor().
 */
static atomic_long_t unix_nr_socks;
/* Buckets, and one spinlock per bucket, for the sockets added through
 * unix_insert_bsd_socket() and looked up by inode in
 * unix_find_socket_byinode().
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
126 
/* SMP locking strategy:
 *    each hash table bucket is protected by its own spinlock.
 *    each socket's state is protected by a separate, per-socket spinlock.
 */
131 
132 static unsigned int unix_unbound_hash(struct sock *sk)
133 {
134 	unsigned long hash = (unsigned long)sk;
135 
136 	hash ^= hash >> 16;
137 	hash ^= hash >> 8;
138 	hash ^= sk->sk_type;
139 
140 	return hash & UNIX_HASH_MOD;
141 }
142 
143 static unsigned int unix_bsd_hash(struct inode *i)
144 {
145 	return i->i_ino & UNIX_HASH_MOD;
146 }
147 
148 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
149 				       int addr_len, int type)
150 {
151 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
152 	unsigned int hash;
153 
154 	hash = (__force unsigned int)csum_fold(csum);
155 	hash ^= hash >> 8;
156 	hash ^= type;
157 
158 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
159 }
160 
161 static void unix_table_double_lock(struct net *net,
162 				   unsigned int hash1, unsigned int hash2)
163 {
164 	if (hash1 == hash2) {
165 		spin_lock(&net->unx.table.locks[hash1]);
166 		return;
167 	}
168 
169 	if (hash1 > hash2)
170 		swap(hash1, hash2);
171 
172 	spin_lock(&net->unx.table.locks[hash1]);
173 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
174 }
175 
176 static void unix_table_double_unlock(struct net *net,
177 				     unsigned int hash1, unsigned int hash2)
178 {
179 	if (hash1 == hash2) {
180 		spin_unlock(&net->unx.table.locks[hash1]);
181 		return;
182 	}
183 
184 	spin_unlock(&net->unx.table.locks[hash1]);
185 	spin_unlock(&net->unx.table.locks[hash2]);
186 }
187 
188 #ifdef CONFIG_SECURITY_NETWORK
189 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
190 {
191 	UNIXCB(skb).secid = scm->secid;
192 }
193 
194 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
195 {
196 	scm->secid = UNIXCB(skb).secid;
197 }
198 
199 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
200 {
201 	return (scm->secid == UNIXCB(skb).secid);
202 }
203 #else
204 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
205 { }
206 
207 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
208 { }
209 
210 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
211 {
212 	return true;
213 }
214 #endif /* CONFIG_SECURITY_NETWORK */
215 
/* Peer pointer of a unix socket.  Expands to an lvalue: it is both read
 * and assigned through this macro (e.g. "unix_peer(sk) = NULL").
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
217 
/* Return non-zero when @osk's peer pointer refers to @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
222 
/* Return non-zero when @sk is allowed to send to @osk: either @osk has no
 * peer at all, or its peer is @sk itself.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
227 
228 static inline int unix_recvq_full(const struct sock *sk)
229 {
230 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
231 }
232 
233 static inline int unix_recvq_full_lockless(const struct sock *sk)
234 {
235 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
236 		READ_ONCE(sk->sk_max_ack_backlog);
237 }
238 
239 struct sock *unix_peer_get(struct sock *s)
240 {
241 	struct sock *peer;
242 
243 	unix_state_lock(s);
244 	peer = unix_peer(s);
245 	if (peer)
246 		sock_hold(peer);
247 	unix_state_unlock(s);
248 	return peer;
249 }
250 EXPORT_SYMBOL_GPL(unix_peer_get);
251 
252 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
253 					     int addr_len)
254 {
255 	struct unix_address *addr;
256 
257 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
258 	if (!addr)
259 		return NULL;
260 
261 	refcount_set(&addr->refcnt, 1);
262 	addr->len = addr_len;
263 	memcpy(addr->name, sunaddr, addr_len);
264 
265 	return addr;
266 }
267 
268 static inline void unix_release_addr(struct unix_address *addr)
269 {
270 	if (refcount_dec_and_test(&addr->refcnt))
271 		kfree(addr);
272 }
273 
/*
 *	Basic sanity check of a unix socket address:
 *		- it must be longer than the sockaddr_un header, i.e. carry at
 *		  least one byte of sun_path;
 *		- it must not be longer than struct sockaddr_un;
 *		- its family must be AF_UNIX.
 *
 *	The contents of sun_path are not examined here.
 *
 *	Returns 0 when the address passes, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
		return -EINVAL;

	if (addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
292 
293 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
294 {
295 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
296 	short offset = offsetof(struct sockaddr_storage, __data);
297 
298 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
299 
300 	/* This may look like an off by one error but it is a bit more
301 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
302 	 * sun_path[108] doesn't as such exist.  However in kernel space
303 	 * we are guaranteed that it is a valid memory location in our
304 	 * kernel address buffer because syscall functions always pass
305 	 * a pointer of struct sockaddr_storage which has a bigger buffer
306 	 * than 108.  Also, we must terminate sun_path for strlen() in
307 	 * getname_kernel().
308 	 */
309 	addr->__data[addr_len - offset] = 0;
310 
311 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
312 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
313 	 * know the actual buffer.
314 	 */
315 	return strlen(addr->__data) + offset + 1;
316 }
317 
/* Unlink @sk from its hash chain via sk_del_node_init().  No locking is
 * done here; unix_remove_socket() is the variant that takes the bucket
 * lock.
 */
static void __unix_remove_socket(struct sock *sk)
{
	(void)sk_del_node_init(sk);
}
322 
323 static void __unix_insert_socket(struct net *net, struct sock *sk)
324 {
325 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
326 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
327 }
328 
329 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
330 				 struct unix_address *addr, unsigned int hash)
331 {
332 	__unix_remove_socket(sk);
333 	smp_store_release(&unix_sk(sk)->addr, addr);
334 
335 	sk->sk_hash = hash;
336 	__unix_insert_socket(net, sk);
337 }
338 
339 static void unix_remove_socket(struct net *net, struct sock *sk)
340 {
341 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
342 	__unix_remove_socket(sk);
343 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
344 }
345 
346 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
347 {
348 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
349 	__unix_insert_socket(net, sk);
350 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
351 }
352 
353 static void unix_insert_bsd_socket(struct sock *sk)
354 {
355 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
356 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
357 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
358 }
359 
360 static void unix_remove_bsd_socket(struct sock *sk)
361 {
362 	if (!hlist_unhashed(&sk->sk_bind_node)) {
363 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
364 		__sk_del_bind_node(sk);
365 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
366 
367 		sk_node_init(&sk->sk_bind_node);
368 	}
369 }
370 
371 static struct sock *__unix_find_socket_byname(struct net *net,
372 					      struct sockaddr_un *sunname,
373 					      int len, unsigned int hash)
374 {
375 	struct sock *s;
376 
377 	sk_for_each(s, &net->unx.table.buckets[hash]) {
378 		struct unix_sock *u = unix_sk(s);
379 
380 		if (u->addr->len == len &&
381 		    !memcmp(u->addr->name, sunname, len))
382 			return s;
383 	}
384 	return NULL;
385 }
386 
387 static inline struct sock *unix_find_socket_byname(struct net *net,
388 						   struct sockaddr_un *sunname,
389 						   int len, unsigned int hash)
390 {
391 	struct sock *s;
392 
393 	spin_lock(&net->unx.table.locks[hash]);
394 	s = __unix_find_socket_byname(net, sunname, len, hash);
395 	if (s)
396 		sock_hold(s);
397 	spin_unlock(&net->unx.table.locks[hash]);
398 	return s;
399 }
400 
401 static struct sock *unix_find_socket_byinode(struct inode *i)
402 {
403 	unsigned int hash = unix_bsd_hash(i);
404 	struct sock *s;
405 
406 	spin_lock(&bsd_socket_locks[hash]);
407 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
408 		struct dentry *dentry = unix_sk(s)->path.dentry;
409 
410 		if (dentry && d_backing_inode(dentry) == i) {
411 			sock_hold(s);
412 			spin_unlock(&bsd_socket_locks[hash]);
413 			return s;
414 		}
415 	}
416 	spin_unlock(&bsd_socket_locks[hash]);
417 	return NULL;
418 }
419 
420 /* Support code for asymmetrically connected dgram sockets
421  *
422  * If a datagram socket is connected to a socket not itself connected
423  * to the first socket (eg, /dev/log), clients may only enqueue more
424  * messages if the present receive queue of the server socket is not
425  * "too large". This means there's a second writeability condition
426  * poll and sendmsg need to test. The dgram recv code will do a wake
427  * up on the peer_wait wait queue of a socket upon reception of a
428  * datagram which needs to be propagated to sleeping would-be writers
429  * since these might not have sent anything so far. This can't be
430  * accomplished via poll_wait because the lifetime of the server
431  * socket might be less than that of its clients if these break their
432  * association with it or if the server socket is closed while clients
 * are still connected to it, and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
435  *
436  * In order to propagate a wake up, a wait_queue_entry_t of the client
437  * socket is enqueued on the peer_wait queue of the server socket
438  * whose wake function does a wake_up on the ordinary client socket
439  * wait queue. This connection is established whenever a write (or
440  * poll for write) hit the flow control condition and broken when the
441  * association to the server socket is dissolved or after a wake up
442  * was relayed.
443  */
444 
445 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
446 				      void *key)
447 {
448 	struct unix_sock *u;
449 	wait_queue_head_t *u_sleep;
450 
451 	u = container_of(q, struct unix_sock, peer_wake);
452 
453 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
454 			    q);
455 	u->peer_wake.private = NULL;
456 
457 	/* relaying can only happen while the wq still exists */
458 	u_sleep = sk_sleep(&u->sk);
459 	if (u_sleep)
460 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
461 
462 	return 0;
463 }
464 
465 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
466 {
467 	struct unix_sock *u, *u_other;
468 	int rc;
469 
470 	u = unix_sk(sk);
471 	u_other = unix_sk(other);
472 	rc = 0;
473 	spin_lock(&u_other->peer_wait.lock);
474 
475 	if (!u->peer_wake.private) {
476 		u->peer_wake.private = other;
477 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
478 
479 		rc = 1;
480 	}
481 
482 	spin_unlock(&u_other->peer_wait.lock);
483 	return rc;
484 }
485 
486 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
487 					    struct sock *other)
488 {
489 	struct unix_sock *u, *u_other;
490 
491 	u = unix_sk(sk);
492 	u_other = unix_sk(other);
493 	spin_lock(&u_other->peer_wait.lock);
494 
495 	if (u->peer_wake.private == other) {
496 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
497 		u->peer_wake.private = NULL;
498 	}
499 
500 	spin_unlock(&u_other->peer_wait.lock);
501 }
502 
503 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
504 						   struct sock *other)
505 {
506 	unix_dgram_peer_wake_disconnect(sk, other);
507 	wake_up_interruptible_poll(sk_sleep(sk),
508 				   EPOLLOUT |
509 				   EPOLLWRNORM |
510 				   EPOLLWRBAND);
511 }
512 
513 /* preconditions:
514  *	- unix_peer(sk) == other
515  *	- association is stable
516  */
517 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
518 {
519 	int connected;
520 
521 	connected = unix_dgram_peer_wake_connect(sk, other);
522 
523 	/* If other is SOCK_DEAD, we want to make sure we signal
524 	 * POLLOUT, such that a subsequent write() can get a
525 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
526 	 * to other and its full, we will hang waiting for POLLOUT.
527 	 */
528 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
529 		return 1;
530 
531 	if (connected)
532 		unix_dgram_peer_wake_disconnect(sk, other);
533 
534 	return 0;
535 }
536 
537 static int unix_writable(const struct sock *sk)
538 {
539 	return sk->sk_state != TCP_LISTEN &&
540 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
541 }
542 
543 static void unix_write_space(struct sock *sk)
544 {
545 	struct socket_wq *wq;
546 
547 	rcu_read_lock();
548 	if (unix_writable(sk)) {
549 		wq = rcu_dereference(sk->sk_wq);
550 		if (skwq_has_sleeper(wq))
551 			wake_up_interruptible_sync_poll(&wq->wait,
552 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
553 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
554 	}
555 	rcu_read_unlock();
556 }
557 
558 /* When dgram socket disconnects (or changes its peer), we clear its receive
559  * queue of packets arrived from previous peer. First, it allows to do
560  * flow control based only on wmem_alloc; second, sk connected to peer
561  * may receive messages only from that peer. */
562 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
563 {
564 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
565 		skb_queue_purge(&sk->sk_receive_queue);
566 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
567 
568 		/* If one link of bidirectional dgram pipe is disconnected,
569 		 * we signal error. Messages are lost. Do not make this,
570 		 * when peer was not connected to us.
571 		 */
572 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
573 			WRITE_ONCE(other->sk_err, ECONNRESET);
574 			sk_error_report(other);
575 		}
576 	}
577 	other->sk_state = TCP_CLOSE;
578 }
579 
580 static void unix_sock_destructor(struct sock *sk)
581 {
582 	struct unix_sock *u = unix_sk(sk);
583 
584 	skb_queue_purge(&sk->sk_receive_queue);
585 
586 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
587 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
588 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
589 	if (!sock_flag(sk, SOCK_DEAD)) {
590 		pr_info("Attempt to release alive unix socket: %p\n", sk);
591 		return;
592 	}
593 
594 	if (u->addr)
595 		unix_release_addr(u->addr);
596 
597 	atomic_long_dec(&unix_nr_socks);
598 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
599 #ifdef UNIX_REFCNT_DEBUG
600 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
601 		atomic_long_read(&unix_nr_socks));
602 #endif
603 }
604 
605 static void unix_release_sock(struct sock *sk, int embrion)
606 {
607 	struct unix_sock *u = unix_sk(sk);
608 	struct sock *skpair;
609 	struct sk_buff *skb;
610 	struct path path;
611 	int state;
612 
613 	unix_remove_socket(sock_net(sk), sk);
614 	unix_remove_bsd_socket(sk);
615 
616 	/* Clear state */
617 	unix_state_lock(sk);
618 	sock_orphan(sk);
619 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
620 	path	     = u->path;
621 	u->path.dentry = NULL;
622 	u->path.mnt = NULL;
623 	state = sk->sk_state;
624 	sk->sk_state = TCP_CLOSE;
625 
626 	skpair = unix_peer(sk);
627 	unix_peer(sk) = NULL;
628 
629 	unix_state_unlock(sk);
630 
631 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
632 	if (u->oob_skb) {
633 		kfree_skb(u->oob_skb);
634 		u->oob_skb = NULL;
635 	}
636 #endif
637 
638 	wake_up_interruptible_all(&u->peer_wait);
639 
640 	if (skpair != NULL) {
641 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
642 			unix_state_lock(skpair);
643 			/* No more writes */
644 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
645 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
646 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
647 			unix_state_unlock(skpair);
648 			skpair->sk_state_change(skpair);
649 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
650 		}
651 
652 		unix_dgram_peer_wake_disconnect(sk, skpair);
653 		sock_put(skpair); /* It may now die */
654 	}
655 
656 	/* Try to flush out this socket. Throw out buffers at least */
657 
658 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
659 		if (state == TCP_LISTEN)
660 			unix_release_sock(skb->sk, 1);
661 		/* passed fds are erased in the kfree_skb hook	      */
662 		UNIXCB(skb).consumed = skb->len;
663 		kfree_skb(skb);
664 	}
665 
666 	if (path.dentry)
667 		path_put(&path);
668 
669 	sock_put(sk);
670 
671 	/* ---- Socket is dead now and most probably destroyed ---- */
672 
673 	/*
674 	 * Fixme: BSD difference: In BSD all sockets connected to us get
675 	 *	  ECONNRESET and we die on the spot. In Linux we behave
676 	 *	  like files and pipes do and wait for the last
677 	 *	  dereference.
678 	 *
679 	 * Can't we simply set sock->err?
680 	 *
681 	 *	  What the above comment does talk about? --ANK(980817)
682 	 */
683 
684 	if (READ_ONCE(unix_tot_inflight))
685 		unix_gc();		/* Garbage collect fds */
686 }
687 
688 static void init_peercred(struct sock *sk)
689 {
690 	const struct cred *old_cred;
691 	struct pid *old_pid;
692 
693 	spin_lock(&sk->sk_peer_lock);
694 	old_pid = sk->sk_peer_pid;
695 	old_cred = sk->sk_peer_cred;
696 	sk->sk_peer_pid  = get_pid(task_tgid(current));
697 	sk->sk_peer_cred = get_current_cred();
698 	spin_unlock(&sk->sk_peer_lock);
699 
700 	put_pid(old_pid);
701 	put_cred(old_cred);
702 }
703 
704 static void copy_peercred(struct sock *sk, struct sock *peersk)
705 {
706 	const struct cred *old_cred;
707 	struct pid *old_pid;
708 
709 	if (sk < peersk) {
710 		spin_lock(&sk->sk_peer_lock);
711 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
712 	} else {
713 		spin_lock(&peersk->sk_peer_lock);
714 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
715 	}
716 	old_pid = sk->sk_peer_pid;
717 	old_cred = sk->sk_peer_cred;
718 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
719 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
720 
721 	spin_unlock(&sk->sk_peer_lock);
722 	spin_unlock(&peersk->sk_peer_lock);
723 
724 	put_pid(old_pid);
725 	put_cred(old_cred);
726 }
727 
728 static int unix_listen(struct socket *sock, int backlog)
729 {
730 	int err;
731 	struct sock *sk = sock->sk;
732 	struct unix_sock *u = unix_sk(sk);
733 
734 	err = -EOPNOTSUPP;
735 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
736 		goto out;	/* Only stream/seqpacket sockets accept */
737 	err = -EINVAL;
738 	if (!u->addr)
739 		goto out;	/* No listens on an unbound socket */
740 	unix_state_lock(sk);
741 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
742 		goto out_unlock;
743 	if (backlog > sk->sk_max_ack_backlog)
744 		wake_up_interruptible_all(&u->peer_wait);
745 	sk->sk_max_ack_backlog	= backlog;
746 	sk->sk_state		= TCP_LISTEN;
747 	/* set credentials so connect can copy them */
748 	init_peercred(sk);
749 	err = 0;
750 
751 out_unlock:
752 	unix_state_unlock(sk);
753 out:
754 	return err;
755 }
756 
757 static int unix_release(struct socket *);
758 static int unix_bind(struct socket *, struct sockaddr *, int);
759 static int unix_stream_connect(struct socket *, struct sockaddr *,
760 			       int addr_len, int flags);
761 static int unix_socketpair(struct socket *, struct socket *);
762 static int unix_accept(struct socket *, struct socket *, int, bool);
763 static int unix_getname(struct socket *, struct sockaddr *, int);
764 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
765 static __poll_t unix_dgram_poll(struct file *, struct socket *,
766 				    poll_table *);
767 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
768 #ifdef CONFIG_COMPAT
769 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
770 #endif
771 static int unix_shutdown(struct socket *, int);
772 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
773 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
774 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
775 				       struct pipe_inode_info *, size_t size,
776 				       unsigned int flags);
777 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
778 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
779 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
780 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
781 static int unix_dgram_connect(struct socket *, struct sockaddr *,
782 			      int, int);
783 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
784 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
785 				  int);
786 
787 static int unix_set_peek_off(struct sock *sk, int val)
788 {
789 	struct unix_sock *u = unix_sk(sk);
790 
791 	if (mutex_lock_interruptible(&u->iolock))
792 		return -EINTR;
793 
794 	WRITE_ONCE(sk->sk_peek_off, val);
795 	mutex_unlock(&u->iolock);
796 
797 	return 0;
798 }
799 
800 #ifdef CONFIG_PROC_FS
801 static int unix_count_nr_fds(struct sock *sk)
802 {
803 	struct sk_buff *skb;
804 	struct unix_sock *u;
805 	int nr_fds = 0;
806 
807 	spin_lock(&sk->sk_receive_queue.lock);
808 	skb = skb_peek(&sk->sk_receive_queue);
809 	while (skb) {
810 		u = unix_sk(skb->sk);
811 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
812 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
813 	}
814 	spin_unlock(&sk->sk_receive_queue.lock);
815 
816 	return nr_fds;
817 }
818 
819 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
820 {
821 	struct sock *sk = sock->sk;
822 	unsigned char s_state;
823 	struct unix_sock *u;
824 	int nr_fds = 0;
825 
826 	if (sk) {
827 		s_state = READ_ONCE(sk->sk_state);
828 		u = unix_sk(sk);
829 
830 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
831 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
832 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
833 		 */
834 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
835 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
836 		else if (s_state == TCP_LISTEN)
837 			nr_fds = unix_count_nr_fds(sk);
838 
839 		seq_printf(m, "scm_fds: %u\n", nr_fds);
840 	}
841 }
842 #else
843 #define unix_show_fdinfo NULL
844 #endif
845 
846 static const struct proto_ops unix_stream_ops = {
847 	.family =	PF_UNIX,
848 	.owner =	THIS_MODULE,
849 	.release =	unix_release,
850 	.bind =		unix_bind,
851 	.connect =	unix_stream_connect,
852 	.socketpair =	unix_socketpair,
853 	.accept =	unix_accept,
854 	.getname =	unix_getname,
855 	.poll =		unix_poll,
856 	.ioctl =	unix_ioctl,
857 #ifdef CONFIG_COMPAT
858 	.compat_ioctl =	unix_compat_ioctl,
859 #endif
860 	.listen =	unix_listen,
861 	.shutdown =	unix_shutdown,
862 	.sendmsg =	unix_stream_sendmsg,
863 	.recvmsg =	unix_stream_recvmsg,
864 	.read_skb =	unix_stream_read_skb,
865 	.mmap =		sock_no_mmap,
866 	.splice_read =	unix_stream_splice_read,
867 	.set_peek_off =	unix_set_peek_off,
868 	.show_fdinfo =	unix_show_fdinfo,
869 };
870 
871 static const struct proto_ops unix_dgram_ops = {
872 	.family =	PF_UNIX,
873 	.owner =	THIS_MODULE,
874 	.release =	unix_release,
875 	.bind =		unix_bind,
876 	.connect =	unix_dgram_connect,
877 	.socketpair =	unix_socketpair,
878 	.accept =	sock_no_accept,
879 	.getname =	unix_getname,
880 	.poll =		unix_dgram_poll,
881 	.ioctl =	unix_ioctl,
882 #ifdef CONFIG_COMPAT
883 	.compat_ioctl =	unix_compat_ioctl,
884 #endif
885 	.listen =	sock_no_listen,
886 	.shutdown =	unix_shutdown,
887 	.sendmsg =	unix_dgram_sendmsg,
888 	.read_skb =	unix_read_skb,
889 	.recvmsg =	unix_dgram_recvmsg,
890 	.mmap =		sock_no_mmap,
891 	.set_peek_off =	unix_set_peek_off,
892 	.show_fdinfo =	unix_show_fdinfo,
893 };
894 
895 static const struct proto_ops unix_seqpacket_ops = {
896 	.family =	PF_UNIX,
897 	.owner =	THIS_MODULE,
898 	.release =	unix_release,
899 	.bind =		unix_bind,
900 	.connect =	unix_stream_connect,
901 	.socketpair =	unix_socketpair,
902 	.accept =	unix_accept,
903 	.getname =	unix_getname,
904 	.poll =		unix_dgram_poll,
905 	.ioctl =	unix_ioctl,
906 #ifdef CONFIG_COMPAT
907 	.compat_ioctl =	unix_compat_ioctl,
908 #endif
909 	.listen =	unix_listen,
910 	.shutdown =	unix_shutdown,
911 	.sendmsg =	unix_seqpacket_sendmsg,
912 	.recvmsg =	unix_seqpacket_recvmsg,
913 	.mmap =		sock_no_mmap,
914 	.set_peek_off =	unix_set_peek_off,
915 	.show_fdinfo =	unix_show_fdinfo,
916 };
917 
/* Intentionally empty ->close() hook: unix sockets have no work to do
 * here, the hook is provided only for sockmap.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
924 
/* Intentionally empty ->unhash() hook: unix sockets have no work to do
 * here, the hook is provided only for sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
931 
/* bpf_bypass_getsockopt hook: returns true only for SO_PEERPIDFD at level
 * SOL_SOCKET, false for every other level/option pair.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level != SOL_SOCKET)
		return false;

	return optname == SO_PEERPIDFD;
}
945 
946 struct proto unix_dgram_proto = {
947 	.name			= "UNIX",
948 	.owner			= THIS_MODULE,
949 	.obj_size		= sizeof(struct unix_sock),
950 	.close			= unix_close,
951 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
952 #ifdef CONFIG_BPF_SYSCALL
953 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
954 #endif
955 };
956 
957 struct proto unix_stream_proto = {
958 	.name			= "UNIX-STREAM",
959 	.owner			= THIS_MODULE,
960 	.obj_size		= sizeof(struct unix_sock),
961 	.close			= unix_close,
962 	.unhash			= unix_unhash,
963 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
964 #ifdef CONFIG_BPF_SYSCALL
965 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
966 #endif
967 };
968 
969 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
970 {
971 	struct unix_sock *u;
972 	struct sock *sk;
973 	int err;
974 
975 	atomic_long_inc(&unix_nr_socks);
976 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
977 		err = -ENFILE;
978 		goto err;
979 	}
980 
981 	if (type == SOCK_STREAM)
982 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
983 	else /*dgram and  seqpacket */
984 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
985 
986 	if (!sk) {
987 		err = -ENOMEM;
988 		goto err;
989 	}
990 
991 	sock_init_data(sock, sk);
992 
993 	sk->sk_hash		= unix_unbound_hash(sk);
994 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
995 	sk->sk_write_space	= unix_write_space;
996 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
997 	sk->sk_destruct		= unix_sock_destructor;
998 	u	  = unix_sk(sk);
999 	u->path.dentry = NULL;
1000 	u->path.mnt = NULL;
1001 	spin_lock_init(&u->lock);
1002 	atomic_long_set(&u->inflight, 0);
1003 	INIT_LIST_HEAD(&u->link);
1004 	mutex_init(&u->iolock); /* single task reading lock */
1005 	mutex_init(&u->bindlock); /* single task binding lock */
1006 	init_waitqueue_head(&u->peer_wait);
1007 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1008 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1009 	unix_insert_unbound_socket(net, sk);
1010 
1011 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1012 
1013 	return sk;
1014 
1015 err:
1016 	atomic_long_dec(&unix_nr_socks);
1017 	return ERR_PTR(err);
1018 }
1019 
1020 static int unix_create(struct net *net, struct socket *sock, int protocol,
1021 		       int kern)
1022 {
1023 	struct sock *sk;
1024 
1025 	if (protocol && protocol != PF_UNIX)
1026 		return -EPROTONOSUPPORT;
1027 
1028 	sock->state = SS_UNCONNECTED;
1029 
1030 	switch (sock->type) {
1031 	case SOCK_STREAM:
1032 		sock->ops = &unix_stream_ops;
1033 		break;
1034 		/*
1035 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1036 		 *	nothing uses it.
1037 		 */
1038 	case SOCK_RAW:
1039 		sock->type = SOCK_DGRAM;
1040 		fallthrough;
1041 	case SOCK_DGRAM:
1042 		sock->ops = &unix_dgram_ops;
1043 		break;
1044 	case SOCK_SEQPACKET:
1045 		sock->ops = &unix_seqpacket_ops;
1046 		break;
1047 	default:
1048 		return -ESOCKTNOSUPPORT;
1049 	}
1050 
1051 	sk = unix_create1(net, sock, kern, sock->type);
1052 	if (IS_ERR(sk))
1053 		return PTR_ERR(sk);
1054 
1055 	return 0;
1056 }
1057 
1058 static int unix_release(struct socket *sock)
1059 {
1060 	struct sock *sk = sock->sk;
1061 
1062 	if (!sk)
1063 		return 0;
1064 
1065 	sk->sk_prot->close(sk, 0);
1066 	unix_release_sock(sk, 0);
1067 	sock->sk = NULL;
1068 
1069 	return 0;
1070 }
1071 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path is resolved (following symlinks), write permission on it is
 * checked, and the inode must be a socket inode.  The access time of the
 * path is touched only when a socket of the requested @type was found.
 *
 * Returns the sock found by unix_find_socket_byinode(), or an ERR_PTR:
 * the kern_path()/path_permission() error, -ECONNREFUSED when the inode
 * is not a socket or no sock is attached to it, -EPROTOTYPE when the
 * sock's type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *backing;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto out_put_path;

	backing = d_backing_inode(path.dentry);
	if (!S_ISSOCK(backing->i_mode)) {
		err = -ECONNREFUSED;
		goto out_put_path;
	}

	sk = unix_find_socket_byinode(backing);
	if (!sk) {
		err = -ECONNREFUSED;
		goto out_put_path;
	}

	if (sk->sk_type != type) {
		/* Wrong kind of socket: give the reference back. */
		sock_put(sk);
		err = -EPROTOTYPE;
		goto out_put_path;
	}

	touch_atime(&path);
	path_put(&path);

	return sk;

out_put_path:
	path_put(&path);
	return ERR_PTR(err);
}
1115 
/*
 * Look up a socket by name via unix_abstract_hash() and
 * unix_find_socket_byname().
 *
 * Returns the sock found, or ERR_PTR(-ECONNREFUSED) when nothing is
 * bound to that name.  If the sock found has a dentry recorded in its
 * path, that path's access time is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *found;

	found = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!found)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(found)->path.dentry)
		touch_atime(&unix_sk(found)->path);

	return found;
}
1134 
1135 static struct sock *unix_find_other(struct net *net,
1136 				    struct sockaddr_un *sunaddr,
1137 				    int addr_len, int type)
1138 {
1139 	struct sock *sk;
1140 
1141 	if (sunaddr->sun_path[0])
1142 		sk = unix_find_bsd(sunaddr, addr_len, type);
1143 	else
1144 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1145 
1146 	return sk;
1147 }
1148 
1149 static int unix_autobind(struct sock *sk)
1150 {
1151 	unsigned int new_hash, old_hash = sk->sk_hash;
1152 	struct unix_sock *u = unix_sk(sk);
1153 	struct net *net = sock_net(sk);
1154 	struct unix_address *addr;
1155 	u32 lastnum, ordernum;
1156 	int err;
1157 
1158 	err = mutex_lock_interruptible(&u->bindlock);
1159 	if (err)
1160 		return err;
1161 
1162 	if (u->addr)
1163 		goto out;
1164 
1165 	err = -ENOMEM;
1166 	addr = kzalloc(sizeof(*addr) +
1167 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1168 	if (!addr)
1169 		goto out;
1170 
1171 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1172 	addr->name->sun_family = AF_UNIX;
1173 	refcount_set(&addr->refcnt, 1);
1174 
1175 	ordernum = get_random_u32();
1176 	lastnum = ordernum & 0xFFFFF;
1177 retry:
1178 	ordernum = (ordernum + 1) & 0xFFFFF;
1179 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1180 
1181 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1182 	unix_table_double_lock(net, old_hash, new_hash);
1183 
1184 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1185 		unix_table_double_unlock(net, old_hash, new_hash);
1186 
1187 		/* __unix_find_socket_byname() may take long time if many names
1188 		 * are already in use.
1189 		 */
1190 		cond_resched();
1191 
1192 		if (ordernum == lastnum) {
1193 			/* Give up if all names seems to be in use. */
1194 			err = -ENOSPC;
1195 			unix_release_addr(addr);
1196 			goto out;
1197 		}
1198 
1199 		goto retry;
1200 	}
1201 
1202 	__unix_set_addr_hash(net, sk, addr, new_hash);
1203 	unix_table_double_unlock(net, old_hash, new_hash);
1204 	err = 0;
1205 
1206 out:	mutex_unlock(&u->bindlock);
1207 	return err;
1208 }
1209 
1210 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1211 			 int addr_len)
1212 {
1213 	umode_t mode = S_IFSOCK |
1214 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1215 	unsigned int new_hash, old_hash = sk->sk_hash;
1216 	struct unix_sock *u = unix_sk(sk);
1217 	struct net *net = sock_net(sk);
1218 	struct mnt_idmap *idmap;
1219 	struct unix_address *addr;
1220 	struct dentry *dentry;
1221 	struct path parent;
1222 	int err;
1223 
1224 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1225 	addr = unix_create_addr(sunaddr, addr_len);
1226 	if (!addr)
1227 		return -ENOMEM;
1228 
1229 	/*
1230 	 * Get the parent directory, calculate the hash for last
1231 	 * component.
1232 	 */
1233 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1234 	if (IS_ERR(dentry)) {
1235 		err = PTR_ERR(dentry);
1236 		goto out;
1237 	}
1238 
1239 	/*
1240 	 * All right, let's create it.
1241 	 */
1242 	idmap = mnt_idmap(parent.mnt);
1243 	err = security_path_mknod(&parent, dentry, mode, 0);
1244 	if (!err)
1245 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1246 	if (err)
1247 		goto out_path;
1248 	err = mutex_lock_interruptible(&u->bindlock);
1249 	if (err)
1250 		goto out_unlink;
1251 	if (u->addr)
1252 		goto out_unlock;
1253 
1254 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1255 	unix_table_double_lock(net, old_hash, new_hash);
1256 	u->path.mnt = mntget(parent.mnt);
1257 	u->path.dentry = dget(dentry);
1258 	__unix_set_addr_hash(net, sk, addr, new_hash);
1259 	unix_table_double_unlock(net, old_hash, new_hash);
1260 	unix_insert_bsd_socket(sk);
1261 	mutex_unlock(&u->bindlock);
1262 	done_path_create(&parent, dentry);
1263 	return 0;
1264 
1265 out_unlock:
1266 	mutex_unlock(&u->bindlock);
1267 	err = -EINVAL;
1268 out_unlink:
1269 	/* failed after successful mknod?  unlink what we'd created... */
1270 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1271 out_path:
1272 	done_path_create(&parent, dentry);
1273 out:
1274 	unix_release_addr(addr);
1275 	return err == -EEXIST ? -EADDRINUSE : err;
1276 }
1277 
1278 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1279 			      int addr_len)
1280 {
1281 	unsigned int new_hash, old_hash = sk->sk_hash;
1282 	struct unix_sock *u = unix_sk(sk);
1283 	struct net *net = sock_net(sk);
1284 	struct unix_address *addr;
1285 	int err;
1286 
1287 	addr = unix_create_addr(sunaddr, addr_len);
1288 	if (!addr)
1289 		return -ENOMEM;
1290 
1291 	err = mutex_lock_interruptible(&u->bindlock);
1292 	if (err)
1293 		goto out;
1294 
1295 	if (u->addr) {
1296 		err = -EINVAL;
1297 		goto out_mutex;
1298 	}
1299 
1300 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1301 	unix_table_double_lock(net, old_hash, new_hash);
1302 
1303 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1304 		goto out_spin;
1305 
1306 	__unix_set_addr_hash(net, sk, addr, new_hash);
1307 	unix_table_double_unlock(net, old_hash, new_hash);
1308 	mutex_unlock(&u->bindlock);
1309 	return 0;
1310 
1311 out_spin:
1312 	unix_table_double_unlock(net, old_hash, new_hash);
1313 	err = -EADDRINUSE;
1314 out_mutex:
1315 	mutex_unlock(&u->bindlock);
1316 out:
1317 	unix_release_addr(addr);
1318 	return err;
1319 }
1320 
1321 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1322 {
1323 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1324 	struct sock *sk = sock->sk;
1325 	int err;
1326 
1327 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1328 	    sunaddr->sun_family == AF_UNIX)
1329 		return unix_autobind(sk);
1330 
1331 	err = unix_validate_addr(sunaddr, addr_len);
1332 	if (err)
1333 		return err;
1334 
1335 	if (sunaddr->sun_path[0])
1336 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1337 	else
1338 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1339 
1340 	return err;
1341 }
1342 
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * When @sk2 is NULL or identical to @sk1 only @sk1 is locked.  Otherwise
 * the sock at the lower address is locked first and the other one is
 * taken with the nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (!sk2 || unlikely(sk1 == sk2)) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1357 
/*
 * Counterpart of unix_state_double_lock(): release @sk1's state lock
 * and, unless @sk2 is NULL or the same sock, @sk2's as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
1367 
/*
 * unix_dgram_connect - set, replace or clear the peer of @sock.
 *
 * @sock:  socket being connected
 * @addr:  destination address; sa_family == AF_UNSPEC dissolves the
 *         current association instead of creating one
 * @alen:  length of @addr in bytes
 * @flags: not referenced by this function
 *
 * Returns 0 on success or a negative errno: -EINVAL when @alen is too
 * short to contain sa_family, -EPERM when unix_may_send() refuses the
 * peer, or whatever unix_validate_addr(), the BPF cgroup connect hook,
 * unix_autobind(), unix_find_other() or security_unix_may_send() report.
 *
 * On success the sock returned by unix_find_other() is stored in
 * unix_peer(sk) without a sock_put(); on the error paths it is released
 * with sock_put().  A replaced peer is released with sock_put().
 */
1368 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1369 			      int alen, int flags)
1370 {
1371 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1372 	struct sock *sk = sock->sk;
1373 	struct sock *other;
1374 	int err;
1375 
1376 	err = -EINVAL;
1377 	if (alen < offsetofend(struct sockaddr, sa_family))
1378 		goto out;
1379 
1380 	if (addr->sa_family != AF_UNSPEC) {
1381 		err = unix_validate_addr(sunaddr, alen);
1382 		if (err)
1383 			goto out;
1384 
1385 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1386 		if (err)
1387 			goto out;
1388 
		/* A credential-passing socket without an address is autobound first. */
1389 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1390 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1391 		    !unix_sk(sk)->addr) {
1392 			err = unix_autobind(sk);
1393 			if (err)
1394 				goto out;
1395 		}
1396 
1397 restart:
1398 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1399 		if (IS_ERR(other)) {
1400 			err = PTR_ERR(other);
1401 			goto out;
1402 		}
1403 
1404 		unix_state_double_lock(sk, other);
1405 
1406 		/* Apparently VFS overslept socket death. Retry. */
1407 		if (sock_flag(other, SOCK_DEAD)) {
1408 			unix_state_double_unlock(sk, other);
1409 			sock_put(other);
1410 			goto restart;
1411 		}
1412 
1413 		err = -EPERM;
1414 		if (!unix_may_send(sk, other))
1415 			goto out_unlock;
1416 
1417 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1418 		if (err)
1419 			goto out_unlock;
1420 
		/* Both ends are marked established while both state locks are held. */
1421 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1422 	} else {
1423 		/*
1424 		 *	1003.1g breaking connected state with AF_UNSPEC
1425 		 */
1426 		other = NULL;
		/* With a NULL second argument only sk's state lock is taken. */
1427 		unix_state_double_lock(sk, other);
1428 	}
1429 
1430 	/*
1431 	 * If it was connected, reconnect.
1432 	 */
1433 	if (unix_peer(sk)) {
1434 		struct sock *old_peer = unix_peer(sk);
1435 
1436 		unix_peer(sk) = other;
1437 		if (!other)
1438 			sk->sk_state = TCP_CLOSE;
1439 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1440 
1441 		unix_state_double_unlock(sk, other);
1442 
		/* Reconnecting to the same peer skips the disconnect handling. */
1443 		if (other != old_peer)
1444 			unix_dgram_disconnected(sk, old_peer);
1445 		sock_put(old_peer);
1446 	} else {
1447 		unix_peer(sk) = other;
1448 		unix_state_double_unlock(sk, other);
1449 	}
1450 
1451 	return 0;
1452 
1453 out_unlock:
1454 	unix_state_double_unlock(sk, other);
1455 	sock_put(other);
1456 out:
1457 	return err;
1458 }
1459 
1460 static long unix_wait_for_peer(struct sock *other, long timeo)
1461 	__releases(&unix_sk(other)->lock)
1462 {
1463 	struct unix_sock *u = unix_sk(other);
1464 	int sched;
1465 	DEFINE_WAIT(wait);
1466 
1467 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1468 
1469 	sched = !sock_flag(other, SOCK_DEAD) &&
1470 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1471 		unix_recvq_full_lockless(other);
1472 
1473 	unix_state_unlock(other);
1474 
1475 	if (sched)
1476 		timeo = schedule_timeout(timeo);
1477 
1478 	finish_wait(&u->peer_wait, &wait);
1479 	return timeo;
1480 }
1481 
/*
 * unix_stream_connect - connect @sock to the listening socket at @uaddr.
 *
 * Outline:
 *  1. Validate the address, run the BPF cgroup connect hook and autobind
 *     the socket if SOCK_PASSCRED/SOCK_PASSPIDFD is set and it has no
 *     address yet.
 *  2. Pre-allocate a second sock (newsk) and a one-byte skb obtained
 *     with sock_wmalloc() on newsk, before any state lock is taken.
 *  3. Look up the listener, lock its state and require TCP_LISTEN, no
 *     RCV_SHUTDOWN and room in its receive queue; if the queue is full,
 *     wait (bounded by the send timeout) and start over.
 *  4. Take sk's own state lock nested inside the listener's, re-check
 *     that sk's state did not change, and call the LSM hook.
 *  5. Link sk and newsk as peers, give newsk the listener's address and
 *     path, copy credentials, queue the skb on the listener's receive
 *     queue and signal the listener.
 *
 * Returns 0 on success or a negative errno (e.g. -ECONNREFUSED when the
 * target is not listening or has shut down receiving, -EAGAIN when the
 * queue is full and the timeout is zero, -EISCONN / -EINVAL depending on
 * sk's own state, -ENOMEM when the skb cannot be allocated).  On failure
 * the skb is freed, newsk is released and the listener reference, if
 * any, is dropped.
 */
1482 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1483 			       int addr_len, int flags)
1484 {
1485 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1486 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1487 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1488 	struct net *net = sock_net(sk);
1489 	struct sk_buff *skb = NULL;
1490 	long timeo;
1491 	int err;
1492 	int st;
1493 
1494 	err = unix_validate_addr(sunaddr, addr_len);
1495 	if (err)
1496 		goto out;
1497 
1498 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1499 	if (err)
1500 		goto out;
1501 
1502 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1503 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1504 		err = unix_autobind(sk);
1505 		if (err)
1506 			goto out;
1507 	}
1508 
1509 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1510 
1511 	/* First of all allocate resources.
1512 	   If we will make it after state is locked,
1513 	   we will have to recheck all again in any case.
1514 	 */
1515 
1516 	/* create new sock for complete connection */
1517 	newsk = unix_create1(net, NULL, 0, sock->type);
1518 	if (IS_ERR(newsk)) {
1519 		err = PTR_ERR(newsk);
		/* Reset so the common exit path does not try to release an ERR_PTR. */
1520 		newsk = NULL;
1521 		goto out;
1522 	}
1523 
1524 	err = -ENOMEM;
1525 
1526 	/* Allocate skb for sending to listening sock */
1527 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1528 	if (skb == NULL)
1529 		goto out;
1530 
1531 restart:
1532 	/*  Find listening sock. */
1533 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1534 	if (IS_ERR(other)) {
1535 		err = PTR_ERR(other);
1536 		other = NULL;
1537 		goto out;
1538 	}
1539 
1540 	/* Latch state of peer */
1541 	unix_state_lock(other);
1542 
1543 	/* Apparently VFS overslept socket death. Retry. */
1544 	if (sock_flag(other, SOCK_DEAD)) {
1545 		unix_state_unlock(other);
1546 		sock_put(other);
1547 		goto restart;
1548 	}
1549 
1550 	err = -ECONNREFUSED;
1551 	if (other->sk_state != TCP_LISTEN)
1552 		goto out_unlock;
1553 	if (other->sk_shutdown & RCV_SHUTDOWN)
1554 		goto out_unlock;
1555 
1556 	if (unix_recvq_full(other)) {
1557 		err = -EAGAIN;
1558 		if (!timeo)
1559 			goto out_unlock;
1560 
		/* unix_wait_for_peer() releases other's state lock itself. */
1561 		timeo = unix_wait_for_peer(other, timeo);
1562 
1563 		err = sock_intr_errno(timeo);
1564 		if (signal_pending(current))
1565 			goto out;
1566 		sock_put(other);
1567 		goto restart;
1568 	}
1569 
1570 	/* Latch our state.
1571 
1572 	   It is tricky place. We need to grab our state lock and cannot
1573 	   drop lock on peer. It is dangerous because deadlock is
1574 	   possible. Connect to self case and simultaneous
1575 	   attempt to connect are eliminated by checking socket
1576 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1577 	   check this before attempt to grab lock.
1578 
1579 	   Well, and we have to recheck the state after socket locked.
1580 	 */
1581 	st = sk->sk_state;
1582 
1583 	switch (st) {
1584 	case TCP_CLOSE:
1585 		/* This is ok... continue with connect */
1586 		break;
1587 	case TCP_ESTABLISHED:
1588 		/* Socket is already connected */
1589 		err = -EISCONN;
1590 		goto out_unlock;
1591 	default:
1592 		err = -EINVAL;
1593 		goto out_unlock;
1594 	}
1595 
1596 	unix_state_lock_nested(sk);
1597 
	/* State changed between the unlocked read and taking the lock: start over. */
1598 	if (sk->sk_state != st) {
1599 		unix_state_unlock(sk);
1600 		unix_state_unlock(other);
1601 		sock_put(other);
1602 		goto restart;
1603 	}
1604 
1605 	err = security_unix_stream_connect(sk, other, newsk);
1606 	if (err) {
1607 		unix_state_unlock(sk);
1608 		goto out_unlock;
1609 	}
1610 
1611 	/* The way is open! Fastly set all the necessary fields... */
1612 
1613 	sock_hold(sk);
1614 	unix_peer(newsk)	= sk;
1615 	newsk->sk_state		= TCP_ESTABLISHED;
1616 	newsk->sk_type		= sk->sk_type;
1617 	init_peercred(newsk);
1618 	newu = unix_sk(newsk);
1619 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1620 	otheru = unix_sk(other);
1621 
1622 	/* copy address information from listening to new sock
1623 	 *
1624 	 * The contents of *(otheru->addr) and otheru->path
1625 	 * are seen fully set up here, since we have found
1626 	 * otheru in hash under its lock.  Insertion into the
1627 	 * hash chain we'd found it in had been done in an
1628 	 * earlier critical area protected by the chain's lock,
1629 	 * the same one where we'd set *(otheru->addr) contents,
1630 	 * as well as otheru->path and otheru->addr itself.
1631 	 *
1632 	 * Using smp_store_release() here to set newu->addr
1633 	 * is enough to make those stores, as well as stores
1634 	 * to newu->path visible to anyone who gets newu->addr
1635 	 * by smp_load_acquire().  IOW, the same warranties
1636 	 * as for unix_sock instances bound in unix_bind() or
1637 	 * in unix_autobind().
1638 	 */
1639 	if (otheru->path.dentry) {
1640 		path_get(&otheru->path);
1641 		newu->path = otheru->path;
1642 	}
1643 	refcount_inc(&otheru->addr->refcnt);
1644 	smp_store_release(&newu->addr, otheru->addr);
1645 
1646 	/* Set credentials */
1647 	copy_peercred(sk, other);
1648 
1649 	sock->state	= SS_CONNECTED;
1650 	sk->sk_state	= TCP_ESTABLISHED;
1651 	sock_hold(newsk);
1652 
1653 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1654 	unix_peer(sk)	= newsk;
1655 
1656 	unix_state_unlock(sk);
1657 
1658 	/* take ten and send info to listening sock */
1659 	spin_lock(&other->sk_receive_queue.lock);
1660 	__skb_queue_tail(&other->sk_receive_queue, skb);
1661 	spin_unlock(&other->sk_receive_queue.lock);
1662 	unix_state_unlock(other);
1663 	other->sk_data_ready(other);
1664 	sock_put(other);
1665 	return 0;
1666 
1667 out_unlock:
1668 	if (other)
1669 		unix_state_unlock(other);
1670 
1671 out:
1672 	kfree_skb(skb);
1673 	if (newsk)
1674 		unix_release_sock(newsk, 0);
1675 	if (other)
1676 		sock_put(other);
1677 	return err;
1678 }
1679 
1680 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1681 {
1682 	struct sock *ska = socka->sk, *skb = sockb->sk;
1683 
1684 	/* Join our sockets back to back */
1685 	sock_hold(ska);
1686 	sock_hold(skb);
1687 	unix_peer(ska) = skb;
1688 	unix_peer(skb) = ska;
1689 	init_peercred(ska);
1690 	init_peercred(skb);
1691 
1692 	ska->sk_state = TCP_ESTABLISHED;
1693 	skb->sk_state = TCP_ESTABLISHED;
1694 	socka->state  = SS_CONNECTED;
1695 	sockb->state  = SS_CONNECTED;
1696 	return 0;
1697 }
1698 
1699 static void unix_sock_inherit_flags(const struct socket *old,
1700 				    struct socket *new)
1701 {
1702 	if (test_bit(SOCK_PASSCRED, &old->flags))
1703 		set_bit(SOCK_PASSCRED, &new->flags);
1704 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1705 		set_bit(SOCK_PASSPIDFD, &new->flags);
1706 	if (test_bit(SOCK_PASSSEC, &old->flags))
1707 		set_bit(SOCK_PASSSEC, &new->flags);
1708 }
1709 
1710 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1711 		       bool kern)
1712 {
1713 	struct sock *sk = sock->sk;
1714 	struct sock *tsk;
1715 	struct sk_buff *skb;
1716 	int err;
1717 
1718 	err = -EOPNOTSUPP;
1719 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1720 		goto out;
1721 
1722 	err = -EINVAL;
1723 	if (sk->sk_state != TCP_LISTEN)
1724 		goto out;
1725 
1726 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1727 	 * so that no locks are necessary.
1728 	 */
1729 
1730 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1731 				&err);
1732 	if (!skb) {
1733 		/* This means receive shutdown. */
1734 		if (err == 0)
1735 			err = -EINVAL;
1736 		goto out;
1737 	}
1738 
1739 	tsk = skb->sk;
1740 	skb_free_datagram(sk, skb);
1741 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1742 
1743 	/* attach accepted sock to socket */
1744 	unix_state_lock(tsk);
1745 	newsock->state = SS_CONNECTED;
1746 	unix_sock_inherit_flags(sock, newsock);
1747 	sock_graft(tsk, newsock);
1748 	unix_state_unlock(tsk);
1749 	return 0;
1750 
1751 out:
1752 	return err;
1753 }
1754 
1755 
1756 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1757 {
1758 	struct sock *sk = sock->sk;
1759 	struct unix_address *addr;
1760 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1761 	int err = 0;
1762 
1763 	if (peer) {
1764 		sk = unix_peer_get(sk);
1765 
1766 		err = -ENOTCONN;
1767 		if (!sk)
1768 			goto out;
1769 		err = 0;
1770 	} else {
1771 		sock_hold(sk);
1772 	}
1773 
1774 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1775 	if (!addr) {
1776 		sunaddr->sun_family = AF_UNIX;
1777 		sunaddr->sun_path[0] = 0;
1778 		err = offsetof(struct sockaddr_un, sun_path);
1779 	} else {
1780 		err = addr->len;
1781 		memcpy(sunaddr, addr->name, addr->len);
1782 
1783 		if (peer)
1784 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1785 					       CGROUP_UNIX_GETPEERNAME);
1786 		else
1787 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1788 					       CGROUP_UNIX_GETSOCKNAME);
1789 	}
1790 	sock_put(sk);
1791 out:
1792 	return err;
1793 }
1794 
1795 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1796 {
1797 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1798 
1799 	/*
1800 	 * Garbage collection of unix sockets starts by selecting a set of
1801 	 * candidate sockets which have reference only from being in flight
1802 	 * (total_refs == inflight_refs).  This condition is checked once during
1803 	 * the candidate collection phase, and candidates are marked as such, so
1804 	 * that non-candidates can later be ignored.  While inflight_refs is
1805 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1806 	 * is an instantaneous decision.
1807 	 *
1808 	 * Once a candidate, however, the socket must not be reinstalled into a
1809 	 * file descriptor while the garbage collection is in progress.
1810 	 *
1811 	 * If the above conditions are met, then the directed graph of
1812 	 * candidates (*) does not change while unix_gc_lock is held.
1813 	 *
1814 	 * Any operations that changes the file count through file descriptors
1815 	 * (dup, close, sendmsg) does not change the graph since candidates are
1816 	 * not installed in fds.
1817 	 *
1818 	 * Dequeing a candidate via recvmsg would install it into an fd, but
1819 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1820 	 * serialized with garbage collection.
1821 	 *
1822 	 * MSG_PEEK is special in that it does not change the inflight count,
1823 	 * yet does install the socket into an fd.  The following lock/unlock
1824 	 * pair is to ensure serialization with garbage collection.  It must be
1825 	 * done between incrementing the file count and installing the file into
1826 	 * an fd.
1827 	 *
1828 	 * If garbage collection starts after the barrier provided by the
1829 	 * lock/unlock, then it will see the elevated refcount and not mark this
1830 	 * as a candidate.  If a garbage collection is already in progress
1831 	 * before the file count was incremented, then the lock/unlock pair will
1832 	 * ensure that garbage collection is finished before progressing to
1833 	 * installing the fd.
1834 	 *
1835 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1836 	 * which is on the queue of listening socket A.
1837 	 */
1838 	spin_lock(&unix_gc_lock);
1839 	spin_unlock(&unix_gc_lock);
1840 }
1841 
1842 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1843 {
1844 	int err = 0;
1845 
1846 	UNIXCB(skb).pid  = get_pid(scm->pid);
1847 	UNIXCB(skb).uid = scm->creds.uid;
1848 	UNIXCB(skb).gid = scm->creds.gid;
1849 	UNIXCB(skb).fp = NULL;
1850 	unix_get_secdata(scm, skb);
1851 	if (scm->fp && send_fds)
1852 		err = unix_attach_fds(scm, skb);
1853 
1854 	skb->destructor = unix_destruct_scm;
1855 	return err;
1856 }
1857 
1858 static bool unix_passcred_enabled(const struct socket *sock,
1859 				  const struct sock *other)
1860 {
1861 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1862 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1863 	       !other->sk_socket ||
1864 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1865 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1866 }
1867 
1868 /*
1869  * Some apps rely on write() giving SCM_CREDENTIALS
1870  * We include credentials if source or destination socket
1871  * asserted SOCK_PASSCRED.
1872  */
1873 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1874 			    const struct sock *other)
1875 {
1876 	if (UNIXCB(skb).pid)
1877 		return;
1878 	if (unix_passcred_enabled(sock, other)) {
1879 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1880 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1881 	}
1882 }
1883 
1884 static bool unix_skb_scm_eq(struct sk_buff *skb,
1885 			    struct scm_cookie *scm)
1886 {
1887 	return UNIXCB(skb).pid == scm->pid &&
1888 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1889 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1890 	       unix_secdata_eq(scm, skb);
1891 }
1892 
1893 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1894 {
1895 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1896 	struct unix_sock *u = unix_sk(sk);
1897 
1898 	if (unlikely(fp && fp->count))
1899 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1900 }
1901 
1902 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1903 {
1904 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1905 	struct unix_sock *u = unix_sk(sk);
1906 
1907 	if (unlikely(fp && fp->count))
1908 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1909 }
1910 
1911 /*
1912  *	Send AF_UNIX data.
1913  */
1914 
/*
 * unix_dgram_sendmsg - queue one message of @len bytes on the receiver.
 *
 * The receiver is the socket named by msg->msg_name when msg_namelen is
 * non-zero, otherwise the connected peer (-ENOTCONN if there is none).
 *
 * Outline:
 *  1. Wait for the garbage collector, parse ancillary data into @scm,
 *     reject MSG_OOB with -EOPNOTSUPP.
 *  2. Validate the destination (including the BPF cgroup sendmsg hook)
 *     or grab the peer; autobind if credentials are to be passed and
 *     the socket has no address.
 *  3. Reject messages larger than sk_sndbuf - 32 with -EMSGSIZE, then
 *     allocate the skb (part of it paged when @len exceeds
 *     SKB_MAX_ALLOC) and copy the payload in.
 *  4. Resolve and lock the receiver, handling a receiver that died in
 *     the meantime, receive shutdown (-EPIPE), permission checks and a
 *     full receive queue (block up to the send timeout, or -EAGAIN).
 *  5. Queue the skb and signal the receiver.
 *
 * sk_locked records whether sk's own state lock is held in addition to
 * the receiver's, so that the exit paths release the right locks.
 *
 * Returns @len on success (also when the receiver's socket filter drops
 * the packet) or a negative errno.
 */
1915 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1916 			      size_t len)
1917 {
1918 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1919 	struct sock *sk = sock->sk, *other = NULL;
1920 	struct unix_sock *u = unix_sk(sk);
1921 	struct scm_cookie scm;
1922 	struct sk_buff *skb;
1923 	int data_len = 0;
1924 	int sk_locked;
1925 	long timeo;
1926 	int err;
1927 
1928 	wait_for_unix_gc();
1929 	err = scm_send(sock, msg, &scm, false);
1930 	if (err < 0)
1931 		return err;
1932 
1933 	err = -EOPNOTSUPP;
1934 	if (msg->msg_flags&MSG_OOB)
1935 		goto out;
1936 
1937 	if (msg->msg_namelen) {
1938 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1939 		if (err)
1940 			goto out;
1941 
1942 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1943 							    msg->msg_name,
1944 							    &msg->msg_namelen,
1945 							    NULL);
1946 		if (err)
1947 			goto out;
1948 	} else {
		/* No destination given: fall back to the connected peer. */
1949 		sunaddr = NULL;
1950 		err = -ENOTCONN;
1951 		other = unix_peer_get(sk);
1952 		if (!other)
1953 			goto out;
1954 	}
1955 
1956 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1957 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1958 		err = unix_autobind(sk);
1959 		if (err)
1960 			goto out;
1961 	}
1962 
1963 	err = -EMSGSIZE;
1964 	if (len > sk->sk_sndbuf - 32)
1965 		goto out;
1966 
	/* Anything beyond SKB_MAX_ALLOC goes into page fragments. */
1967 	if (len > SKB_MAX_ALLOC) {
1968 		data_len = min_t(size_t,
1969 				 len - SKB_MAX_ALLOC,
1970 				 MAX_SKB_FRAGS * PAGE_SIZE);
1971 		data_len = PAGE_ALIGN(data_len);
1972 
1973 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1974 	}
1975 
1976 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1977 				   msg->msg_flags & MSG_DONTWAIT, &err,
1978 				   PAGE_ALLOC_COSTLY_ORDER);
1979 	if (skb == NULL)
1980 		goto out;
1981 
1982 	err = unix_scm_to_skb(&scm, skb, true);
1983 	if (err < 0)
1984 		goto out_free;
1985 
1986 	skb_put(skb, len - data_len);
1987 	skb->data_len = data_len;
1988 	skb->len = len;
1989 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1990 	if (err)
1991 		goto out_free;
1992 
1993 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1994 
1995 restart:
1996 	if (!other) {
1997 		err = -ECONNRESET;
1998 		if (sunaddr == NULL)
1999 			goto out_free;
2000 
2001 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2002 					sk->sk_type);
2003 		if (IS_ERR(other)) {
2004 			err = PTR_ERR(other);
2005 			other = NULL;
2006 			goto out_free;
2007 		}
2008 	}
2009 
2010 	if (sk_filter(other, skb) < 0) {
2011 		/* Toss the packet but do not return any error to the sender */
2012 		err = len;
2013 		goto out_free;
2014 	}
2015 
2016 	sk_locked = 0;
2017 	unix_state_lock(other);
2018 restart_locked:
2019 	err = -EPERM;
2020 	if (!unix_may_send(sk, other))
2021 		goto out_unlock;
2022 
2023 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2024 		/*
2025 		 *	Check with 1003.1g - what should
2026 		 *	datagram error
2027 		 */
2028 		unix_state_unlock(other);
2029 		sock_put(other);
2030 
2031 		if (!sk_locked)
2032 			unix_state_lock(sk);
2033 
2034 		err = 0;
2035 		if (sk->sk_type == SOCK_SEQPACKET) {
2036 			/* We are here only when racing with unix_release_sock()
2037 			 * is clearing @other. Never change state to TCP_CLOSE
2038 			 * unlike SOCK_DGRAM wants.
2039 			 */
2040 			unix_state_unlock(sk);
2041 			err = -EPIPE;
2042 		} else if (unix_peer(sk) == other) {
2043 			unix_peer(sk) = NULL;
2044 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2045 
2046 			sk->sk_state = TCP_CLOSE;
2047 			unix_state_unlock(sk);
2048 
2049 			unix_dgram_disconnected(sk, other);
			/* Second put: the first one above released the reference held by this call. */
2050 			sock_put(other);
2051 			err = -ECONNREFUSED;
2052 		} else {
2053 			unix_state_unlock(sk);
2054 		}
2055 
2056 		other = NULL;
2057 		if (err)
2058 			goto out_free;
2059 		goto restart;
2060 	}
2061 
2062 	err = -EPIPE;
2063 	if (other->sk_shutdown & RCV_SHUTDOWN)
2064 		goto out_unlock;
2065 
2066 	if (sk->sk_type != SOCK_SEQPACKET) {
2067 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2068 		if (err)
2069 			goto out_unlock;
2070 	}
2071 
2072 	/* other == sk && unix_peer(other) != sk if
2073 	 * - unix_peer(sk) == NULL, destination address bound to sk
2074 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2075 	 */
2076 	if (other != sk &&
2077 	    unlikely(unix_peer(other) != sk &&
2078 	    unix_recvq_full_lockless(other))) {
2079 		if (timeo) {
			/* unix_wait_for_peer() releases other's state lock itself. */
2080 			timeo = unix_wait_for_peer(other, timeo);
2081 
2082 			err = sock_intr_errno(timeo);
2083 			if (signal_pending(current))
2084 				goto out_free;
2085 
2086 			goto restart;
2087 		}
2088 
		/* Non-blocking: take both state locks in the canonical order. */
2089 		if (!sk_locked) {
2090 			unix_state_unlock(other);
2091 			unix_state_double_lock(sk, other);
2092 		}
2093 
2094 		if (unix_peer(sk) != other ||
2095 		    unix_dgram_peer_wake_me(sk, other)) {
2096 			err = -EAGAIN;
2097 			sk_locked = 1;
2098 			goto out_unlock;
2099 		}
2100 
		/* The receiver lock was dropped above, so redo the checks under both locks. */
2101 		if (!sk_locked) {
2102 			sk_locked = 1;
2103 			goto restart_locked;
2104 		}
2105 	}
2106 
2107 	if (unlikely(sk_locked))
2108 		unix_state_unlock(sk);
2109 
2110 	if (sock_flag(other, SOCK_RCVTSTAMP))
2111 		__net_timestamp(skb);
2112 	maybe_add_creds(skb, sock, other);
2113 	scm_stat_add(other, skb);
2114 	skb_queue_tail(&other->sk_receive_queue, skb);
2115 	unix_state_unlock(other);
2116 	other->sk_data_ready(other);
2117 	sock_put(other);
2118 	scm_destroy(&scm);
2119 	return len;
2120 
2121 out_unlock:
2122 	if (sk_locked)
2123 		unix_state_unlock(sk);
2124 	unix_state_unlock(other);
2125 out_free:
2126 	kfree_skb(skb);
2127 out:
2128 	if (other)
2129 		sock_put(other);
2130 	scm_destroy(&scm);
2131 	return err;
2132 }
2133 
/* NOTE(review): the value is PAGE_SIZE shifted left by get_order(32768);
 * presumably that evaluates to 32768 on systems whose pages are no larger
 * than that and to PAGE_SIZE otherwise - confirm against get_order().
 */
2134 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2135  * bytes, and a minimum of a full page.
2136  */
2137 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2138 
2139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2140 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2141 		     struct scm_cookie *scm, bool fds_sent)
2142 {
2143 	struct unix_sock *ousk = unix_sk(other);
2144 	struct sk_buff *skb;
2145 	int err = 0;
2146 
2147 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2148 
2149 	if (!skb)
2150 		return err;
2151 
2152 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2153 	if (err < 0) {
2154 		kfree_skb(skb);
2155 		return err;
2156 	}
2157 	skb_put(skb, 1);
2158 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2159 
2160 	if (err) {
2161 		kfree_skb(skb);
2162 		return err;
2163 	}
2164 
2165 	unix_state_lock(other);
2166 
2167 	if (sock_flag(other, SOCK_DEAD) ||
2168 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2169 		unix_state_unlock(other);
2170 		kfree_skb(skb);
2171 		return -EPIPE;
2172 	}
2173 
2174 	maybe_add_creds(skb, sock, other);
2175 	skb_get(skb);
2176 
2177 	if (ousk->oob_skb)
2178 		consume_skb(ousk->oob_skb);
2179 
2180 	WRITE_ONCE(ousk->oob_skb, skb);
2181 
2182 	scm_stat_add(other, skb);
2183 	skb_queue_tail(&other->sk_receive_queue, skb);
2184 	sk_send_sigurg(other);
2185 	unix_state_unlock(other);
2186 	other->sk_data_ready(other);
2187 
2188 	return err;
2189 }
2190 #endif
2191 
/*
 * unix_stream_sendmsg - sendmsg() for connected stream sockets
 * @sock: sending socket
 * @msg:  message header; ->msg_iter carries the payload, ->msg_flags the
 *        MSG_* modifiers
 * @len:  number of payload bytes requested
 *
 * The payload is cut into skbs which are appended one at a time to the
 * peer's receive queue.  With MSG_OOB (and CONFIG_AF_UNIX_OOB enabled) the
 * last byte is held back and queued through queue_oob().
 *
 * Returns the number of bytes queued if any were, otherwise a negative
 * errno.  -EPIPE with nothing sent also raises SIGPIPE unless MSG_NOSIGNAL
 * is set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	/* MSG_OOB: reserve the final byte for queue_oob(); an empty OOB send,
	 * or any OOB send without CONFIG_AF_UNIX_OOB, fails with -EOPNOTSUPP.
	 */
	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	/* An explicit destination address is never accepted here. */
	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Empty skb; pages are attached by skb_splice_from_iter(). */
			skb = sock_alloc_send_pskb(sk, 0, 0,
						   msg->msg_flags & MSG_DONTWAIT,
						   &err, 0);
		} else {
			/* Keep two messages in the pipe so it schedules better */
			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

			/* allow fallback to order-0 allocations */
			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

			/* Bytes beyond the linear head go into page frags. */
			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
						   msg->msg_flags & MSG_DONTWAIT, &err,
						   get_order(UNIX_SKB_FRAGS_SZ));
		}
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
						   sk->sk_allocation);
			if (err < 0) {
				kfree_skb(skb);
				goto out_err;
			}
			/* The splice may take fewer bytes than asked for. */
			size = err;
			refcount_add(size, &sk->sk_wmem_alloc);
		} else {
			skb_put(skb, size - data_len);
			skb->data_len = data_len;
			skb->len = size;
			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
			if (err) {
				kfree_skb(skb);
				goto out_err;
			}
		}

		unix_state_lock(other);

		/* Peer gone or no longer receiving: drop this skb, report EPIPE. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		/* Queue the byte that was held back above. */
		err = queue_oob(sock, msg, other, &scm, fds_sent);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	/* A partial send wins over the error that stopped it. */
	return sent ? : err;
}
2323 
2324 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2325 				  size_t len)
2326 {
2327 	int err;
2328 	struct sock *sk = sock->sk;
2329 
2330 	err = sock_error(sk);
2331 	if (err)
2332 		return err;
2333 
2334 	if (sk->sk_state != TCP_ESTABLISHED)
2335 		return -ENOTCONN;
2336 
2337 	if (msg->msg_namelen)
2338 		msg->msg_namelen = 0;
2339 
2340 	return unix_dgram_sendmsg(sock, msg, len);
2341 }
2342 
2343 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2344 				  size_t size, int flags)
2345 {
2346 	struct sock *sk = sock->sk;
2347 
2348 	if (sk->sk_state != TCP_ESTABLISHED)
2349 		return -ENOTCONN;
2350 
2351 	return unix_dgram_recvmsg(sock, msg, size, flags);
2352 }
2353 
2354 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2355 {
2356 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2357 
2358 	if (addr) {
2359 		msg->msg_namelen = addr->len;
2360 		memcpy(msg->msg_name, addr->name, addr->len);
2361 	}
2362 }
2363 
/*
 * __unix_dgram_recvmsg - receive one datagram from @sk
 * @sk:    receiving socket
 * @msg:   destination message header
 * @size:  capacity of the caller's buffer in bytes
 * @flags: MSG_* flags (MSG_OOB is rejected with -EOPNOTSUPP)
 *
 * Returns the number of bytes copied, or the untruncated remaining length
 * of the datagram when MSG_TRUNC is set in @flags.  Returns 0 for a
 * SOCK_SEQPACKET socket whose receive side is shut down and has nothing
 * queued, otherwise a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	/* Try to dequeue under iolock; on -EAGAIN sleep without the lock and
	 * retry.  The loop is left with iolock held only when an skb was
	 * obtained.
	 */
	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* Wake tasks sleeping on our peer_wait queue for write space. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name) {
		unix_copy_addr(msg, skb->sk);

		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
						      msg->msg_name,
						      &msg->msg_namelen);
	}

	/* Clamp to what is left of the datagram; flag truncation otherwise. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* With MSG_TRUNC the caller gets the real length, not the copied one. */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2475 
2476 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2477 			      int flags)
2478 {
2479 	struct sock *sk = sock->sk;
2480 
2481 #ifdef CONFIG_BPF_SYSCALL
2482 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2483 
2484 	if (prot != &unix_dgram_proto)
2485 		return prot->recvmsg(sk, msg, size, flags, NULL);
2486 #endif
2487 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2488 }
2489 
2490 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2491 {
2492 	struct unix_sock *u = unix_sk(sk);
2493 	struct sk_buff *skb;
2494 	int err;
2495 
2496 	mutex_lock(&u->iolock);
2497 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2498 	mutex_unlock(&u->iolock);
2499 	if (!skb)
2500 		return err;
2501 
2502 	return recv_actor(sk, skb);
2503 }
2504 
2505 /*
2506  *	Sleep until more data has arrived. But check for races..
2507  */
2508 static long unix_stream_data_wait(struct sock *sk, long timeo,
2509 				  struct sk_buff *last, unsigned int last_len,
2510 				  bool freezable)
2511 {
2512 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2513 	struct sk_buff *tail;
2514 	DEFINE_WAIT(wait);
2515 
2516 	unix_state_lock(sk);
2517 
2518 	for (;;) {
2519 		prepare_to_wait(sk_sleep(sk), &wait, state);
2520 
2521 		tail = skb_peek_tail(&sk->sk_receive_queue);
2522 		if (tail != last ||
2523 		    (tail && tail->len != last_len) ||
2524 		    sk->sk_err ||
2525 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2526 		    signal_pending(current) ||
2527 		    !timeo)
2528 			break;
2529 
2530 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2531 		unix_state_unlock(sk);
2532 		timeo = schedule_timeout(timeo);
2533 		unix_state_lock(sk);
2534 
2535 		if (sock_flag(sk, SOCK_DEAD))
2536 			break;
2537 
2538 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2539 	}
2540 
2541 	finish_wait(sk_sleep(sk), &wait);
2542 	unix_state_unlock(sk);
2543 	return timeo;
2544 }
2545 
2546 static unsigned int unix_skb_len(const struct sk_buff *skb)
2547 {
2548 	return skb->len - UNIXCB(skb).consumed;
2549 }
2550 
/* Parameters of one stream read, shared by the recvmsg and splice paths. */
struct unix_stream_read_state {
	/* Copies data out of an skb; called as (skb, skip, chunk, state).
	 * A negative return is treated as failure, anything else as the
	 * number of bytes handled.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* destination for recvmsg; left unset by the splice path */
	struct pipe_inode_info *pipe;	/* destination for splice */
	size_t size;			/* maximum number of bytes to read */
	int flags;			/* MSG_* flags */
	unsigned int splice_flags;	/* forwarded to skb_splice_bits() */
};
2561 
2562 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2563 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2564 {
2565 	struct socket *sock = state->socket;
2566 	struct sock *sk = sock->sk;
2567 	struct unix_sock *u = unix_sk(sk);
2568 	int chunk = 1;
2569 	struct sk_buff *oob_skb;
2570 
2571 	mutex_lock(&u->iolock);
2572 	unix_state_lock(sk);
2573 
2574 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2575 		unix_state_unlock(sk);
2576 		mutex_unlock(&u->iolock);
2577 		return -EINVAL;
2578 	}
2579 
2580 	oob_skb = u->oob_skb;
2581 
2582 	if (!(state->flags & MSG_PEEK))
2583 		WRITE_ONCE(u->oob_skb, NULL);
2584 	else
2585 		skb_get(oob_skb);
2586 	unix_state_unlock(sk);
2587 
2588 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2589 
2590 	if (!(state->flags & MSG_PEEK))
2591 		UNIXCB(oob_skb).consumed += 1;
2592 
2593 	consume_skb(oob_skb);
2594 
2595 	mutex_unlock(&u->iolock);
2596 
2597 	if (chunk < 0)
2598 		return -EFAULT;
2599 
2600 	state->msg->msg_flags |= MSG_OOB;
2601 	return 1;
2602 }
2603 
2604 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2605 				  int flags, int copied)
2606 {
2607 	struct unix_sock *u = unix_sk(sk);
2608 
2609 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2610 		skb_unlink(skb, &sk->sk_receive_queue);
2611 		consume_skb(skb);
2612 		skb = NULL;
2613 	} else {
2614 		if (skb == u->oob_skb) {
2615 			if (copied) {
2616 				skb = NULL;
2617 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2618 				if (!(flags & MSG_PEEK)) {
2619 					WRITE_ONCE(u->oob_skb, NULL);
2620 					consume_skb(skb);
2621 				}
2622 			} else if (!(flags & MSG_PEEK)) {
2623 				skb_unlink(skb, &sk->sk_receive_queue);
2624 				consume_skb(skb);
2625 				skb = skb_peek(&sk->sk_receive_queue);
2626 			}
2627 		}
2628 	}
2629 	return skb;
2630 }
2631 #endif
2632 
2633 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2634 {
2635 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2636 		return -ENOTCONN;
2637 
2638 	return unix_read_skb(sk, recv_actor);
2639 }
2640 
/*
 * unix_stream_read_generic - common read loop for stream sockets
 * @state:     read parameters (socket, destination, size, flags, actor)
 * @freezable: passed to unix_stream_data_wait() when the read has to sleep
 *
 * Walks the receive queue under u->iolock, handing chunks of each skb to
 * state->recv_actor until @state->size bytes were read, the queue is
 * drained past the low-water target, or a boundary (different sender
 * credentials, attached fds, OOB mark) is reached.
 *
 * Returns the number of bytes read if any were, otherwise 0 or a negative
 * errno.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	/* MSG_OOB reads are served entirely by unix_stream_recv_urg(). */
	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		/* Snapshot of the queue head, used to detect new data while
		 * sleeping in unix_stream_data_wait().
		 */
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				/* Stop in front of the OOB mark once data was read. */
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Sleep without iolock so other readers can proceed. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Step over skbs already covered by the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		/* NOTE(review): sunaddr is declared inside this block, so
		 * clearing it below does not stop the copy from running again
		 * on a later iteration -- confirm whether that is intended.
		 */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);

			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
							      state->msg->msg_name,
							      &state->msg->msg_namelen);

			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold the skb across the actor call. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			/* Data left in this skb: the request is satisfied. */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Do not read past an skb that carried fds. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			/* Peeking: move on to the next skb without dequeuing. */
			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	/* Deliver collected control data, or release it if there is no msg. */
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2860 
2861 static int unix_stream_read_actor(struct sk_buff *skb,
2862 				  int skip, int chunk,
2863 				  struct unix_stream_read_state *state)
2864 {
2865 	int ret;
2866 
2867 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2868 				    state->msg, chunk);
2869 	return ret ?: chunk;
2870 }
2871 
2872 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2873 			  size_t size, int flags)
2874 {
2875 	struct unix_stream_read_state state = {
2876 		.recv_actor = unix_stream_read_actor,
2877 		.socket = sk->sk_socket,
2878 		.msg = msg,
2879 		.size = size,
2880 		.flags = flags
2881 	};
2882 
2883 	return unix_stream_read_generic(&state, true);
2884 }
2885 
2886 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2887 			       size_t size, int flags)
2888 {
2889 	struct unix_stream_read_state state = {
2890 		.recv_actor = unix_stream_read_actor,
2891 		.socket = sock,
2892 		.msg = msg,
2893 		.size = size,
2894 		.flags = flags
2895 	};
2896 
2897 #ifdef CONFIG_BPF_SYSCALL
2898 	struct sock *sk = sock->sk;
2899 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2900 
2901 	if (prot != &unix_stream_proto)
2902 		return prot->recvmsg(sk, msg, size, flags, NULL);
2903 #endif
2904 	return unix_stream_read_generic(&state, true);
2905 }
2906 
2907 static int unix_stream_splice_actor(struct sk_buff *skb,
2908 				    int skip, int chunk,
2909 				    struct unix_stream_read_state *state)
2910 {
2911 	return skb_splice_bits(skb, state->socket->sk,
2912 			       UNIXCB(skb).consumed + skip,
2913 			       state->pipe, chunk, state->splice_flags);
2914 }
2915 
2916 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2917 				       struct pipe_inode_info *pipe,
2918 				       size_t size, unsigned int flags)
2919 {
2920 	struct unix_stream_read_state state = {
2921 		.recv_actor = unix_stream_splice_actor,
2922 		.socket = sock,
2923 		.pipe = pipe,
2924 		.size = size,
2925 		.splice_flags = flags,
2926 	};
2927 
2928 	if (unlikely(*ppos))
2929 		return -ESPIPE;
2930 
2931 	if (sock->file->f_flags & O_NONBLOCK ||
2932 	    flags & SPLICE_F_NONBLOCK)
2933 		state.flags = MSG_DONTWAIT;
2934 
2935 	return unix_stream_read_generic(&state, false);
2936 }
2937 
2938 static int unix_shutdown(struct socket *sock, int mode)
2939 {
2940 	struct sock *sk = sock->sk;
2941 	struct sock *other;
2942 
2943 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2944 		return -EINVAL;
2945 	/* This maps:
2946 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2947 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2948 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2949 	 */
2950 	++mode;
2951 
2952 	unix_state_lock(sk);
2953 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2954 	other = unix_peer(sk);
2955 	if (other)
2956 		sock_hold(other);
2957 	unix_state_unlock(sk);
2958 	sk->sk_state_change(sk);
2959 
2960 	if (other &&
2961 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2962 
2963 		int peer_mode = 0;
2964 		const struct proto *prot = READ_ONCE(other->sk_prot);
2965 
2966 		if (prot->unhash)
2967 			prot->unhash(other);
2968 		if (mode&RCV_SHUTDOWN)
2969 			peer_mode |= SEND_SHUTDOWN;
2970 		if (mode&SEND_SHUTDOWN)
2971 			peer_mode |= RCV_SHUTDOWN;
2972 		unix_state_lock(other);
2973 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2974 		unix_state_unlock(other);
2975 		other->sk_state_change(other);
2976 		if (peer_mode == SHUTDOWN_MASK)
2977 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2978 		else if (peer_mode & RCV_SHUTDOWN)
2979 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2980 	}
2981 	if (other)
2982 		sock_put(other);
2983 
2984 	return 0;
2985 }
2986 
2987 long unix_inq_len(struct sock *sk)
2988 {
2989 	struct sk_buff *skb;
2990 	long amount = 0;
2991 
2992 	if (sk->sk_state == TCP_LISTEN)
2993 		return -EINVAL;
2994 
2995 	spin_lock(&sk->sk_receive_queue.lock);
2996 	if (sk->sk_type == SOCK_STREAM ||
2997 	    sk->sk_type == SOCK_SEQPACKET) {
2998 		skb_queue_walk(&sk->sk_receive_queue, skb)
2999 			amount += unix_skb_len(skb);
3000 	} else {
3001 		skb = skb_peek(&sk->sk_receive_queue);
3002 		if (skb)
3003 			amount = skb->len;
3004 	}
3005 	spin_unlock(&sk->sk_receive_queue.lock);
3006 
3007 	return amount;
3008 }
3009 EXPORT_SYMBOL_GPL(unix_inq_len);
3010 
/* Send-side memory currently charged to @sk, per sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3016 
3017 static int unix_open_file(struct sock *sk)
3018 {
3019 	struct path path;
3020 	struct file *f;
3021 	int fd;
3022 
3023 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3024 		return -EPERM;
3025 
3026 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3027 		return -ENOENT;
3028 
3029 	path = unix_sk(sk)->path;
3030 	if (!path.dentry)
3031 		return -ENOENT;
3032 
3033 	path_get(&path);
3034 
3035 	fd = get_unused_fd_flags(O_CLOEXEC);
3036 	if (fd < 0)
3037 		goto out;
3038 
3039 	f = dentry_open(&path, O_PATH, current_cred());
3040 	if (IS_ERR(f)) {
3041 		put_unused_fd(fd);
3042 		fd = PTR_ERR(f);
3043 		goto out;
3044 	}
3045 
3046 	fd_install(fd, f);
3047 out:
3048 	path_put(&path);
3049 
3050 	return fd;
3051 }
3052 
3053 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3054 {
3055 	struct sock *sk = sock->sk;
3056 	long amount = 0;
3057 	int err;
3058 
3059 	switch (cmd) {
3060 	case SIOCOUTQ:
3061 		amount = unix_outq_len(sk);
3062 		err = put_user(amount, (int __user *)arg);
3063 		break;
3064 	case SIOCINQ:
3065 		amount = unix_inq_len(sk);
3066 		if (amount < 0)
3067 			err = amount;
3068 		else
3069 			err = put_user(amount, (int __user *)arg);
3070 		break;
3071 	case SIOCUNIXFILE:
3072 		err = unix_open_file(sk);
3073 		break;
3074 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3075 	case SIOCATMARK:
3076 		{
3077 			struct sk_buff *skb;
3078 			int answ = 0;
3079 
3080 			skb = skb_peek(&sk->sk_receive_queue);
3081 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3082 				answ = 1;
3083 			err = put_user(answ, (int __user *)arg);
3084 		}
3085 		break;
3086 #endif
3087 	default:
3088 		err = -ENOIOCTLCMD;
3089 		break;
3090 	}
3091 	return err;
3092 }
3093 
3094 #ifdef CONFIG_COMPAT
/* Compat ioctl(): convert the user pointer with compat_ptr() and forward. */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3099 #endif
3100 
3101 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3102 {
3103 	struct sock *sk = sock->sk;
3104 	__poll_t mask;
3105 	u8 shutdown;
3106 
3107 	sock_poll_wait(file, sock, wait);
3108 	mask = 0;
3109 	shutdown = READ_ONCE(sk->sk_shutdown);
3110 
3111 	/* exceptional events? */
3112 	if (READ_ONCE(sk->sk_err))
3113 		mask |= EPOLLERR;
3114 	if (shutdown == SHUTDOWN_MASK)
3115 		mask |= EPOLLHUP;
3116 	if (shutdown & RCV_SHUTDOWN)
3117 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3118 
3119 	/* readable? */
3120 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3121 		mask |= EPOLLIN | EPOLLRDNORM;
3122 	if (sk_is_readable(sk))
3123 		mask |= EPOLLIN | EPOLLRDNORM;
3124 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3125 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3126 		mask |= EPOLLPRI;
3127 #endif
3128 
3129 	/* Connection-based need to check for termination and startup */
3130 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3131 	    sk->sk_state == TCP_CLOSE)
3132 		mask |= EPOLLHUP;
3133 
3134 	/*
3135 	 * we set writable also when the other side has shut down the
3136 	 * connection. This prevents stuck sockets.
3137 	 */
3138 	if (unix_writable(sk))
3139 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3140 
3141 	return mask;
3142 }
3143 
3144 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3145 				    poll_table *wait)
3146 {
3147 	struct sock *sk = sock->sk, *other;
3148 	unsigned int writable;
3149 	__poll_t mask;
3150 	u8 shutdown;
3151 
3152 	sock_poll_wait(file, sock, wait);
3153 	mask = 0;
3154 	shutdown = READ_ONCE(sk->sk_shutdown);
3155 
3156 	/* exceptional events? */
3157 	if (READ_ONCE(sk->sk_err) ||
3158 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3159 		mask |= EPOLLERR |
3160 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3161 
3162 	if (shutdown & RCV_SHUTDOWN)
3163 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3164 	if (shutdown == SHUTDOWN_MASK)
3165 		mask |= EPOLLHUP;
3166 
3167 	/* readable? */
3168 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3169 		mask |= EPOLLIN | EPOLLRDNORM;
3170 	if (sk_is_readable(sk))
3171 		mask |= EPOLLIN | EPOLLRDNORM;
3172 
3173 	/* Connection-based need to check for termination and startup */
3174 	if (sk->sk_type == SOCK_SEQPACKET) {
3175 		if (sk->sk_state == TCP_CLOSE)
3176 			mask |= EPOLLHUP;
3177 		/* connection hasn't started yet? */
3178 		if (sk->sk_state == TCP_SYN_SENT)
3179 			return mask;
3180 	}
3181 
3182 	/* No write status requested, avoid expensive OUT tests. */
3183 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3184 		return mask;
3185 
3186 	writable = unix_writable(sk);
3187 	if (writable) {
3188 		unix_state_lock(sk);
3189 
3190 		other = unix_peer(sk);
3191 		if (other && unix_peer(other) != sk &&
3192 		    unix_recvq_full_lockless(other) &&
3193 		    unix_dgram_peer_wake_me(sk, other))
3194 			writable = 0;
3195 
3196 		unix_state_unlock(sk);
3197 	}
3198 
3199 	if (writable)
3200 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3201 	else
3202 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3203 
3204 	return mask;
3205 }
3206 
3207 #ifdef CONFIG_PROC_FS
3208 
/* A seq_file position packs two values into one unsigned long: the hash
 * bucket index in the high bits and, in the low BUCKET_SPACE bits, the
 * 1-based offset of a socket within that bucket.  Position 0 is used for
 * the header line (SEQ_START_TOKEN).
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Bucket index stored in position x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket stored in position x. */
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* Build a position from bucket b and offset o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3214 
3215 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3216 {
3217 	unsigned long offset = get_offset(*pos);
3218 	unsigned long bucket = get_bucket(*pos);
3219 	unsigned long count = 0;
3220 	struct sock *sk;
3221 
3222 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3223 	     sk; sk = sk_next(sk)) {
3224 		if (++count == offset)
3225 			break;
3226 	}
3227 
3228 	return sk;
3229 }
3230 
3231 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3232 {
3233 	unsigned long bucket = get_bucket(*pos);
3234 	struct net *net = seq_file_net(seq);
3235 	struct sock *sk;
3236 
3237 	while (bucket < UNIX_HASH_SIZE) {
3238 		spin_lock(&net->unx.table.locks[bucket]);
3239 
3240 		sk = unix_from_bucket(seq, pos);
3241 		if (sk)
3242 			return sk;
3243 
3244 		spin_unlock(&net->unx.table.locks[bucket]);
3245 
3246 		*pos = set_bucket_offset(++bucket, 1);
3247 	}
3248 
3249 	return NULL;
3250 }
3251 
3252 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3253 				  loff_t *pos)
3254 {
3255 	unsigned long bucket = get_bucket(*pos);
3256 
3257 	sk = sk_next(sk);
3258 	if (sk)
3259 		return sk;
3260 
3261 
3262 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3263 
3264 	*pos = set_bucket_offset(++bucket, 1);
3265 
3266 	return unix_get_first(seq, pos);
3267 }
3268 
3269 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3270 {
3271 	if (!*pos)
3272 		return SEQ_START_TOKEN;
3273 
3274 	return unix_get_first(seq, pos);
3275 }
3276 
3277 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3278 {
3279 	++*pos;
3280 
3281 	if (v == SEQ_START_TOKEN)
3282 		return unix_get_first(seq, pos);
3283 
3284 	return unix_get_next(seq, v, pos);
3285 }
3286 
3287 static void unix_seq_stop(struct seq_file *seq, void *v)
3288 {
3289 	struct sock *sk = v;
3290 
3291 	if (sk)
3292 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3293 }
3294 
3295 static int unix_seq_show(struct seq_file *seq, void *v)
3296 {
3297 
3298 	if (v == SEQ_START_TOKEN)
3299 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3300 			 "Inode Path\n");
3301 	else {
3302 		struct sock *s = v;
3303 		struct unix_sock *u = unix_sk(s);
3304 		unix_state_lock(s);
3305 
3306 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3307 			s,
3308 			refcount_read(&s->sk_refcnt),
3309 			0,
3310 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3311 			s->sk_type,
3312 			s->sk_socket ?
3313 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3314 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3315 			sock_i_ino(s));
3316 
3317 		if (u->addr) {	// under a hash table lock here
3318 			int i, len;
3319 			seq_putc(seq, ' ');
3320 
3321 			i = 0;
3322 			len = u->addr->len -
3323 				offsetof(struct sockaddr_un, sun_path);
3324 			if (u->addr->name->sun_path[0]) {
3325 				len--;
3326 			} else {
3327 				seq_putc(seq, '@');
3328 				i++;
3329 			}
3330 			for ( ; i < len; i++)
3331 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3332 					 '@');
3333 		}
3334 		unix_state_unlock(s);
3335 		seq_putc(seq, '\n');
3336 	}
3337 
3338 	return 0;
3339 }
3340 
3341 static const struct seq_operations unix_seq_ops = {
3342 	.start  = unix_seq_start,
3343 	.next   = unix_seq_next,
3344 	.stop   = unix_seq_stop,
3345 	.show   = unix_seq_show,
3346 };
3347 
3348 #ifdef CONFIG_BPF_SYSCALL
3349 struct bpf_unix_iter_state {
3350 	struct seq_net_private p;
3351 	unsigned int cur_sk;
3352 	unsigned int end_sk;
3353 	unsigned int max_sk;
3354 	struct sock **batch;
3355 	bool st_bucket_done;
3356 };
3357 
/*
 * Context handed to a BPF program attached to the "unix" iterator target
 * (filled in by unix_prog_seq_show()).
 */
struct bpf_iter__unix {
	/* Iterator metadata (seq_file, sequence number). */
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	/* Socket being visited; NULL on the final call made from ->stop(). */
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	/* Owner uid of the socket, 0 on the final call. */
	uid_t uid __aligned(8);
};
3363 
3364 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3365 			      struct unix_sock *unix_sk, uid_t uid)
3366 {
3367 	struct bpf_iter__unix ctx;
3368 
3369 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3370 	ctx.meta = meta;
3371 	ctx.unix_sk = unix_sk;
3372 	ctx.uid = uid;
3373 	return bpf_iter_run_prog(prog, &ctx);
3374 }
3375 
3376 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3377 
3378 {
3379 	struct bpf_unix_iter_state *iter = seq->private;
3380 	unsigned int expected = 1;
3381 	struct sock *sk;
3382 
3383 	sock_hold(start_sk);
3384 	iter->batch[iter->end_sk++] = start_sk;
3385 
3386 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3387 		if (iter->end_sk < iter->max_sk) {
3388 			sock_hold(sk);
3389 			iter->batch[iter->end_sk++] = sk;
3390 		}
3391 
3392 		expected++;
3393 	}
3394 
3395 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3396 
3397 	return expected;
3398 }
3399 
3400 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3401 {
3402 	while (iter->cur_sk < iter->end_sk)
3403 		sock_put(iter->batch[iter->cur_sk++]);
3404 }
3405 
3406 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3407 				       unsigned int new_batch_sz)
3408 {
3409 	struct sock **new_batch;
3410 
3411 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3412 			     GFP_USER | __GFP_NOWARN);
3413 	if (!new_batch)
3414 		return -ENOMEM;
3415 
3416 	bpf_iter_unix_put_batch(iter);
3417 	kvfree(iter->batch);
3418 	iter->batch = new_batch;
3419 	iter->max_sk = new_batch_sz;
3420 
3421 	return 0;
3422 }
3423 
3424 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3425 					loff_t *pos)
3426 {
3427 	struct bpf_unix_iter_state *iter = seq->private;
3428 	unsigned int expected;
3429 	bool resized = false;
3430 	struct sock *sk;
3431 
3432 	if (iter->st_bucket_done)
3433 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3434 
3435 again:
3436 	/* Get a new batch */
3437 	iter->cur_sk = 0;
3438 	iter->end_sk = 0;
3439 
3440 	sk = unix_get_first(seq, pos);
3441 	if (!sk)
3442 		return NULL; /* Done */
3443 
3444 	expected = bpf_iter_unix_hold_batch(seq, sk);
3445 
3446 	if (iter->end_sk == expected) {
3447 		iter->st_bucket_done = true;
3448 		return sk;
3449 	}
3450 
3451 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3452 		resized = true;
3453 		goto again;
3454 	}
3455 
3456 	return sk;
3457 }
3458 
3459 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3460 {
3461 	if (!*pos)
3462 		return SEQ_START_TOKEN;
3463 
3464 	/* bpf iter does not support lseek, so it always
3465 	 * continue from where it was stop()-ped.
3466 	 */
3467 	return bpf_iter_unix_batch(seq, pos);
3468 }
3469 
3470 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3471 {
3472 	struct bpf_unix_iter_state *iter = seq->private;
3473 	struct sock *sk;
3474 
3475 	/* Whenever seq_next() is called, the iter->cur_sk is
3476 	 * done with seq_show(), so advance to the next sk in
3477 	 * the batch.
3478 	 */
3479 	if (iter->cur_sk < iter->end_sk)
3480 		sock_put(iter->batch[iter->cur_sk++]);
3481 
3482 	++*pos;
3483 
3484 	if (iter->cur_sk < iter->end_sk)
3485 		sk = iter->batch[iter->cur_sk];
3486 	else
3487 		sk = bpf_iter_unix_batch(seq, pos);
3488 
3489 	return sk;
3490 }
3491 
3492 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3493 {
3494 	struct bpf_iter_meta meta;
3495 	struct bpf_prog *prog;
3496 	struct sock *sk = v;
3497 	uid_t uid;
3498 	bool slow;
3499 	int ret;
3500 
3501 	if (v == SEQ_START_TOKEN)
3502 		return 0;
3503 
3504 	slow = lock_sock_fast(sk);
3505 
3506 	if (unlikely(sk_unhashed(sk))) {
3507 		ret = SEQ_SKIP;
3508 		goto unlock;
3509 	}
3510 
3511 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3512 	meta.seq = seq;
3513 	prog = bpf_iter_get_info(&meta, false);
3514 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3515 unlock:
3516 	unlock_sock_fast(sk, slow);
3517 	return ret;
3518 }
3519 
3520 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3521 {
3522 	struct bpf_unix_iter_state *iter = seq->private;
3523 	struct bpf_iter_meta meta;
3524 	struct bpf_prog *prog;
3525 
3526 	if (!v) {
3527 		meta.seq = seq;
3528 		prog = bpf_iter_get_info(&meta, true);
3529 		if (prog)
3530 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3531 	}
3532 
3533 	if (iter->cur_sk < iter->end_sk)
3534 		bpf_iter_unix_put_batch(iter);
3535 }
3536 
3537 static const struct seq_operations bpf_iter_unix_seq_ops = {
3538 	.start	= bpf_iter_unix_seq_start,
3539 	.next	= bpf_iter_unix_seq_next,
3540 	.stop	= bpf_iter_unix_seq_stop,
3541 	.show	= bpf_iter_unix_seq_show,
3542 };
3543 #endif
3544 #endif
3545 
3546 static const struct net_proto_family unix_family_ops = {
3547 	.family = PF_UNIX,
3548 	.create = unix_create,
3549 	.owner	= THIS_MODULE,
3550 };
3551 
3552 
3553 static int __net_init unix_net_init(struct net *net)
3554 {
3555 	int i;
3556 
3557 	net->unx.sysctl_max_dgram_qlen = 10;
3558 	if (unix_sysctl_register(net))
3559 		goto out;
3560 
3561 #ifdef CONFIG_PROC_FS
3562 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3563 			     sizeof(struct seq_net_private)))
3564 		goto err_sysctl;
3565 #endif
3566 
3567 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3568 					      sizeof(spinlock_t), GFP_KERNEL);
3569 	if (!net->unx.table.locks)
3570 		goto err_proc;
3571 
3572 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3573 						sizeof(struct hlist_head),
3574 						GFP_KERNEL);
3575 	if (!net->unx.table.buckets)
3576 		goto free_locks;
3577 
3578 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3579 		spin_lock_init(&net->unx.table.locks[i]);
3580 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3581 	}
3582 
3583 	return 0;
3584 
3585 free_locks:
3586 	kvfree(net->unx.table.locks);
3587 err_proc:
3588 #ifdef CONFIG_PROC_FS
3589 	remove_proc_entry("unix", net->proc_net);
3590 err_sysctl:
3591 #endif
3592 	unix_sysctl_unregister(net);
3593 out:
3594 	return -ENOMEM;
3595 }
3596 
3597 static void __net_exit unix_net_exit(struct net *net)
3598 {
3599 	kvfree(net->unx.table.buckets);
3600 	kvfree(net->unx.table.locks);
3601 	unix_sysctl_unregister(net);
3602 	remove_proc_entry("unix", net->proc_net);
3603 }
3604 
3605 static struct pernet_operations unix_net_ops = {
3606 	.init = unix_net_init,
3607 	.exit = unix_net_exit,
3608 };
3609 
3610 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * Entry point of the "unix" BPF iterator target.  The parameter list
 * mirrors the fields of struct bpf_iter__unix.
 * NOTE(review): exact expansion of DEFINE_BPF_ITER_FUNC is not visible
 * here - confirm against its definition.
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)
3613 
/* Number of batch entries allocated when a unix iterator is first set up. */
#define INIT_BATCH_SZ 16
3615 
3616 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3617 {
3618 	struct bpf_unix_iter_state *iter = priv_data;
3619 	int err;
3620 
3621 	err = bpf_iter_init_seq_net(priv_data, aux);
3622 	if (err)
3623 		return err;
3624 
3625 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3626 	if (err) {
3627 		bpf_iter_fini_seq_net(priv_data);
3628 		return err;
3629 	}
3630 
3631 	return 0;
3632 }
3633 
3634 static void bpf_iter_fini_unix(void *priv_data)
3635 {
3636 	struct bpf_unix_iter_state *iter = priv_data;
3637 
3638 	bpf_iter_fini_seq_net(priv_data);
3639 	kvfree(iter->batch);
3640 }
3641 
3642 static const struct bpf_iter_seq_info unix_seq_info = {
3643 	.seq_ops		= &bpf_iter_unix_seq_ops,
3644 	.init_seq_private	= bpf_iter_init_unix,
3645 	.fini_seq_private	= bpf_iter_fini_unix,
3646 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3647 };
3648 
3649 static const struct bpf_func_proto *
3650 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3651 			     const struct bpf_prog *prog)
3652 {
3653 	switch (func_id) {
3654 	case BPF_FUNC_setsockopt:
3655 		return &bpf_sk_setsockopt_proto;
3656 	case BPF_FUNC_getsockopt:
3657 		return &bpf_sk_getsockopt_proto;
3658 	default:
3659 		return NULL;
3660 	}
3661 }
3662 
3663 static struct bpf_iter_reg unix_reg_info = {
3664 	.target			= "unix",
3665 	.ctx_arg_info_size	= 1,
3666 	.ctx_arg_info		= {
3667 		{ offsetof(struct bpf_iter__unix, unix_sk),
3668 		  PTR_TO_BTF_ID_OR_NULL },
3669 	},
3670 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3671 	.seq_info		= &unix_seq_info,
3672 };
3673 
3674 static void __init bpf_iter_register(void)
3675 {
3676 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3677 	if (bpf_iter_reg_target(&unix_reg_info))
3678 		pr_warn("Warning: could not register bpf iterator unix\n");
3679 }
3680 #endif
3681 
3682 static int __init af_unix_init(void)
3683 {
3684 	int i, rc = -1;
3685 
3686 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3687 
3688 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3689 		spin_lock_init(&bsd_socket_locks[i]);
3690 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3691 	}
3692 
3693 	rc = proto_register(&unix_dgram_proto, 1);
3694 	if (rc != 0) {
3695 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3696 		goto out;
3697 	}
3698 
3699 	rc = proto_register(&unix_stream_proto, 1);
3700 	if (rc != 0) {
3701 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3702 		proto_unregister(&unix_dgram_proto);
3703 		goto out;
3704 	}
3705 
3706 	sock_register(&unix_family_ops);
3707 	register_pernet_subsys(&unix_net_ops);
3708 	unix_bpf_build_proto();
3709 
3710 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3711 	bpf_iter_register();
3712 #endif
3713 
3714 out:
3715 	return rc;
3716 }
3717 
3718 /* Later than subsys_initcall() because we depend on stuff initialised there */
3719 fs_initcall(af_unix_init);
3720