xref: /linux/net/unix/af_unix.c (revision 031fba65fc202abf1f193e321be7a2c274fd88ba)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 #include "scm.h"
122 
123 static atomic_long_t unix_nr_socks;
124 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
125 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
126 
127 /* SMP locking strategy:
128  *    hash table is protected with spinlock.
129  *    each socket state is protected by separate spinlock.
130  */
131 
132 static unsigned int unix_unbound_hash(struct sock *sk)
133 {
134 	unsigned long hash = (unsigned long)sk;
135 
136 	hash ^= hash >> 16;
137 	hash ^= hash >> 8;
138 	hash ^= sk->sk_type;
139 
140 	return hash & UNIX_HASH_MOD;
141 }
142 
143 static unsigned int unix_bsd_hash(struct inode *i)
144 {
145 	return i->i_ino & UNIX_HASH_MOD;
146 }
147 
148 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
149 				       int addr_len, int type)
150 {
151 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
152 	unsigned int hash;
153 
154 	hash = (__force unsigned int)csum_fold(csum);
155 	hash ^= hash >> 8;
156 	hash ^= type;
157 
158 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
159 }
160 
161 static void unix_table_double_lock(struct net *net,
162 				   unsigned int hash1, unsigned int hash2)
163 {
164 	if (hash1 == hash2) {
165 		spin_lock(&net->unx.table.locks[hash1]);
166 		return;
167 	}
168 
169 	if (hash1 > hash2)
170 		swap(hash1, hash2);
171 
172 	spin_lock(&net->unx.table.locks[hash1]);
173 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
174 }
175 
176 static void unix_table_double_unlock(struct net *net,
177 				     unsigned int hash1, unsigned int hash2)
178 {
179 	if (hash1 == hash2) {
180 		spin_unlock(&net->unx.table.locks[hash1]);
181 		return;
182 	}
183 
184 	spin_unlock(&net->unx.table.locks[hash1]);
185 	spin_unlock(&net->unx.table.locks[hash2]);
186 }
187 
188 #ifdef CONFIG_SECURITY_NETWORK
189 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
190 {
191 	UNIXCB(skb).secid = scm->secid;
192 }
193 
194 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
195 {
196 	scm->secid = UNIXCB(skb).secid;
197 }
198 
199 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
200 {
201 	return (scm->secid == UNIXCB(skb).secid);
202 }
203 #else
204 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
205 { }
206 
207 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
208 { }
209 
210 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
211 {
212 	return true;
213 }
214 #endif /* CONFIG_SECURITY_NETWORK */
215 
/* Peer pointer of @sk; expands to an lvalue, so it is also used to assign. */
#define unix_peer(sk) (unix_sk(sk)->peer)
217 
/* Non-zero when the peer recorded on @osk is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
222 
/* Non-zero when @sk may send to @osk: either @osk has no peer at all, or
 * its peer is @sk.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
227 
228 static inline int unix_recvq_full(const struct sock *sk)
229 {
230 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
231 }
232 
233 static inline int unix_recvq_full_lockless(const struct sock *sk)
234 {
235 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
236 		READ_ONCE(sk->sk_max_ack_backlog);
237 }
238 
239 struct sock *unix_peer_get(struct sock *s)
240 {
241 	struct sock *peer;
242 
243 	unix_state_lock(s);
244 	peer = unix_peer(s);
245 	if (peer)
246 		sock_hold(peer);
247 	unix_state_unlock(s);
248 	return peer;
249 }
250 EXPORT_SYMBOL_GPL(unix_peer_get);
251 
252 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
253 					     int addr_len)
254 {
255 	struct unix_address *addr;
256 
257 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
258 	if (!addr)
259 		return NULL;
260 
261 	refcount_set(&addr->refcnt, 1);
262 	addr->len = addr_len;
263 	memcpy(addr->name, sunaddr, addr_len);
264 
265 	return addr;
266 }
267 
268 static inline void unix_release_addr(struct unix_address *addr)
269 {
270 	if (refcount_dec_and_test(&addr->refcnt))
271 		kfree(addr);
272 }
273 
/*
 *	Check unix socket name:
 *		- it must not be zero length, i.e. @addr_len has to reach past
 *		  the start of sun_path, and it must fit in a sockaddr_un.
 *		- the family must be AF_UNIX.
 *
 *	Naming rules (not checked here):
 *		- a name starting with a non-zero byte is a NUL-terminated
 *		  path (FS object).
 *		- a name starting with a zero byte is an abstract name.
 *
 *	Returns 0 when the address is acceptable, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
		return -EINVAL;

	if (addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
292 
293 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
294 {
295 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
296 	short offset = offsetof(struct sockaddr_storage, __data);
297 
298 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
299 
300 	/* This may look like an off by one error but it is a bit more
301 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
302 	 * sun_path[108] doesn't as such exist.  However in kernel space
303 	 * we are guaranteed that it is a valid memory location in our
304 	 * kernel address buffer because syscall functions always pass
305 	 * a pointer of struct sockaddr_storage which has a bigger buffer
306 	 * than 108.  Also, we must terminate sun_path for strlen() in
307 	 * getname_kernel().
308 	 */
309 	addr->__data[addr_len - offset] = 0;
310 
311 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
312 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
313 	 * know the actual buffer.
314 	 */
315 	return strlen(addr->__data) + offset + 1;
316 }
317 
/* Unlink @sk from its hash chain.  No lock is taken here; see
 * unix_remove_socket() for the locked wrapper.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
322 
323 static void __unix_insert_socket(struct net *net, struct sock *sk)
324 {
325 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
326 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
327 }
328 
329 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
330 				 struct unix_address *addr, unsigned int hash)
331 {
332 	__unix_remove_socket(sk);
333 	smp_store_release(&unix_sk(sk)->addr, addr);
334 
335 	sk->sk_hash = hash;
336 	__unix_insert_socket(net, sk);
337 }
338 
339 static void unix_remove_socket(struct net *net, struct sock *sk)
340 {
341 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
342 	__unix_remove_socket(sk);
343 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
344 }
345 
346 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
347 {
348 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
349 	__unix_insert_socket(net, sk);
350 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
351 }
352 
353 static void unix_insert_bsd_socket(struct sock *sk)
354 {
355 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
356 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
357 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
358 }
359 
360 static void unix_remove_bsd_socket(struct sock *sk)
361 {
362 	if (!hlist_unhashed(&sk->sk_bind_node)) {
363 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
364 		__sk_del_bind_node(sk);
365 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
366 
367 		sk_node_init(&sk->sk_bind_node);
368 	}
369 }
370 
371 static struct sock *__unix_find_socket_byname(struct net *net,
372 					      struct sockaddr_un *sunname,
373 					      int len, unsigned int hash)
374 {
375 	struct sock *s;
376 
377 	sk_for_each(s, &net->unx.table.buckets[hash]) {
378 		struct unix_sock *u = unix_sk(s);
379 
380 		if (u->addr->len == len &&
381 		    !memcmp(u->addr->name, sunname, len))
382 			return s;
383 	}
384 	return NULL;
385 }
386 
387 static inline struct sock *unix_find_socket_byname(struct net *net,
388 						   struct sockaddr_un *sunname,
389 						   int len, unsigned int hash)
390 {
391 	struct sock *s;
392 
393 	spin_lock(&net->unx.table.locks[hash]);
394 	s = __unix_find_socket_byname(net, sunname, len, hash);
395 	if (s)
396 		sock_hold(s);
397 	spin_unlock(&net->unx.table.locks[hash]);
398 	return s;
399 }
400 
401 static struct sock *unix_find_socket_byinode(struct inode *i)
402 {
403 	unsigned int hash = unix_bsd_hash(i);
404 	struct sock *s;
405 
406 	spin_lock(&bsd_socket_locks[hash]);
407 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
408 		struct dentry *dentry = unix_sk(s)->path.dentry;
409 
410 		if (dentry && d_backing_inode(dentry) == i) {
411 			sock_hold(s);
412 			spin_unlock(&bsd_socket_locks[hash]);
413 			return s;
414 		}
415 	}
416 	spin_unlock(&bsd_socket_locks[hash]);
417 	return NULL;
418 }
419 
420 /* Support code for asymmetrically connected dgram sockets
421  *
422  * If a datagram socket is connected to a socket not itself connected
423  * to the first socket (eg, /dev/log), clients may only enqueue more
424  * messages if the present receive queue of the server socket is not
425  * "too large". This means there's a second writeability condition
426  * poll and sendmsg need to test. The dgram recv code will do a wake
427  * up on the peer_wait wait queue of a socket upon reception of a
428  * datagram which needs to be propagated to sleeping would-be writers
429  * since these might not have sent anything so far. This can't be
430  * accomplished via poll_wait because the lifetime of the server
431  * socket might be less than that of its clients if these break their
432  * association with it or if the server socket is closed while clients
433  * are still connected to it and there's no way to inform "a polling
434  * implementation" that it should let go of a certain wait queue
435  *
436  * In order to propagate a wake up, a wait_queue_entry_t of the client
437  * socket is enqueued on the peer_wait queue of the server socket
438  * whose wake function does a wake_up on the ordinary client socket
439  * wait queue. This connection is established whenever a write (or
440  * poll for write) hit the flow control condition and broken when the
441  * association to the server socket is dissolved or after a wake up
442  * was relayed.
443  */
444 
445 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
446 				      void *key)
447 {
448 	struct unix_sock *u;
449 	wait_queue_head_t *u_sleep;
450 
451 	u = container_of(q, struct unix_sock, peer_wake);
452 
453 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
454 			    q);
455 	u->peer_wake.private = NULL;
456 
457 	/* relaying can only happen while the wq still exists */
458 	u_sleep = sk_sleep(&u->sk);
459 	if (u_sleep)
460 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
461 
462 	return 0;
463 }
464 
465 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
466 {
467 	struct unix_sock *u, *u_other;
468 	int rc;
469 
470 	u = unix_sk(sk);
471 	u_other = unix_sk(other);
472 	rc = 0;
473 	spin_lock(&u_other->peer_wait.lock);
474 
475 	if (!u->peer_wake.private) {
476 		u->peer_wake.private = other;
477 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
478 
479 		rc = 1;
480 	}
481 
482 	spin_unlock(&u_other->peer_wait.lock);
483 	return rc;
484 }
485 
486 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
487 					    struct sock *other)
488 {
489 	struct unix_sock *u, *u_other;
490 
491 	u = unix_sk(sk);
492 	u_other = unix_sk(other);
493 	spin_lock(&u_other->peer_wait.lock);
494 
495 	if (u->peer_wake.private == other) {
496 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
497 		u->peer_wake.private = NULL;
498 	}
499 
500 	spin_unlock(&u_other->peer_wait.lock);
501 }
502 
503 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
504 						   struct sock *other)
505 {
506 	unix_dgram_peer_wake_disconnect(sk, other);
507 	wake_up_interruptible_poll(sk_sleep(sk),
508 				   EPOLLOUT |
509 				   EPOLLWRNORM |
510 				   EPOLLWRBAND);
511 }
512 
513 /* preconditions:
514  *	- unix_peer(sk) == other
515  *	- association is stable
516  */
517 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
518 {
519 	int connected;
520 
521 	connected = unix_dgram_peer_wake_connect(sk, other);
522 
523 	/* If other is SOCK_DEAD, we want to make sure we signal
524 	 * POLLOUT, such that a subsequent write() can get a
525 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
526 	 * to other and its full, we will hang waiting for POLLOUT.
527 	 */
528 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
529 		return 1;
530 
531 	if (connected)
532 		unix_dgram_peer_wake_disconnect(sk, other);
533 
534 	return 0;
535 }
536 
537 static int unix_writable(const struct sock *sk)
538 {
539 	return sk->sk_state != TCP_LISTEN &&
540 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
541 }
542 
543 static void unix_write_space(struct sock *sk)
544 {
545 	struct socket_wq *wq;
546 
547 	rcu_read_lock();
548 	if (unix_writable(sk)) {
549 		wq = rcu_dereference(sk->sk_wq);
550 		if (skwq_has_sleeper(wq))
551 			wake_up_interruptible_sync_poll(&wq->wait,
552 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
553 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
554 	}
555 	rcu_read_unlock();
556 }
557 
558 /* When dgram socket disconnects (or changes its peer), we clear its receive
559  * queue of packets arrived from previous peer. First, it allows to do
560  * flow control based only on wmem_alloc; second, sk connected to peer
561  * may receive messages only from that peer. */
562 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
563 {
564 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
565 		skb_queue_purge(&sk->sk_receive_queue);
566 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
567 
568 		/* If one link of bidirectional dgram pipe is disconnected,
569 		 * we signal error. Messages are lost. Do not make this,
570 		 * when peer was not connected to us.
571 		 */
572 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
573 			WRITE_ONCE(other->sk_err, ECONNRESET);
574 			sk_error_report(other);
575 		}
576 	}
577 	other->sk_state = TCP_CLOSE;
578 }
579 
580 static void unix_sock_destructor(struct sock *sk)
581 {
582 	struct unix_sock *u = unix_sk(sk);
583 
584 	skb_queue_purge(&sk->sk_receive_queue);
585 
586 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
587 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
588 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
589 	if (!sock_flag(sk, SOCK_DEAD)) {
590 		pr_info("Attempt to release alive unix socket: %p\n", sk);
591 		return;
592 	}
593 
594 	if (u->addr)
595 		unix_release_addr(u->addr);
596 
597 	atomic_long_dec(&unix_nr_socks);
598 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
599 #ifdef UNIX_REFCNT_DEBUG
600 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
601 		atomic_long_read(&unix_nr_socks));
602 #endif
603 }
604 
/* Tear down @sk: unhash it, detach it from its path and peer, flush its
 * receive queue and drop a reference on it.
 *
 * @sk:      socket being released
 * @embrion: non-zero when @sk is a socket found on a listener's receive
 *           queue (see the recursive call below); the peer of a stream or
 *           seqpacket socket then always gets ECONNRESET
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	/* Take the socket off both tables before touching its state. */
	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
	/* Keep a copy of the path so it can be put after the lock is dropped. */
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	/* Remember the old state: a listener is handled specially below. */
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
			/* Unread data or an embryo: report a reset to the peer. */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				WRITE_ONCE(skpair->sk_err, ECONNRESET);
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* On a listener each queued skb stands for a socket that
		 * was never accepted; release that socket as an embryo.
		 */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	/* Run the garbage collector only while some fds are in flight. */
	if (READ_ONCE(unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */
}
687 
688 static void init_peercred(struct sock *sk)
689 {
690 	const struct cred *old_cred;
691 	struct pid *old_pid;
692 
693 	spin_lock(&sk->sk_peer_lock);
694 	old_pid = sk->sk_peer_pid;
695 	old_cred = sk->sk_peer_cred;
696 	sk->sk_peer_pid  = get_pid(task_tgid(current));
697 	sk->sk_peer_cred = get_current_cred();
698 	spin_unlock(&sk->sk_peer_lock);
699 
700 	put_pid(old_pid);
701 	put_cred(old_cred);
702 }
703 
704 static void copy_peercred(struct sock *sk, struct sock *peersk)
705 {
706 	const struct cred *old_cred;
707 	struct pid *old_pid;
708 
709 	if (sk < peersk) {
710 		spin_lock(&sk->sk_peer_lock);
711 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
712 	} else {
713 		spin_lock(&peersk->sk_peer_lock);
714 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
715 	}
716 	old_pid = sk->sk_peer_pid;
717 	old_cred = sk->sk_peer_cred;
718 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
719 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
720 
721 	spin_unlock(&sk->sk_peer_lock);
722 	spin_unlock(&peersk->sk_peer_lock);
723 
724 	put_pid(old_pid);
725 	put_cred(old_cred);
726 }
727 
728 static int unix_listen(struct socket *sock, int backlog)
729 {
730 	int err;
731 	struct sock *sk = sock->sk;
732 	struct unix_sock *u = unix_sk(sk);
733 
734 	err = -EOPNOTSUPP;
735 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
736 		goto out;	/* Only stream/seqpacket sockets accept */
737 	err = -EINVAL;
738 	if (!u->addr)
739 		goto out;	/* No listens on an unbound socket */
740 	unix_state_lock(sk);
741 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
742 		goto out_unlock;
743 	if (backlog > sk->sk_max_ack_backlog)
744 		wake_up_interruptible_all(&u->peer_wait);
745 	sk->sk_max_ack_backlog	= backlog;
746 	sk->sk_state		= TCP_LISTEN;
747 	/* set credentials so connect can copy them */
748 	init_peercred(sk);
749 	err = 0;
750 
751 out_unlock:
752 	unix_state_unlock(sk);
753 out:
754 	return err;
755 }
756 
757 static int unix_release(struct socket *);
758 static int unix_bind(struct socket *, struct sockaddr *, int);
759 static int unix_stream_connect(struct socket *, struct sockaddr *,
760 			       int addr_len, int flags);
761 static int unix_socketpair(struct socket *, struct socket *);
762 static int unix_accept(struct socket *, struct socket *, int, bool);
763 static int unix_getname(struct socket *, struct sockaddr *, int);
764 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
765 static __poll_t unix_dgram_poll(struct file *, struct socket *,
766 				    poll_table *);
767 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
768 #ifdef CONFIG_COMPAT
769 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
770 #endif
771 static int unix_shutdown(struct socket *, int);
772 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
773 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
774 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
775 				       struct pipe_inode_info *, size_t size,
776 				       unsigned int flags);
777 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
778 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
779 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
780 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
781 static int unix_dgram_connect(struct socket *, struct sockaddr *,
782 			      int, int);
783 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
784 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
785 				  int);
786 
787 static int unix_set_peek_off(struct sock *sk, int val)
788 {
789 	struct unix_sock *u = unix_sk(sk);
790 
791 	if (mutex_lock_interruptible(&u->iolock))
792 		return -EINTR;
793 
794 	WRITE_ONCE(sk->sk_peek_off, val);
795 	mutex_unlock(&u->iolock);
796 
797 	return 0;
798 }
799 
800 #ifdef CONFIG_PROC_FS
801 static int unix_count_nr_fds(struct sock *sk)
802 {
803 	struct sk_buff *skb;
804 	struct unix_sock *u;
805 	int nr_fds = 0;
806 
807 	spin_lock(&sk->sk_receive_queue.lock);
808 	skb = skb_peek(&sk->sk_receive_queue);
809 	while (skb) {
810 		u = unix_sk(skb->sk);
811 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
812 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
813 	}
814 	spin_unlock(&sk->sk_receive_queue.lock);
815 
816 	return nr_fds;
817 }
818 
819 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
820 {
821 	struct sock *sk = sock->sk;
822 	unsigned char s_state;
823 	struct unix_sock *u;
824 	int nr_fds = 0;
825 
826 	if (sk) {
827 		s_state = READ_ONCE(sk->sk_state);
828 		u = unix_sk(sk);
829 
830 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
831 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
832 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
833 		 */
834 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
835 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
836 		else if (s_state == TCP_LISTEN)
837 			nr_fds = unix_count_nr_fds(sk);
838 
839 		seq_printf(m, "scm_fds: %u\n", nr_fds);
840 	}
841 }
842 #else
843 #define unix_show_fdinfo NULL
844 #endif
845 
846 static const struct proto_ops unix_stream_ops = {
847 	.family =	PF_UNIX,
848 	.owner =	THIS_MODULE,
849 	.release =	unix_release,
850 	.bind =		unix_bind,
851 	.connect =	unix_stream_connect,
852 	.socketpair =	unix_socketpair,
853 	.accept =	unix_accept,
854 	.getname =	unix_getname,
855 	.poll =		unix_poll,
856 	.ioctl =	unix_ioctl,
857 #ifdef CONFIG_COMPAT
858 	.compat_ioctl =	unix_compat_ioctl,
859 #endif
860 	.listen =	unix_listen,
861 	.shutdown =	unix_shutdown,
862 	.sendmsg =	unix_stream_sendmsg,
863 	.recvmsg =	unix_stream_recvmsg,
864 	.read_skb =	unix_stream_read_skb,
865 	.mmap =		sock_no_mmap,
866 	.splice_read =	unix_stream_splice_read,
867 	.set_peek_off =	unix_set_peek_off,
868 	.show_fdinfo =	unix_show_fdinfo,
869 };
870 
871 static const struct proto_ops unix_dgram_ops = {
872 	.family =	PF_UNIX,
873 	.owner =	THIS_MODULE,
874 	.release =	unix_release,
875 	.bind =		unix_bind,
876 	.connect =	unix_dgram_connect,
877 	.socketpair =	unix_socketpair,
878 	.accept =	sock_no_accept,
879 	.getname =	unix_getname,
880 	.poll =		unix_dgram_poll,
881 	.ioctl =	unix_ioctl,
882 #ifdef CONFIG_COMPAT
883 	.compat_ioctl =	unix_compat_ioctl,
884 #endif
885 	.listen =	sock_no_listen,
886 	.shutdown =	unix_shutdown,
887 	.sendmsg =	unix_dgram_sendmsg,
888 	.read_skb =	unix_read_skb,
889 	.recvmsg =	unix_dgram_recvmsg,
890 	.mmap =		sock_no_mmap,
891 	.set_peek_off =	unix_set_peek_off,
892 	.show_fdinfo =	unix_show_fdinfo,
893 };
894 
895 static const struct proto_ops unix_seqpacket_ops = {
896 	.family =	PF_UNIX,
897 	.owner =	THIS_MODULE,
898 	.release =	unix_release,
899 	.bind =		unix_bind,
900 	.connect =	unix_stream_connect,
901 	.socketpair =	unix_socketpair,
902 	.accept =	unix_accept,
903 	.getname =	unix_getname,
904 	.poll =		unix_dgram_poll,
905 	.ioctl =	unix_ioctl,
906 #ifdef CONFIG_COMPAT
907 	.compat_ioctl =	unix_compat_ioctl,
908 #endif
909 	.listen =	unix_listen,
910 	.shutdown =	unix_shutdown,
911 	.sendmsg =	unix_seqpacket_sendmsg,
912 	.recvmsg =	unix_seqpacket_recvmsg,
913 	.mmap =		sock_no_mmap,
914 	.set_peek_off =	unix_set_peek_off,
915 	.show_fdinfo =	unix_show_fdinfo,
916 };
917 
/* Intentionally empty ->close() hook: a unix socket has nothing to do here,
 * the hook is provided merely for sockmap.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
924 
/* Intentionally empty ->unhash() hook: a unix socket has nothing to do
 * here, the hook is provided merely for sockmap.
 */
static void unix_unhash(struct sock *sk)
{
}
931 
/* bpf_bypass_getsockopt hook: returns true only for SO_PEERPIDFD at level
 * SOL_SOCKET, false for every other level/option pair.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	return level == SOL_SOCKET && optname == SO_PEERPIDFD;
}
945 
946 struct proto unix_dgram_proto = {
947 	.name			= "UNIX",
948 	.owner			= THIS_MODULE,
949 	.obj_size		= sizeof(struct unix_sock),
950 	.close			= unix_close,
951 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
952 #ifdef CONFIG_BPF_SYSCALL
953 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
954 #endif
955 };
956 
957 struct proto unix_stream_proto = {
958 	.name			= "UNIX-STREAM",
959 	.owner			= THIS_MODULE,
960 	.obj_size		= sizeof(struct unix_sock),
961 	.close			= unix_close,
962 	.unhash			= unix_unhash,
963 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
964 #ifdef CONFIG_BPF_SYSCALL
965 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
966 #endif
967 };
968 
969 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
970 {
971 	struct unix_sock *u;
972 	struct sock *sk;
973 	int err;
974 
975 	atomic_long_inc(&unix_nr_socks);
976 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
977 		err = -ENFILE;
978 		goto err;
979 	}
980 
981 	if (type == SOCK_STREAM)
982 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
983 	else /*dgram and  seqpacket */
984 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
985 
986 	if (!sk) {
987 		err = -ENOMEM;
988 		goto err;
989 	}
990 
991 	sock_init_data(sock, sk);
992 
993 	sk->sk_hash		= unix_unbound_hash(sk);
994 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
995 	sk->sk_write_space	= unix_write_space;
996 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
997 	sk->sk_destruct		= unix_sock_destructor;
998 	u	  = unix_sk(sk);
999 	u->path.dentry = NULL;
1000 	u->path.mnt = NULL;
1001 	spin_lock_init(&u->lock);
1002 	atomic_long_set(&u->inflight, 0);
1003 	INIT_LIST_HEAD(&u->link);
1004 	mutex_init(&u->iolock); /* single task reading lock */
1005 	mutex_init(&u->bindlock); /* single task binding lock */
1006 	init_waitqueue_head(&u->peer_wait);
1007 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1008 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1009 	unix_insert_unbound_socket(net, sk);
1010 
1011 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1012 
1013 	return sk;
1014 
1015 err:
1016 	atomic_long_dec(&unix_nr_socks);
1017 	return ERR_PTR(err);
1018 }
1019 
1020 static int unix_create(struct net *net, struct socket *sock, int protocol,
1021 		       int kern)
1022 {
1023 	struct sock *sk;
1024 
1025 	if (protocol && protocol != PF_UNIX)
1026 		return -EPROTONOSUPPORT;
1027 
1028 	sock->state = SS_UNCONNECTED;
1029 
1030 	switch (sock->type) {
1031 	case SOCK_STREAM:
1032 		sock->ops = &unix_stream_ops;
1033 		break;
1034 		/*
1035 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1036 		 *	nothing uses it.
1037 		 */
1038 	case SOCK_RAW:
1039 		sock->type = SOCK_DGRAM;
1040 		fallthrough;
1041 	case SOCK_DGRAM:
1042 		sock->ops = &unix_dgram_ops;
1043 		break;
1044 	case SOCK_SEQPACKET:
1045 		sock->ops = &unix_seqpacket_ops;
1046 		break;
1047 	default:
1048 		return -ESOCKTNOSUPPORT;
1049 	}
1050 
1051 	sk = unix_create1(net, sock, kern, sock->type);
1052 	if (IS_ERR(sk))
1053 		return PTR_ERR(sk);
1054 
1055 	return 0;
1056 }
1057 
1058 static int unix_release(struct socket *sock)
1059 {
1060 	struct sock *sk = sock->sk;
1061 
1062 	if (!sk)
1063 		return 0;
1064 
1065 	sk->sk_prot->close(sk, 0);
1066 	unix_release_sock(sk, 0);
1067 	sock->sk = NULL;
1068 
1069 	return 0;
1070 }
1071 
/*
 * Look up the socket bound to the filesystem path in @sunaddr.
 *
 * The path must resolve, be writable by the caller, refer to a socket
 * inode that a unix socket is bound to, and that socket must be of @type.
 *
 * Returns the socket found by unix_find_socket_byinode(), or an ERR_PTR():
 * the error from kern_path()/path_permission(), -ECONNREFUSED when the
 * inode is not a socket or nothing is bound to it, -EPROTOTYPE on a type
 * mismatch.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct sock *found;
	struct path path;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto drop_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto drop_path;
	}

	found = unix_find_socket_byinode(inode);
	if (!found) {
		err = -ECONNREFUSED;
		goto drop_path;
	}

	if (found->sk_type != type) {
		err = -EPROTOTYPE;
		sock_put(found);
		goto drop_path;
	}

	touch_atime(&path);
	path_put(&path);

	return found;

drop_path:
	path_put(&path);
	return ERR_PTR(err);
}
1115 
/*
 * Look up a socket by abstract name within @net.
 *
 * Returns the matching socket or ERR_PTR(-ECONNREFUSED) if none exists.
 * If the socket found has a dentry attached, its access time is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	struct unix_sock *u;
	unsigned int hash;
	struct sock *sk;

	hash = unix_abstract_hash(sunaddr, addr_len, type);

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	u = unix_sk(sk);
	if (u->path.dentry)
		touch_atime(&u->path);

	return sk;
}
1134 
1135 static struct sock *unix_find_other(struct net *net,
1136 				    struct sockaddr_un *sunaddr,
1137 				    int addr_len, int type)
1138 {
1139 	struct sock *sk;
1140 
1141 	if (sunaddr->sun_path[0])
1142 		sk = unix_find_bsd(sunaddr, addr_len, type);
1143 	else
1144 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1145 
1146 	return sk;
1147 }
1148 
/*
 * Bind @sk to an automatically chosen abstract address.
 *
 * The name is a leading NUL byte followed by five lower-case hex digits.
 * Candidates start just after a random 20-bit value and are tried in
 * increasing order, wrapping within 20 bits, until an unused one is found.
 *
 * Returns 0 on success or when @sk already has an address, the error from
 * mutex_lock_interruptible(), -ENOMEM when the address cannot be
 * allocated, or -ENOSPC once every candidate has been tried.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound: nothing to do, err is still 0 here. */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	/* Six name bytes: the leading NUL plus five hex digits. */
	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	ordernum = get_random_u32();
	/* Remember the start so a full wrap-around can be detected. */
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	/* sun_path[0] stays 0 (kzalloc), making this an abstract name. */
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seem to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	/* Name is free: publish it while both hash buckets are locked. */
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1209 
/*
 * Bind @sk to a filesystem pathname.
 *
 * The socket inode is created with vfs_mknod() first and only then is
 * u->bindlock taken; if the lock wait is interrupted or @sk turns out to
 * be bound already, the freshly created node is unlinked again.
 *
 * Returns 0 on success, -ENOMEM when the address copy cannot be allocated,
 * -EINVAL when @sk is already bound, -EADDRINUSE in place of -EEXIST, or
 * another negative errno from path creation, the security hook, mknod or
 * the interruptible lock.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	/* Socket file mode: the socket inode's mode filtered by the umask. */
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct mnt_idmap *idmap;
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	addr_len = unix_mkname_bsd(sunaddr, addr_len);
	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	idmap = mnt_idmap(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	/* Already bound; out_unlock turns this into -EINVAL. */
	if (u->addr)
		goto out_unlock;

	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	/* The socket keeps its own references to the mount and dentry. */
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	/* Report an existing path as "address in use". */
	return err == -EEXIST ? -EADDRINUSE : err;
}
1277 
1278 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1279 			      int addr_len)
1280 {
1281 	unsigned int new_hash, old_hash = sk->sk_hash;
1282 	struct unix_sock *u = unix_sk(sk);
1283 	struct net *net = sock_net(sk);
1284 	struct unix_address *addr;
1285 	int err;
1286 
1287 	addr = unix_create_addr(sunaddr, addr_len);
1288 	if (!addr)
1289 		return -ENOMEM;
1290 
1291 	err = mutex_lock_interruptible(&u->bindlock);
1292 	if (err)
1293 		goto out;
1294 
1295 	if (u->addr) {
1296 		err = -EINVAL;
1297 		goto out_mutex;
1298 	}
1299 
1300 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1301 	unix_table_double_lock(net, old_hash, new_hash);
1302 
1303 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1304 		goto out_spin;
1305 
1306 	__unix_set_addr_hash(net, sk, addr, new_hash);
1307 	unix_table_double_unlock(net, old_hash, new_hash);
1308 	mutex_unlock(&u->bindlock);
1309 	return 0;
1310 
1311 out_spin:
1312 	unix_table_double_unlock(net, old_hash, new_hash);
1313 	err = -EADDRINUSE;
1314 out_mutex:
1315 	mutex_unlock(&u->bindlock);
1316 out:
1317 	unix_release_addr(addr);
1318 	return err;
1319 }
1320 
1321 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1322 {
1323 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1324 	struct sock *sk = sock->sk;
1325 	int err;
1326 
1327 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1328 	    sunaddr->sun_family == AF_UNIX)
1329 		return unix_autobind(sk);
1330 
1331 	err = unix_validate_addr(sunaddr, addr_len);
1332 	if (err)
1333 		return err;
1334 
1335 	if (sunaddr->sun_path[0])
1336 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1337 	else
1338 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1339 
1340 	return err;
1341 }
1342 
/*
 * Take the state locks of @sk1 and @sk2.
 *
 * When @sk2 is NULL or the same socket as @sk1, only @sk1 is locked.
 * Otherwise the socket at the lower address is locked first and the other
 * one is taken with the nested variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (!sk2 || unlikely(sk1 == sk2)) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1357 
/*
 * Counterpart of unix_state_double_lock(): release @sk1's state lock and,
 * unless @sk2 is NULL or the same socket, @sk2's as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (!sk2 || unlikely(sk1 == sk2))
		return;

	unix_state_unlock(sk2);
}
1367 
/*
 * connect() for datagram-style sockets.
 *
 * With a non-AF_UNSPEC address: validate it, run the cgroup BPF connect
 * hook, autobind @sock's sk if it asked for credential/pidfd passing and
 * has no address yet, look the target up and, with both state locks held,
 * check unix_may_send() and the security hook before making the target
 * the peer.
 *
 * With AF_UNSPEC: drop the current peer, if any.
 *
 * Returns 0 on success or a negative errno.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	/* The address must at least contain the family field. */
	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
		if (err)
			goto out;

		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		/* Both ends are marked while both state locks are held. */
		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		/* With a NULL second socket only sk's lock is taken. */
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		/* Reconnecting to the same peer is not a disconnect. */
		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		/* Drop the reference that backed the old peer pointer. */
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	/*
	 * On success the reference returned by unix_find_other() is not
	 * dropped here; presumably it now backs unix_peer(sk).
	 */
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1459 
/*
 * Sleep until @other may be able to take more data, or @timeo expires.
 *
 * Entered with @other's state lock held (see the __releases annotation);
 * the lock is dropped before sleeping and is not re-taken.
 *
 * Returns the value from schedule_timeout(), or @timeo unchanged when no
 * sleep was needed.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	/* Queue ourselves before testing the condition. */
	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* Only sleep while the peer is alive, receiving and still full. */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full_lockless(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1481 
/*
 * connect() for SOCK_STREAM and SOCK_SEQPACKET style sockets.
 *
 * Outline:
 *  - validate the address, run the cgroup BPF connect hook and autobind
 *    if credential/pidfd passing is requested and no address is set;
 *  - pre-allocate the server-side socket (newsk) and a one-byte skb;
 *  - find the listener, waiting (subject to the send timeout) while its
 *    receive queue is full;
 *  - with both state locks held, wire sk <-> newsk together, copy the
 *    listener's address/path to newsk and queue the skb on the listener so
 *    that unix_accept() can pick newsk up.
 *
 * Returns 0 on success or a negative errno; on failure newsk and the skb
 * are released again.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		/* Keep the cleanup at "out" from releasing an ERR_PTR. */
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		/* Keep the cleanup at "out" from putting an ERR_PTR. */
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops other's state lock before sleeping. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* State changed while we were taking the lock: start over. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	/* newsk's peer pointer holds a reference on sk. */
	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	/* sk's peer pointer holds a reference on newsk. */
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1679 
1680 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1681 {
1682 	struct sock *ska = socka->sk, *skb = sockb->sk;
1683 
1684 	/* Join our sockets back to back */
1685 	sock_hold(ska);
1686 	sock_hold(skb);
1687 	unix_peer(ska) = skb;
1688 	unix_peer(skb) = ska;
1689 	init_peercred(ska);
1690 	init_peercred(skb);
1691 
1692 	ska->sk_state = TCP_ESTABLISHED;
1693 	skb->sk_state = TCP_ESTABLISHED;
1694 	socka->state  = SS_CONNECTED;
1695 	sockb->state  = SS_CONNECTED;
1696 	return 0;
1697 }
1698 
1699 static void unix_sock_inherit_flags(const struct socket *old,
1700 				    struct socket *new)
1701 {
1702 	if (test_bit(SOCK_PASSCRED, &old->flags))
1703 		set_bit(SOCK_PASSCRED, &new->flags);
1704 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1705 		set_bit(SOCK_PASSPIDFD, &new->flags);
1706 	if (test_bit(SOCK_PASSSEC, &old->flags))
1707 		set_bit(SOCK_PASSSEC, &new->flags);
1708 }
1709 
1710 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1711 		       bool kern)
1712 {
1713 	struct sock *sk = sock->sk;
1714 	struct sock *tsk;
1715 	struct sk_buff *skb;
1716 	int err;
1717 
1718 	err = -EOPNOTSUPP;
1719 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1720 		goto out;
1721 
1722 	err = -EINVAL;
1723 	if (sk->sk_state != TCP_LISTEN)
1724 		goto out;
1725 
1726 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1727 	 * so that no locks are necessary.
1728 	 */
1729 
1730 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1731 				&err);
1732 	if (!skb) {
1733 		/* This means receive shutdown. */
1734 		if (err == 0)
1735 			err = -EINVAL;
1736 		goto out;
1737 	}
1738 
1739 	tsk = skb->sk;
1740 	skb_free_datagram(sk, skb);
1741 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1742 
1743 	/* attach accepted sock to socket */
1744 	unix_state_lock(tsk);
1745 	newsock->state = SS_CONNECTED;
1746 	unix_sock_inherit_flags(sock, newsock);
1747 	sock_graft(tsk, newsock);
1748 	unix_state_unlock(tsk);
1749 	return 0;
1750 
1751 out:
1752 	return err;
1753 }
1754 
1755 
1756 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1757 {
1758 	struct sock *sk = sock->sk;
1759 	struct unix_address *addr;
1760 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1761 	int err = 0;
1762 
1763 	if (peer) {
1764 		sk = unix_peer_get(sk);
1765 
1766 		err = -ENOTCONN;
1767 		if (!sk)
1768 			goto out;
1769 		err = 0;
1770 	} else {
1771 		sock_hold(sk);
1772 	}
1773 
1774 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1775 	if (!addr) {
1776 		sunaddr->sun_family = AF_UNIX;
1777 		sunaddr->sun_path[0] = 0;
1778 		err = offsetof(struct sockaddr_un, sun_path);
1779 	} else {
1780 		err = addr->len;
1781 		memcpy(sunaddr, addr->name, addr->len);
1782 
1783 		if (peer)
1784 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1785 					       CGROUP_UNIX_GETPEERNAME);
1786 		else
1787 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1788 					       CGROUP_UNIX_GETSOCKNAME);
1789 	}
1790 	sock_put(sk);
1791 out:
1792 	return err;
1793 }
1794 
/*
 * MSG_PEEK helper: duplicate the file list attached to @skb into @scm,
 * then pass through unix_gc_lock once so that the peek is serialized with
 * a garbage collection that may be running (see below).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operation that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeuing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1841 
1842 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1843 {
1844 	int err = 0;
1845 
1846 	UNIXCB(skb).pid  = get_pid(scm->pid);
1847 	UNIXCB(skb).uid = scm->creds.uid;
1848 	UNIXCB(skb).gid = scm->creds.gid;
1849 	UNIXCB(skb).fp = NULL;
1850 	unix_get_secdata(scm, skb);
1851 	if (scm->fp && send_fds)
1852 		err = unix_attach_fds(scm, skb);
1853 
1854 	skb->destructor = unix_destruct_scm;
1855 	return err;
1856 }
1857 
1858 static bool unix_passcred_enabled(const struct socket *sock,
1859 				  const struct sock *other)
1860 {
1861 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1862 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1863 	       !other->sk_socket ||
1864 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1865 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1866 }
1867 
1868 /*
1869  * Some apps rely on write() giving SCM_CREDENTIALS
1870  * We include credentials if source or destination socket
1871  * asserted SOCK_PASSCRED.
1872  */
1873 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1874 			    const struct sock *other)
1875 {
1876 	if (UNIXCB(skb).pid)
1877 		return;
1878 	if (unix_passcred_enabled(sock, other)) {
1879 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1880 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1881 	}
1882 }
1883 
1884 static bool unix_skb_scm_eq(struct sk_buff *skb,
1885 			    struct scm_cookie *scm)
1886 {
1887 	return UNIXCB(skb).pid == scm->pid &&
1888 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1889 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1890 	       unix_secdata_eq(scm, skb);
1891 }
1892 
1893 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1894 {
1895 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1896 	struct unix_sock *u = unix_sk(sk);
1897 
1898 	if (unlikely(fp && fp->count))
1899 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1900 }
1901 
1902 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1903 {
1904 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1905 	struct unix_sock *u = unix_sk(sk);
1906 
1907 	if (unlikely(fp && fp->count))
1908 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1909 }
1910 
/*
 *	Send AF_UNIX data.
 *
 *	Datagram sendmsg(): build one skb holding the whole message and
 *	queue it on the receiver, which is either the address given in
 *	msg_name or the connected peer.
 *
 *	Returns @len on success (also when the receiver's socket filter
 *	drops the packet) or a negative errno.  The scm cookie is destroyed
 *	on every path after scm_send() succeeded.
 */

static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	/* Non-zero while sk's own state lock is held in addition to other's. */
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	/* Out-of-band data is not supported on this path. */
	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;

		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
							    msg->msg_name,
							    &msg->msg_namelen,
							    NULL);
		if (err)
			goto out;
	} else {
		/* No destination given: use the connected peer. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Put what exceeds SKB_MAX_ALLOC into page fragments. */
	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		/* Without an address there is nothing to look up again. */
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			/* Keep "out" from calling sock_put() on an ERR_PTR. */
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (sk->sk_type == SOCK_SEQPACKET) {
			/* We are here only when racing with unix_release_sock()
			 * is clearing @other. Never change state to TCP_CLOSE
			 * unlike SOCK_DGRAM wants.
			 */
			unix_state_unlock(sk);
			err = -EPIPE;
		} else if (unix_peer(sk) == other) {
			/* The dead socket was our peer: disconnect from it. */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			sk->sk_state = TCP_CLOSE;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			/* Drop the reference held via the peer pointer. */
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		/* err == 0: not our peer, so look the address up again. */
		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			/* Drops other's state lock before sleeping. */
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking: the checks below need both state locks. */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		/* Locks were dropped and re-taken: redo the checks once. */
		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2133 
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 *
 * The value is PAGE_SIZE shifted left by get_order(32768), so it is
 * always a whole number of pages.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2138 
2139 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2140 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2141 		     struct scm_cookie *scm, bool fds_sent)
2142 {
2143 	struct unix_sock *ousk = unix_sk(other);
2144 	struct sk_buff *skb;
2145 	int err = 0;
2146 
2147 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2148 
2149 	if (!skb)
2150 		return err;
2151 
2152 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2153 	if (err < 0) {
2154 		kfree_skb(skb);
2155 		return err;
2156 	}
2157 	skb_put(skb, 1);
2158 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2159 
2160 	if (err) {
2161 		kfree_skb(skb);
2162 		return err;
2163 	}
2164 
2165 	unix_state_lock(other);
2166 
2167 	if (sock_flag(other, SOCK_DEAD) ||
2168 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2169 		unix_state_unlock(other);
2170 		kfree_skb(skb);
2171 		return -EPIPE;
2172 	}
2173 
2174 	maybe_add_creds(skb, sock, other);
2175 	skb_get(skb);
2176 
2177 	if (ousk->oob_skb)
2178 		consume_skb(ousk->oob_skb);
2179 
2180 	WRITE_ONCE(ousk->oob_skb, skb);
2181 
2182 	scm_stat_add(other, skb);
2183 	skb_queue_tail(&other->sk_receive_queue, skb);
2184 	sk_send_sigurg(other);
2185 	unix_state_unlock(other);
2186 	other->sk_data_ready(other);
2187 
2188 	return err;
2189 }
2190 #endif
2191 
/*
 * Send @len bytes from @msg on a connected stream socket.
 *
 * The payload is split into skbs which are appended, one at a time, to the
 * peer's receive queue under the peer's state lock.  File descriptors from
 * the control message are attached to the first skb only.  With MSG_OOB
 * (and CONFIG_AF_UNIX_OOB enabled) the last byte is held back from the main
 * loop and sent separately through queue_oob().
 *
 * Returns the number of bytes queued when that is non-zero; otherwise a
 * negative errno (-EOPNOTSUPP, -EISCONN, -ENOTCONN, -EPIPE, or an error from
 * scm_send(), skb allocation, unix_scm_to_skb() or the payload copy).
 */
2192 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2193 			       size_t len)
2194 {
2195 	struct sock *sk = sock->sk;
2196 	struct sock *other = NULL;
2197 	int err, size;
2198 	struct sk_buff *skb;
2199 	int sent = 0;
2200 	struct scm_cookie scm;
2201 	bool fds_sent = false;
2202 	int data_len;
2203 
2204 	wait_for_unix_gc();
2205 	err = scm_send(sock, msg, &scm, false);
2206 	if (err < 0)
2207 		return err;
2208 
	/* MSG_OOB: with OOB support and a non-empty message, reserve the
	 * final byte for queue_oob(); in every other case fail with
	 * -EOPNOTSUPP.
	 */
2209 	err = -EOPNOTSUPP;
2210 	if (msg->msg_flags & MSG_OOB) {
2211 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2212 		if (len)
2213 			len--;
2214 		else
2215 #endif
2216 			goto out_err;
2217 	}
2218 
	/* A destination address is always an error here; without one the
	 * socket must already have a peer.
	 */
2219 	if (msg->msg_namelen) {
2220 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2221 		goto out_err;
2222 	} else {
2223 		err = -ENOTCONN;
2224 		other = unix_peer(sk);
2225 		if (!other)
2226 			goto out_err;
2227 	}
2228 
2229 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2230 		goto pipe_err;
2231 
2232 	while (sent < len) {
2233 		size = len - sent;
2234 
2235 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
			/* Empty skb; pages are spliced in further down. */
2236 			skb = sock_alloc_send_pskb(sk, 0, 0,
2237 						   msg->msg_flags & MSG_DONTWAIT,
2238 						   &err, 0);
2239 		} else {
2240 			/* Keep two messages in the pipe so it schedules better */
2241 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2242 
2243 			/* allow fallback to order-0 allocations */
2244 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2245 
			/* data_len is the paged part of this skb; size - data_len
			 * goes into the linear head.
			 */
2246 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2247 
2248 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2249 
2250 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2251 						   msg->msg_flags & MSG_DONTWAIT, &err,
2252 						   get_order(UNIX_SKB_FRAGS_SZ));
2253 		}
2254 		if (!skb)
2255 			goto out_err;
2256 
2257 		/* Only send the fds in the first buffer */
2258 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2259 		if (err < 0) {
2260 			kfree_skb(skb);
2261 			goto out_err;
2262 		}
2263 		fds_sent = true;
2264 
2265 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2266 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2267 						   sk->sk_allocation);
2268 			if (err < 0) {
2269 				kfree_skb(skb);
2270 				goto out_err;
2271 			}
			/* The splice reports how many bytes it actually took. */
2272 			size = err;
2273 			refcount_add(size, &sk->sk_wmem_alloc);
2274 		} else {
2275 			skb_put(skb, size - data_len);
2276 			skb->data_len = data_len;
2277 			skb->len = size;
2278 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2279 			if (err) {
2280 				kfree_skb(skb);
2281 				goto out_err;
2282 			}
2283 		}
2284 
2285 		unix_state_lock(other);
2286 
		/* Re-check the peer under its lock before queueing. */
2287 		if (sock_flag(other, SOCK_DEAD) ||
2288 		    (other->sk_shutdown & RCV_SHUTDOWN))
2289 			goto pipe_err_free;
2290 
2291 		maybe_add_creds(skb, sock, other);
2292 		scm_stat_add(other, skb);
2293 		skb_queue_tail(&other->sk_receive_queue, skb);
2294 		unix_state_unlock(other);
2295 		other->sk_data_ready(other);
2296 		sent += size;
2297 	}
2298 
2299 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2300 	if (msg->msg_flags & MSG_OOB) {
		/* Send the byte that was held back by the len-- above. */
2301 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2302 		if (err)
2303 			goto out_err;
2304 		sent++;
2305 	}
2306 #endif
2307 
2308 	scm_destroy(&scm);
2309 
2310 	return sent;
2311 
2312 pipe_err_free:
2313 	unix_state_unlock(other);
2314 	kfree_skb(skb);
2315 pipe_err:
	/* SIGPIPE is raised only if nothing was sent and the caller did not
	 * pass MSG_NOSIGNAL.
	 */
2316 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2317 		send_sig(SIGPIPE, current, 0);
2318 	err = -EPIPE;
2319 out_err:
2320 	scm_destroy(&scm);
	/* A partial send is reported as success; err is used only when
	 * nothing was sent.
	 */
2321 	return sent ? : err;
2322 }
2323 
2324 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2325 				  size_t len)
2326 {
2327 	int err;
2328 	struct sock *sk = sock->sk;
2329 
2330 	err = sock_error(sk);
2331 	if (err)
2332 		return err;
2333 
2334 	if (sk->sk_state != TCP_ESTABLISHED)
2335 		return -ENOTCONN;
2336 
2337 	if (msg->msg_namelen)
2338 		msg->msg_namelen = 0;
2339 
2340 	return unix_dgram_sendmsg(sock, msg, len);
2341 }
2342 
2343 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2344 				  size_t size, int flags)
2345 {
2346 	struct sock *sk = sock->sk;
2347 
2348 	if (sk->sk_state != TCP_ESTABLISHED)
2349 		return -ENOTCONN;
2350 
2351 	return unix_dgram_recvmsg(sock, msg, size, flags);
2352 }
2353 
2354 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2355 {
2356 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2357 
2358 	if (addr) {
2359 		msg->msg_namelen = addr->len;
2360 		memcpy(msg->msg_name, addr->name, addr->len);
2361 	}
2362 }
2363 
/*
 * Receive one datagram from @sk's receive queue into @msg.
 *
 * MSG_OOB is rejected with -EOPNOTSUPP.  Dequeue attempts are made under
 * u->iolock; between attempts the caller sleeps in
 * __skb_wait_for_more_packets() until data arrives or the receive timeout
 * (zero for MSG_DONTWAIT) runs out.
 *
 * Returns the number of bytes copied, or, when MSG_TRUNC is set in @flags,
 * the length of the datagram beyond the peek offset; a negative errno on
 * failure.  For a SOCK_SEQPACKET socket shut down for receiving, an -EAGAIN
 * result is turned into 0.
 */
2364 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2365 			 int flags)
2366 {
2367 	struct scm_cookie scm;
2368 	struct socket *sock = sk->sk_socket;
2369 	struct unix_sock *u = unix_sk(sk);
2370 	struct sk_buff *skb, *last;
2371 	long timeo;
2372 	int skip;
2373 	int err;
2374 
2375 	err = -EOPNOTSUPP;
2376 	if (flags&MSG_OOB)
2377 		goto out;
2378 
2379 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2380 
2381 	do {
2382 		mutex_lock(&u->iolock);
2383 
2384 		skip = sk_peek_offset(sk, flags);
2385 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2386 					      &skip, &err, &last);
2387 		if (skb) {
2388 			if (!(flags & MSG_PEEK))
2389 				scm_stat_del(sk, skb);
			/* Leave the loop with iolock still held. */
2390 			break;
2391 		}
2392 
2393 		mutex_unlock(&u->iolock);
2394 
2395 		if (err != -EAGAIN)
2396 			break;
2397 	} while (timeo &&
2398 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2399 					      &err, &timeo, last));
2400 
2401 	if (!skb) { /* implies iolock unlocked */
2402 		unix_state_lock(sk);
2403 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2404 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2405 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2406 			err = 0;
2407 		unix_state_unlock(sk);
2408 		goto out;
2409 	}
2410 
	/* Wake any waiters on peer_wait with write-readiness events. */
2411 	if (wq_has_sleeper(&u->peer_wait))
2412 		wake_up_interruptible_sync_poll(&u->peer_wait,
2413 						EPOLLOUT | EPOLLWRNORM |
2414 						EPOLLWRBAND);
2415 
2416 	if (msg->msg_name) {
2417 		unix_copy_addr(msg, skb->sk);
2418 
2419 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2420 						      msg->msg_name,
2421 						      &msg->msg_namelen);
2422 	}
2423 
	/* Clamp to what is left of the datagram; flag truncation when the
	 * caller's buffer is smaller than that.
	 */
2424 	if (size > skb->len - skip)
2425 		size = skb->len - skip;
2426 	else if (size < skb->len - skip)
2427 		msg->msg_flags |= MSG_TRUNC;
2428 
2429 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2430 	if (err)
2431 		goto out_free;
2432 
2433 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2434 		__sock_recv_timestamp(msg, sk, skb);
2435 
2436 	memset(&scm, 0, sizeof(scm));
2437 
2438 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2439 	unix_set_secdata(&scm, skb);
2440 
2441 	if (!(flags & MSG_PEEK)) {
2442 		if (UNIXCB(skb).fp)
2443 			unix_detach_fds(&scm, skb);
2444 
2445 		sk_peek_offset_bwd(sk, skb->len);
2446 	} else {
2447 		/* It is questionable: on PEEK we could:
2448 		   - do not return fds - good, but too simple 8)
2449 		   - return fds, and do not return them on read (old strategy,
2450 		     apparently wrong)
2451 		   - clone fds (I chose it for now, it is the most universal
2452 		     solution)
2453 
2454 		   POSIX 1003.1g does not actually define this clearly
2455 		   at all. POSIX 1003.1g doesn't define a lot of things
2456 		   clearly however!
2457 
2458 		*/
2459 
2460 		sk_peek_offset_fwd(sk, size);
2461 
2462 		if (UNIXCB(skb).fp)
2463 			unix_peek_fds(&scm, skb);
2464 	}
2465 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2466 
2467 	scm_recv_unix(sock, msg, &scm, flags);
2468 
2469 out_free:
2470 	skb_free_datagram(sk, skb);
2471 	mutex_unlock(&u->iolock);
2472 out:
2473 	return err;
2474 }
2475 
2476 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2477 			      int flags)
2478 {
2479 	struct sock *sk = sock->sk;
2480 
2481 #ifdef CONFIG_BPF_SYSCALL
2482 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2483 
2484 	if (prot != &unix_dgram_proto)
2485 		return prot->recvmsg(sk, msg, size, flags, NULL);
2486 #endif
2487 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2488 }
2489 
2490 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2491 {
2492 	struct unix_sock *u = unix_sk(sk);
2493 	struct sk_buff *skb;
2494 	int err;
2495 
2496 	mutex_lock(&u->iolock);
2497 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2498 	mutex_unlock(&u->iolock);
2499 	if (!skb)
2500 		return err;
2501 
2502 	return recv_actor(sk, skb);
2503 }
2504 
2505 /*
2506  *	Sleep until more data has arrived. But check for races..
2507  */
/*
 * @last and @last_len describe the skb the caller last examined (NULL and 0
 * when there was none).  The wait ends as soon as the queue tail differs
 * from @last, its length differs from @last_len, the socket has an error,
 * receive has been shut down, a signal is pending, the timeout has run out,
 * or the socket is found dead after a wakeup.  The task sleeps
 * interruptibly, and freezably when @freezable is set.
 *
 * Returns the remaining timeout.
 */
2508 static long unix_stream_data_wait(struct sock *sk, long timeo,
2509 				  struct sk_buff *last, unsigned int last_len,
2510 				  bool freezable)
2511 {
2512 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2513 	struct sk_buff *tail;
2514 	DEFINE_WAIT(wait);
2515 
2516 	unix_state_lock(sk);
2517 
2518 	for (;;) {
		/* Register on the wait queue before testing the conditions. */
2519 		prepare_to_wait(sk_sleep(sk), &wait, state);
2520 
2521 		tail = skb_peek_tail(&sk->sk_receive_queue);
2522 		if (tail != last ||
2523 		    (tail && tail->len != last_len) ||
2524 		    sk->sk_err ||
2525 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2526 		    signal_pending(current) ||
2527 		    !timeo)
2528 			break;
2529 
2530 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		/* The state lock is not held while sleeping. */
2531 		unix_state_unlock(sk);
2532 		timeo = schedule_timeout(timeo);
2533 		unix_state_lock(sk);
2534 
		/* NOTE: on this exit SOCKWQ_ASYNC_WAITDATA is left set. */
2535 		if (sock_flag(sk, SOCK_DEAD))
2536 			break;
2537 
2538 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2539 	}
2540 
2541 	finish_wait(sk_sleep(sk), &wait);
2542 	unix_state_unlock(sk);
2543 	return timeo;
2544 }
2545 
2546 static unsigned int unix_skb_len(const struct sk_buff *skb)
2547 {
2548 	return skb->len - UNIXCB(skb).consumed;
2549 }
2550 
/*
 * Parameters shared by the stream read paths (recvmsg and splice), passed
 * to unix_stream_read_generic().
 */
2551 struct unix_stream_read_state {
	/* Called as recv_actor(skb, skip, chunk, state); returns the number
	 * of bytes handled, or a negative value on failure.
	 */
2552 	int (*recv_actor)(struct sk_buff *, int, int,
2553 			  struct unix_stream_read_state *);
	/* Socket being read. */
2554 	struct socket *socket;
	/* Destination message; left NULL by the splice path. */
2555 	struct msghdr *msg;
	/* Destination pipe, used by unix_stream_splice_actor(). */
2556 	struct pipe_inode_info *pipe;
	/* Maximum number of bytes to read. */
2557 	size_t size;
	/* MSG_* flags for the read. */
2558 	int flags;
	/* Flags handed to skb_splice_bits() by the splice actor. */
2559 	unsigned int splice_flags;
2560 };
2561 
2562 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2563 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2564 {
2565 	struct socket *sock = state->socket;
2566 	struct sock *sk = sock->sk;
2567 	struct unix_sock *u = unix_sk(sk);
2568 	int chunk = 1;
2569 	struct sk_buff *oob_skb;
2570 
2571 	mutex_lock(&u->iolock);
2572 	unix_state_lock(sk);
2573 
2574 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2575 		unix_state_unlock(sk);
2576 		mutex_unlock(&u->iolock);
2577 		return -EINVAL;
2578 	}
2579 
2580 	oob_skb = u->oob_skb;
2581 
2582 	if (!(state->flags & MSG_PEEK))
2583 		WRITE_ONCE(u->oob_skb, NULL);
2584 
2585 	unix_state_unlock(sk);
2586 
2587 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2588 
2589 	if (!(state->flags & MSG_PEEK)) {
2590 		UNIXCB(oob_skb).consumed += 1;
2591 		kfree_skb(oob_skb);
2592 	}
2593 
2594 	mutex_unlock(&u->iolock);
2595 
2596 	if (chunk < 0)
2597 		return -EFAULT;
2598 
2599 	state->msg->msg_flags |= MSG_OOB;
2600 	return 1;
2601 }
2602 
2603 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2604 				  int flags, int copied)
2605 {
2606 	struct unix_sock *u = unix_sk(sk);
2607 
2608 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2609 		skb_unlink(skb, &sk->sk_receive_queue);
2610 		consume_skb(skb);
2611 		skb = NULL;
2612 	} else {
2613 		if (skb == u->oob_skb) {
2614 			if (copied) {
2615 				skb = NULL;
2616 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2617 				if (!(flags & MSG_PEEK)) {
2618 					WRITE_ONCE(u->oob_skb, NULL);
2619 					consume_skb(skb);
2620 				}
2621 			} else if (!(flags & MSG_PEEK)) {
2622 				skb_unlink(skb, &sk->sk_receive_queue);
2623 				consume_skb(skb);
2624 				skb = skb_peek(&sk->sk_receive_queue);
2625 			}
2626 		}
2627 	}
2628 	return skb;
2629 }
2630 #endif
2631 
2632 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2633 {
2634 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2635 		return -ENOTCONN;
2636 
2637 	return unix_read_skb(sk, recv_actor);
2638 }
2639 
/*
 * Common read path for stream sockets, shared by recvmsg and splice.
 *
 * Walks the receive queue under u->iolock, handing each chunk to
 * state->recv_actor until state->size bytes have been handled, the queue is
 * drained with the low-water target met, or an error/shutdown condition
 * ends the read.  MSG_OOB requests are routed to unix_stream_recv_urg()
 * when CONFIG_AF_UNIX_OOB is enabled and fail with -EOPNOTSUPP otherwise.
 *
 * @freezable is forwarded to unix_stream_data_wait().
 *
 * Returns the number of bytes handled when that is non-zero, otherwise the
 * error code (0 at end of stream).
 */
2640 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2641 				    bool freezable)
2642 {
2643 	struct scm_cookie scm;
2644 	struct socket *sock = state->socket;
2645 	struct sock *sk = sock->sk;
2646 	struct unix_sock *u = unix_sk(sk);
2647 	int copied = 0;
2648 	int flags = state->flags;
2649 	int noblock = flags & MSG_DONTWAIT;
2650 	bool check_creds = false;
2651 	int target;
2652 	int err = 0;
2653 	long timeo;
2654 	int skip;
2655 	size_t size = state->size;
2656 	unsigned int last_len;
2657 
2658 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2659 		err = -EINVAL;
2660 		goto out;
2661 	}
2662 
2663 	if (unlikely(flags & MSG_OOB)) {
2664 		err = -EOPNOTSUPP;
2665 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2666 		err = unix_stream_recv_urg(state);
2667 #endif
2668 		goto out;
2669 	}
2670 
2671 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2672 	timeo = sock_rcvtimeo(sk, noblock);
2673 
2674 	memset(&scm, 0, sizeof(scm));
2675 
2676 	/* Lock the socket to prevent queue disordering
2677 	 * while sleeps in memcpy_tomsg
2678 	 */
2679 	mutex_lock(&u->iolock);
2680 
2681 	skip = max(sk_peek_offset(sk, flags), 0);
2682 
2683 	do {
2684 		int chunk;
2685 		bool drop_skb;
2686 		struct sk_buff *skb, *last;
2687 
2688 redo:
2689 		unix_state_lock(sk);
2690 		if (sock_flag(sk, SOCK_DEAD)) {
2691 			err = -ECONNRESET;
2692 			goto unlock;
2693 		}
2694 		last = skb = skb_peek(&sk->sk_receive_queue);
2695 		last_len = last ? last->len : 0;
2696 
2697 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2698 		if (skb) {
			/* manage_oob() may release the head skb or tell us
			 * to stop before the OOB byte.
			 */
2699 			skb = manage_oob(skb, sk, flags, copied);
2700 			if (!skb) {
2701 				unix_state_unlock(sk);
2702 				if (copied)
2703 					break;
2704 				goto redo;
2705 			}
2706 		}
2707 #endif
2708 again:
2709 		if (skb == NULL) {
2710 			if (copied >= target)
2711 				goto unlock;
2712 
2713 			/*
2714 			 *	POSIX 1003.1g mandates this order.
2715 			 */
2716 
2717 			err = sock_error(sk);
2718 			if (err)
2719 				goto unlock;
2720 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2721 				goto unlock;
2722 
2723 			unix_state_unlock(sk);
2724 			if (!timeo) {
2725 				err = -EAGAIN;
2726 				break;
2727 			}
2728 
			/* iolock is released for the duration of the sleep. */
2729 			mutex_unlock(&u->iolock);
2730 
2731 			timeo = unix_stream_data_wait(sk, timeo, last,
2732 						      last_len, freezable);
2733 
2734 			if (signal_pending(current)) {
2735 				err = sock_intr_errno(timeo);
				/* iolock is not held here, so skip the common
				 * exit path and release scm directly.
				 */
2736 				scm_destroy(&scm);
2737 				goto out;
2738 			}
2739 
2740 			mutex_lock(&u->iolock);
2741 			goto redo;
2742 unlock:
2743 			unix_state_unlock(sk);
2744 			break;
2745 		}
2746 
		/* Step over skbs that lie entirely before the peek offset. */
2747 		while (skip >= unix_skb_len(skb)) {
2748 			skip -= unix_skb_len(skb);
2749 			last = skb;
2750 			last_len = skb->len;
2751 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2752 			if (!skb)
2753 				goto again;
2754 		}
2755 
2756 		unix_state_unlock(sk);
2757 
2758 		if (check_creds) {
2759 			/* Never glue messages from different writers */
2760 			if (!unix_skb_scm_eq(skb, &scm))
2761 				break;
2762 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2763 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2764 			/* Copy credentials */
2765 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2766 			unix_set_secdata(&scm, skb);
2767 			check_creds = true;
2768 		}
2769 
2770 		/* Copy address just once */
2771 		if (state->msg && state->msg->msg_name) {
2772 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2773 					 state->msg->msg_name);
2774 			unix_copy_addr(state->msg, skb->sk);
2775 
2776 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2777 							      state->msg->msg_name,
2778 							      &state->msg->msg_namelen);
2779 
2780 			sunaddr = NULL;
2781 		}
2782 
2783 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a reference while the actor runs without the state lock. */
2784 		skb_get(skb);
2785 		chunk = state->recv_actor(skb, skip, chunk, state);
2786 		drop_skb = !unix_skb_len(skb);
2787 		/* skb is only safe to use if !drop_skb */
2788 		consume_skb(skb);
2789 		if (chunk < 0) {
2790 			if (copied == 0)
2791 				copied = -EFAULT;
2792 			break;
2793 		}
2794 		copied += chunk;
2795 		size -= chunk;
2796 
2797 		if (drop_skb) {
2798 			/* the skb was touched by a concurrent reader;
2799 			 * we should not expect anything from this skb
2800 			 * anymore and assume it invalid - we can be
2801 			 * sure it was dropped from the socket queue
2802 			 *
2803 			 * let's report a short read
2804 			 */
2805 			err = 0;
2806 			break;
2807 		}
2808 
2809 		/* Mark read part of skb as used */
2810 		if (!(flags & MSG_PEEK)) {
2811 			UNIXCB(skb).consumed += chunk;
2812 
2813 			sk_peek_offset_bwd(sk, chunk);
2814 
2815 			if (UNIXCB(skb).fp) {
2816 				scm_stat_del(sk, skb);
2817 				unix_detach_fds(&scm, skb);
2818 			}
2819 
			/* Data remains in this skb: the request is satisfied. */
2820 			if (unix_skb_len(skb))
2821 				break;
2822 
2823 			skb_unlink(skb, &sk->sk_receive_queue);
2824 			consume_skb(skb);
2825 
			/* Stop once file descriptors have been picked up. */
2826 			if (scm.fp)
2827 				break;
2828 		} else {
2829 			/* It is questionable, see note in unix_dgram_recvmsg.
2830 			 */
2831 			if (UNIXCB(skb).fp)
2832 				unix_peek_fds(&scm, skb);
2833 
2834 			sk_peek_offset_fwd(sk, chunk);
2835 
2836 			if (UNIXCB(skb).fp)
2837 				break;
2838 
			/* Peeking: continue with the next queued skb, if any. */
2839 			skip = 0;
2840 			last = skb;
2841 			last_len = skb->len;
2842 			unix_state_lock(sk);
2843 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2844 			if (skb)
2845 				goto again;
2846 			unix_state_unlock(sk);
2847 			break;
2848 		}
2849 	} while (size);
2850 
2851 	mutex_unlock(&u->iolock);
2852 	if (state->msg)
2853 		scm_recv_unix(sock, state->msg, &scm, flags);
2854 	else
2855 		scm_destroy(&scm);
2856 out:
2857 	return copied ? : err;
2858 }
2859 
2860 static int unix_stream_read_actor(struct sk_buff *skb,
2861 				  int skip, int chunk,
2862 				  struct unix_stream_read_state *state)
2863 {
2864 	int ret;
2865 
2866 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2867 				    state->msg, chunk);
2868 	return ret ?: chunk;
2869 }
2870 
2871 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2872 			  size_t size, int flags)
2873 {
2874 	struct unix_stream_read_state state = {
2875 		.recv_actor = unix_stream_read_actor,
2876 		.socket = sk->sk_socket,
2877 		.msg = msg,
2878 		.size = size,
2879 		.flags = flags
2880 	};
2881 
2882 	return unix_stream_read_generic(&state, true);
2883 }
2884 
2885 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2886 			       size_t size, int flags)
2887 {
2888 	struct unix_stream_read_state state = {
2889 		.recv_actor = unix_stream_read_actor,
2890 		.socket = sock,
2891 		.msg = msg,
2892 		.size = size,
2893 		.flags = flags
2894 	};
2895 
2896 #ifdef CONFIG_BPF_SYSCALL
2897 	struct sock *sk = sock->sk;
2898 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2899 
2900 	if (prot != &unix_stream_proto)
2901 		return prot->recvmsg(sk, msg, size, flags, NULL);
2902 #endif
2903 	return unix_stream_read_generic(&state, true);
2904 }
2905 
2906 static int unix_stream_splice_actor(struct sk_buff *skb,
2907 				    int skip, int chunk,
2908 				    struct unix_stream_read_state *state)
2909 {
2910 	return skb_splice_bits(skb, state->socket->sk,
2911 			       UNIXCB(skb).consumed + skip,
2912 			       state->pipe, chunk, state->splice_flags);
2913 }
2914 
2915 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2916 				       struct pipe_inode_info *pipe,
2917 				       size_t size, unsigned int flags)
2918 {
2919 	struct unix_stream_read_state state = {
2920 		.recv_actor = unix_stream_splice_actor,
2921 		.socket = sock,
2922 		.pipe = pipe,
2923 		.size = size,
2924 		.splice_flags = flags,
2925 	};
2926 
2927 	if (unlikely(*ppos))
2928 		return -ESPIPE;
2929 
2930 	if (sock->file->f_flags & O_NONBLOCK ||
2931 	    flags & SPLICE_F_NONBLOCK)
2932 		state.flags = MSG_DONTWAIT;
2933 
2934 	return unix_stream_read_generic(&state, false);
2935 }
2936 
2937 static int unix_shutdown(struct socket *sock, int mode)
2938 {
2939 	struct sock *sk = sock->sk;
2940 	struct sock *other;
2941 
2942 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2943 		return -EINVAL;
2944 	/* This maps:
2945 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2946 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2947 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2948 	 */
2949 	++mode;
2950 
2951 	unix_state_lock(sk);
2952 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2953 	other = unix_peer(sk);
2954 	if (other)
2955 		sock_hold(other);
2956 	unix_state_unlock(sk);
2957 	sk->sk_state_change(sk);
2958 
2959 	if (other &&
2960 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2961 
2962 		int peer_mode = 0;
2963 		const struct proto *prot = READ_ONCE(other->sk_prot);
2964 
2965 		if (prot->unhash)
2966 			prot->unhash(other);
2967 		if (mode&RCV_SHUTDOWN)
2968 			peer_mode |= SEND_SHUTDOWN;
2969 		if (mode&SEND_SHUTDOWN)
2970 			peer_mode |= RCV_SHUTDOWN;
2971 		unix_state_lock(other);
2972 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2973 		unix_state_unlock(other);
2974 		other->sk_state_change(other);
2975 		if (peer_mode == SHUTDOWN_MASK)
2976 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2977 		else if (peer_mode & RCV_SHUTDOWN)
2978 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2979 	}
2980 	if (other)
2981 		sock_put(other);
2982 
2983 	return 0;
2984 }
2985 
2986 long unix_inq_len(struct sock *sk)
2987 {
2988 	struct sk_buff *skb;
2989 	long amount = 0;
2990 
2991 	if (sk->sk_state == TCP_LISTEN)
2992 		return -EINVAL;
2993 
2994 	spin_lock(&sk->sk_receive_queue.lock);
2995 	if (sk->sk_type == SOCK_STREAM ||
2996 	    sk->sk_type == SOCK_SEQPACKET) {
2997 		skb_queue_walk(&sk->sk_receive_queue, skb)
2998 			amount += unix_skb_len(skb);
2999 	} else {
3000 		skb = skb_peek(&sk->sk_receive_queue);
3001 		if (skb)
3002 			amount = skb->len;
3003 	}
3004 	spin_unlock(&sk->sk_receive_queue.lock);
3005 
3006 	return amount;
3007 }
3008 EXPORT_SYMBOL_GPL(unix_inq_len);
3009 
/* Send-side queue length of @sk, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3015 
3016 static int unix_open_file(struct sock *sk)
3017 {
3018 	struct path path;
3019 	struct file *f;
3020 	int fd;
3021 
3022 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3023 		return -EPERM;
3024 
3025 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3026 		return -ENOENT;
3027 
3028 	path = unix_sk(sk)->path;
3029 	if (!path.dentry)
3030 		return -ENOENT;
3031 
3032 	path_get(&path);
3033 
3034 	fd = get_unused_fd_flags(O_CLOEXEC);
3035 	if (fd < 0)
3036 		goto out;
3037 
3038 	f = dentry_open(&path, O_PATH, current_cred());
3039 	if (IS_ERR(f)) {
3040 		put_unused_fd(fd);
3041 		fd = PTR_ERR(f);
3042 		goto out;
3043 	}
3044 
3045 	fd_install(fd, f);
3046 out:
3047 	path_put(&path);
3048 
3049 	return fd;
3050 }
3051 
3052 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3053 {
3054 	struct sock *sk = sock->sk;
3055 	long amount = 0;
3056 	int err;
3057 
3058 	switch (cmd) {
3059 	case SIOCOUTQ:
3060 		amount = unix_outq_len(sk);
3061 		err = put_user(amount, (int __user *)arg);
3062 		break;
3063 	case SIOCINQ:
3064 		amount = unix_inq_len(sk);
3065 		if (amount < 0)
3066 			err = amount;
3067 		else
3068 			err = put_user(amount, (int __user *)arg);
3069 		break;
3070 	case SIOCUNIXFILE:
3071 		err = unix_open_file(sk);
3072 		break;
3073 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3074 	case SIOCATMARK:
3075 		{
3076 			struct sk_buff *skb;
3077 			int answ = 0;
3078 
3079 			skb = skb_peek(&sk->sk_receive_queue);
3080 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3081 				answ = 1;
3082 			err = put_user(answ, (int __user *)arg);
3083 		}
3084 		break;
3085 #endif
3086 	default:
3087 		err = -ENOIOCTLCMD;
3088 		break;
3089 	}
3090 	return err;
3091 }
3092 
3093 #ifdef CONFIG_COMPAT
/* Compat ioctl: convert @arg with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3098 #endif
3099 
3100 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3101 {
3102 	struct sock *sk = sock->sk;
3103 	__poll_t mask;
3104 	u8 shutdown;
3105 
3106 	sock_poll_wait(file, sock, wait);
3107 	mask = 0;
3108 	shutdown = READ_ONCE(sk->sk_shutdown);
3109 
3110 	/* exceptional events? */
3111 	if (READ_ONCE(sk->sk_err))
3112 		mask |= EPOLLERR;
3113 	if (shutdown == SHUTDOWN_MASK)
3114 		mask |= EPOLLHUP;
3115 	if (shutdown & RCV_SHUTDOWN)
3116 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3117 
3118 	/* readable? */
3119 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3120 		mask |= EPOLLIN | EPOLLRDNORM;
3121 	if (sk_is_readable(sk))
3122 		mask |= EPOLLIN | EPOLLRDNORM;
3123 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3124 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3125 		mask |= EPOLLPRI;
3126 #endif
3127 
3128 	/* Connection-based need to check for termination and startup */
3129 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3130 	    sk->sk_state == TCP_CLOSE)
3131 		mask |= EPOLLHUP;
3132 
3133 	/*
3134 	 * we set writable also when the other side has shut down the
3135 	 * connection. This prevents stuck sockets.
3136 	 */
3137 	if (unix_writable(sk))
3138 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3139 
3140 	return mask;
3141 }
3142 
/*
 * poll() handler for the datagram-style path (the body also checks for
 * SOCK_SEQPACKET).
 *
 * Besides the error/readable checks, write readiness depends on the peer:
 * a socket that unix_writable() accepts is still reported as not writable
 * when it has a peer that is not connected back to it, whose receive queue
 * is full, and for which unix_dgram_peer_wake_me() returns non-zero.  The
 * write-side checks are skipped entirely when the caller did not request
 * any write event.
 */
3143 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3144 				    poll_table *wait)
3145 {
3146 	struct sock *sk = sock->sk, *other;
3147 	unsigned int writable;
3148 	__poll_t mask;
3149 	u8 shutdown;
3150 
3151 	sock_poll_wait(file, sock, wait);
3152 	mask = 0;
3153 	shutdown = READ_ONCE(sk->sk_shutdown);
3154 
3155 	/* exceptional events? */
3156 	if (READ_ONCE(sk->sk_err) ||
3157 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3158 		mask |= EPOLLERR |
3159 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3160 
3161 	if (shutdown & RCV_SHUTDOWN)
3162 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3163 	if (shutdown == SHUTDOWN_MASK)
3164 		mask |= EPOLLHUP;
3165 
3166 	/* readable? */
3167 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3168 		mask |= EPOLLIN | EPOLLRDNORM;
3169 	if (sk_is_readable(sk))
3170 		mask |= EPOLLIN | EPOLLRDNORM;
3171 
3172 	/* Connection-based need to check for termination and startup */
3173 	if (sk->sk_type == SOCK_SEQPACKET) {
3174 		if (sk->sk_state == TCP_CLOSE)
3175 			mask |= EPOLLHUP;
3176 		/* connection hasn't started yet? */
3177 		if (sk->sk_state == TCP_SYN_SENT)
3178 			return mask;
3179 	}
3180 
3181 	/* No write status requested, avoid expensive OUT tests. */
3182 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3183 		return mask;
3184 
3185 	writable = unix_writable(sk);
3186 	if (writable) {
3187 		unix_state_lock(sk);
3188 
3189 		other = unix_peer(sk);
		/* The && chain short-circuits, so unix_dgram_peer_wake_me()
		 * runs only for a full, not-connected-back peer.
		 */
3190 		if (other && unix_peer(other) != sk &&
3191 		    unix_recvq_full_lockless(other) &&
3192 		    unix_dgram_peer_wake_me(sk, other))
3193 			writable = 0;
3194 
3195 		unix_state_unlock(sk);
3196 	}
3197 
3198 	if (writable)
3199 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3200 	else
3201 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3202 
3203 	return mask;
3204 }
3205 
3206 #ifdef CONFIG_PROC_FS
3207 
/*
 * Encoding of the seq_file position: the bits above BUCKET_SPACE hold the
 * hash bucket index, the low BUCKET_SPACE bits hold the offset of the
 * socket within that bucket.  The offset is 1-based: unix_from_bucket()
 * counts sockets starting at 1, and a new bucket is entered at offset 1.
 */
3208 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3209 
/* Extract the bucket index / in-bucket offset from a position, or build a
 * position from the two.
 */
3210 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3211 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3212 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3213 
3214 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3215 {
3216 	unsigned long offset = get_offset(*pos);
3217 	unsigned long bucket = get_bucket(*pos);
3218 	unsigned long count = 0;
3219 	struct sock *sk;
3220 
3221 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3222 	     sk; sk = sk_next(sk)) {
3223 		if (++count == offset)
3224 			break;
3225 	}
3226 
3227 	return sk;
3228 }
3229 
3230 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3231 {
3232 	unsigned long bucket = get_bucket(*pos);
3233 	struct net *net = seq_file_net(seq);
3234 	struct sock *sk;
3235 
3236 	while (bucket < UNIX_HASH_SIZE) {
3237 		spin_lock(&net->unx.table.locks[bucket]);
3238 
3239 		sk = unix_from_bucket(seq, pos);
3240 		if (sk)
3241 			return sk;
3242 
3243 		spin_unlock(&net->unx.table.locks[bucket]);
3244 
3245 		*pos = set_bucket_offset(++bucket, 1);
3246 	}
3247 
3248 	return NULL;
3249 }
3250 
3251 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3252 				  loff_t *pos)
3253 {
3254 	unsigned long bucket = get_bucket(*pos);
3255 
3256 	sk = sk_next(sk);
3257 	if (sk)
3258 		return sk;
3259 
3260 
3261 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3262 
3263 	*pos = set_bucket_offset(++bucket, 1);
3264 
3265 	return unix_get_first(seq, pos);
3266 }
3267 
3268 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3269 {
3270 	if (!*pos)
3271 		return SEQ_START_TOKEN;
3272 
3273 	return unix_get_first(seq, pos);
3274 }
3275 
3276 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3277 {
3278 	++*pos;
3279 
3280 	if (v == SEQ_START_TOKEN)
3281 		return unix_get_first(seq, pos);
3282 
3283 	return unix_get_next(seq, v, pos);
3284 }
3285 
3286 static void unix_seq_stop(struct seq_file *seq, void *v)
3287 {
3288 	struct sock *sk = v;
3289 
3290 	if (sk)
3291 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3292 }
3293 
3294 static int unix_seq_show(struct seq_file *seq, void *v)
3295 {
3296 
3297 	if (v == SEQ_START_TOKEN)
3298 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3299 			 "Inode Path\n");
3300 	else {
3301 		struct sock *s = v;
3302 		struct unix_sock *u = unix_sk(s);
3303 		unix_state_lock(s);
3304 
3305 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3306 			s,
3307 			refcount_read(&s->sk_refcnt),
3308 			0,
3309 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3310 			s->sk_type,
3311 			s->sk_socket ?
3312 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3313 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3314 			sock_i_ino(s));
3315 
3316 		if (u->addr) {	// under a hash table lock here
3317 			int i, len;
3318 			seq_putc(seq, ' ');
3319 
3320 			i = 0;
3321 			len = u->addr->len -
3322 				offsetof(struct sockaddr_un, sun_path);
3323 			if (u->addr->name->sun_path[0]) {
3324 				len--;
3325 			} else {
3326 				seq_putc(seq, '@');
3327 				i++;
3328 			}
3329 			for ( ; i < len; i++)
3330 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3331 					 '@');
3332 		}
3333 		unix_state_unlock(s);
3334 		seq_putc(seq, '\n');
3335 	}
3336 
3337 	return 0;
3338 }
3339 
3340 static const struct seq_operations unix_seq_ops = {
3341 	.start  = unix_seq_start,
3342 	.next   = unix_seq_next,
3343 	.stop   = unix_seq_stop,
3344 	.show   = unix_seq_show,
3345 };
3346 
3347 #ifdef CONFIG_BPF_SYSCALL
3348 struct bpf_unix_iter_state {
3349 	struct seq_net_private p;
3350 	unsigned int cur_sk;
3351 	unsigned int end_sk;
3352 	unsigned int max_sk;
3353 	struct sock **batch;
3354 	bool st_bucket_done;
3355 };
3356 
3357 struct bpf_iter__unix {
3358 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3359 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3360 	uid_t uid __aligned(8);
3361 };
3362 
3363 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3364 			      struct unix_sock *unix_sk, uid_t uid)
3365 {
3366 	struct bpf_iter__unix ctx;
3367 
3368 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3369 	ctx.meta = meta;
3370 	ctx.unix_sk = unix_sk;
3371 	ctx.uid = uid;
3372 	return bpf_iter_run_prog(prog, &ctx);
3373 }
3374 
3375 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3376 
3377 {
3378 	struct bpf_unix_iter_state *iter = seq->private;
3379 	unsigned int expected = 1;
3380 	struct sock *sk;
3381 
3382 	sock_hold(start_sk);
3383 	iter->batch[iter->end_sk++] = start_sk;
3384 
3385 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3386 		if (iter->end_sk < iter->max_sk) {
3387 			sock_hold(sk);
3388 			iter->batch[iter->end_sk++] = sk;
3389 		}
3390 
3391 		expected++;
3392 	}
3393 
3394 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3395 
3396 	return expected;
3397 }
3398 
3399 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3400 {
3401 	while (iter->cur_sk < iter->end_sk)
3402 		sock_put(iter->batch[iter->cur_sk++]);
3403 }
3404 
3405 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3406 				       unsigned int new_batch_sz)
3407 {
3408 	struct sock **new_batch;
3409 
3410 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3411 			     GFP_USER | __GFP_NOWARN);
3412 	if (!new_batch)
3413 		return -ENOMEM;
3414 
3415 	bpf_iter_unix_put_batch(iter);
3416 	kvfree(iter->batch);
3417 	iter->batch = new_batch;
3418 	iter->max_sk = new_batch_sz;
3419 
3420 	return 0;
3421 }
3422 
3423 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3424 					loff_t *pos)
3425 {
3426 	struct bpf_unix_iter_state *iter = seq->private;
3427 	unsigned int expected;
3428 	bool resized = false;
3429 	struct sock *sk;
3430 
3431 	if (iter->st_bucket_done)
3432 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3433 
3434 again:
3435 	/* Get a new batch */
3436 	iter->cur_sk = 0;
3437 	iter->end_sk = 0;
3438 
3439 	sk = unix_get_first(seq, pos);
3440 	if (!sk)
3441 		return NULL; /* Done */
3442 
3443 	expected = bpf_iter_unix_hold_batch(seq, sk);
3444 
3445 	if (iter->end_sk == expected) {
3446 		iter->st_bucket_done = true;
3447 		return sk;
3448 	}
3449 
3450 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3451 		resized = true;
3452 		goto again;
3453 	}
3454 
3455 	return sk;
3456 }
3457 
3458 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3459 {
3460 	if (!*pos)
3461 		return SEQ_START_TOKEN;
3462 
3463 	/* bpf iter does not support lseek, so it always
3464 	 * continue from where it was stop()-ped.
3465 	 */
3466 	return bpf_iter_unix_batch(seq, pos);
3467 }
3468 
3469 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3470 {
3471 	struct bpf_unix_iter_state *iter = seq->private;
3472 	struct sock *sk;
3473 
3474 	/* Whenever seq_next() is called, the iter->cur_sk is
3475 	 * done with seq_show(), so advance to the next sk in
3476 	 * the batch.
3477 	 */
3478 	if (iter->cur_sk < iter->end_sk)
3479 		sock_put(iter->batch[iter->cur_sk++]);
3480 
3481 	++*pos;
3482 
3483 	if (iter->cur_sk < iter->end_sk)
3484 		sk = iter->batch[iter->cur_sk];
3485 	else
3486 		sk = bpf_iter_unix_batch(seq, pos);
3487 
3488 	return sk;
3489 }
3490 
3491 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3492 {
3493 	struct bpf_iter_meta meta;
3494 	struct bpf_prog *prog;
3495 	struct sock *sk = v;
3496 	uid_t uid;
3497 	bool slow;
3498 	int ret;
3499 
3500 	if (v == SEQ_START_TOKEN)
3501 		return 0;
3502 
3503 	slow = lock_sock_fast(sk);
3504 
3505 	if (unlikely(sk_unhashed(sk))) {
3506 		ret = SEQ_SKIP;
3507 		goto unlock;
3508 	}
3509 
3510 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3511 	meta.seq = seq;
3512 	prog = bpf_iter_get_info(&meta, false);
3513 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3514 unlock:
3515 	unlock_sock_fast(sk, slow);
3516 	return ret;
3517 }
3518 
3519 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3520 {
3521 	struct bpf_unix_iter_state *iter = seq->private;
3522 	struct bpf_iter_meta meta;
3523 	struct bpf_prog *prog;
3524 
3525 	if (!v) {
3526 		meta.seq = seq;
3527 		prog = bpf_iter_get_info(&meta, true);
3528 		if (prog)
3529 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3530 	}
3531 
3532 	if (iter->cur_sk < iter->end_sk)
3533 		bpf_iter_unix_put_batch(iter);
3534 }
3535 
3536 static const struct seq_operations bpf_iter_unix_seq_ops = {
3537 	.start	= bpf_iter_unix_seq_start,
3538 	.next	= bpf_iter_unix_seq_next,
3539 	.stop	= bpf_iter_unix_seq_stop,
3540 	.show	= bpf_iter_unix_seq_show,
3541 };
3542 #endif
3543 #endif
3544 
3545 static const struct net_proto_family unix_family_ops = {
3546 	.family = PF_UNIX,
3547 	.create = unix_create,
3548 	.owner	= THIS_MODULE,
3549 };
3550 
3551 
3552 static int __net_init unix_net_init(struct net *net)
3553 {
3554 	int i;
3555 
3556 	net->unx.sysctl_max_dgram_qlen = 10;
3557 	if (unix_sysctl_register(net))
3558 		goto out;
3559 
3560 #ifdef CONFIG_PROC_FS
3561 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3562 			     sizeof(struct seq_net_private)))
3563 		goto err_sysctl;
3564 #endif
3565 
3566 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3567 					      sizeof(spinlock_t), GFP_KERNEL);
3568 	if (!net->unx.table.locks)
3569 		goto err_proc;
3570 
3571 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3572 						sizeof(struct hlist_head),
3573 						GFP_KERNEL);
3574 	if (!net->unx.table.buckets)
3575 		goto free_locks;
3576 
3577 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3578 		spin_lock_init(&net->unx.table.locks[i]);
3579 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3580 	}
3581 
3582 	return 0;
3583 
3584 free_locks:
3585 	kvfree(net->unx.table.locks);
3586 err_proc:
3587 #ifdef CONFIG_PROC_FS
3588 	remove_proc_entry("unix", net->proc_net);
3589 err_sysctl:
3590 #endif
3591 	unix_sysctl_unregister(net);
3592 out:
3593 	return -ENOMEM;
3594 }
3595 
3596 static void __net_exit unix_net_exit(struct net *net)
3597 {
3598 	kvfree(net->unx.table.buckets);
3599 	kvfree(net->unx.table.locks);
3600 	unix_sysctl_unregister(net);
3601 	remove_proc_entry("unix", net->proc_net);
3602 }
3603 
3604 static struct pernet_operations unix_net_ops = {
3605 	.init = unix_net_init,
3606 	.exit = unix_net_exit,
3607 };
3608 
3609 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3610 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3611 		     struct unix_sock *unix_sk, uid_t uid)
3612 
3613 #define INIT_BATCH_SZ 16
3614 
3615 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3616 {
3617 	struct bpf_unix_iter_state *iter = priv_data;
3618 	int err;
3619 
3620 	err = bpf_iter_init_seq_net(priv_data, aux);
3621 	if (err)
3622 		return err;
3623 
3624 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3625 	if (err) {
3626 		bpf_iter_fini_seq_net(priv_data);
3627 		return err;
3628 	}
3629 
3630 	return 0;
3631 }
3632 
3633 static void bpf_iter_fini_unix(void *priv_data)
3634 {
3635 	struct bpf_unix_iter_state *iter = priv_data;
3636 
3637 	bpf_iter_fini_seq_net(priv_data);
3638 	kvfree(iter->batch);
3639 }
3640 
3641 static const struct bpf_iter_seq_info unix_seq_info = {
3642 	.seq_ops		= &bpf_iter_unix_seq_ops,
3643 	.init_seq_private	= bpf_iter_init_unix,
3644 	.fini_seq_private	= bpf_iter_fini_unix,
3645 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3646 };
3647 
3648 static const struct bpf_func_proto *
3649 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3650 			     const struct bpf_prog *prog)
3651 {
3652 	switch (func_id) {
3653 	case BPF_FUNC_setsockopt:
3654 		return &bpf_sk_setsockopt_proto;
3655 	case BPF_FUNC_getsockopt:
3656 		return &bpf_sk_getsockopt_proto;
3657 	default:
3658 		return NULL;
3659 	}
3660 }
3661 
3662 static struct bpf_iter_reg unix_reg_info = {
3663 	.target			= "unix",
3664 	.ctx_arg_info_size	= 1,
3665 	.ctx_arg_info		= {
3666 		{ offsetof(struct bpf_iter__unix, unix_sk),
3667 		  PTR_TO_BTF_ID_OR_NULL },
3668 	},
3669 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3670 	.seq_info		= &unix_seq_info,
3671 };
3672 
3673 static void __init bpf_iter_register(void)
3674 {
3675 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3676 	if (bpf_iter_reg_target(&unix_reg_info))
3677 		pr_warn("Warning: could not register bpf iterator unix\n");
3678 }
3679 #endif
3680 
3681 static int __init af_unix_init(void)
3682 {
3683 	int i, rc = -1;
3684 
3685 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3686 
3687 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3688 		spin_lock_init(&bsd_socket_locks[i]);
3689 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3690 	}
3691 
3692 	rc = proto_register(&unix_dgram_proto, 1);
3693 	if (rc != 0) {
3694 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3695 		goto out;
3696 	}
3697 
3698 	rc = proto_register(&unix_stream_proto, 1);
3699 	if (rc != 0) {
3700 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3701 		proto_unregister(&unix_dgram_proto);
3702 		goto out;
3703 	}
3704 
3705 	sock_register(&unix_family_ops);
3706 	register_pernet_subsys(&unix_net_ops);
3707 	unix_bpf_build_proto();
3708 
3709 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3710 	bpf_iter_register();
3711 #endif
3712 
3713 out:
3714 	return rc;
3715 }
3716 
3717 /* Later than subsys_initcall() because we depend on stuff initialised there */
3718 fs_initcall(af_unix_init);
3719