xref: /linux/net/unix/af_unix.c (revision 50f2944009a25bb39a09f2f7bab64a73ce928bef)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko Eißfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it avoids a huge amount
34  *					of hashed socks (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, nor give the blksize as the high water
54  *		mark and a fake inode identifier (nor the BSD first-socket-fstat-twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns a 0-length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has a connect that forgets to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  that start with a 0 byte, so that this name space does not intersect
75  *		  with BSD names.
76  */
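
/* Illustrative userspace sketch (not part of this file's build): binding
 * into the abstract name space described above.  The leading 0 byte in
 * sun_path selects an abstract name, and the name's length is conveyed
 * solely by the addr_len passed to bind(), so no trailing NUL is needed.
 * The name "example" is an arbitrary placeholder.
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	socklen_t len;
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	sun.sun_path[0] = '\0';
 *	memcpy(sun.sun_path + 1, "example", 7);
 *	len = offsetof(struct sockaddr_un, sun_path) + 1 + 7;
 *	bind(fd, (struct sockaddr *)&sun, len);
 */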
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/freezer.h>
116 #include <linux/file.h>
117 #include <linux/btf_ids.h>
118 
119 #include "scm.h"
120 
121 static atomic_long_t unix_nr_socks;
122 
123 /* SMP locking strategy:
124  *    hash table is protected with spinlock.
125  *    each socket state is protected by separate spinlock.
126  */
127 
128 static unsigned int unix_unbound_hash(struct sock *sk)
129 {
130 	unsigned long hash = (unsigned long)sk;
131 
132 	hash ^= hash >> 16;
133 	hash ^= hash >> 8;
134 	hash ^= sk->sk_type;
135 
136 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
137 }
138 
139 static unsigned int unix_bsd_hash(struct inode *i)
140 {
141 	return i->i_ino & UNIX_HASH_MOD;
142 }
143 
144 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
145 				       int addr_len, int type)
146 {
147 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
148 	unsigned int hash;
149 
150 	hash = (__force unsigned int)csum_fold(csum);
151 	hash ^= hash >> 8;
152 	hash ^= type;
153 
154 	return hash & UNIX_HASH_MOD;
155 }
156 
157 static void unix_table_double_lock(struct net *net,
158 				   unsigned int hash1, unsigned int hash2)
159 {
160 	/* hash1 and hash2 are never the same because
161 	 * one is between 0 and UNIX_HASH_MOD, and
162 	 * the other is between UNIX_HASH_MOD + 1 and UNIX_HASH_SIZE - 1.
163 	 */
164 	if (hash1 > hash2)
165 		swap(hash1, hash2);
166 
167 	spin_lock(&net->unx.table.locks[hash1]);
168 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
169 }
170 
171 static void unix_table_double_unlock(struct net *net,
172 				     unsigned int hash1, unsigned int hash2)
173 {
174 	spin_unlock(&net->unx.table.locks[hash1]);
175 	spin_unlock(&net->unx.table.locks[hash2]);
176 }
177 
178 #ifdef CONFIG_SECURITY_NETWORK
179 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
180 {
181 	UNIXCB(skb).secid = scm->secid;
182 }
183 
184 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
185 {
186 	scm->secid = UNIXCB(skb).secid;
187 }
188 
189 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
190 {
191 	return (scm->secid == UNIXCB(skb).secid);
192 }
193 #else
194 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
195 { }
196 
197 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
198 { }
199 
200 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
201 {
202 	return true;
203 }
204 #endif /* CONFIG_SECURITY_NETWORK */
205 
206 #define unix_peer(sk) (unix_sk(sk)->peer)
207 
208 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
209 {
210 	return unix_peer(osk) == sk;
211 }
212 
213 static inline int unix_may_send(struct sock *sk, struct sock *osk)
214 {
215 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
216 }
217 
218 static inline int unix_recvq_full(const struct sock *sk)
219 {
220 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
221 }
222 
223 static inline int unix_recvq_full_lockless(const struct sock *sk)
224 {
225 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
226 		READ_ONCE(sk->sk_max_ack_backlog);
227 }
228 
229 struct sock *unix_peer_get(struct sock *s)
230 {
231 	struct sock *peer;
232 
233 	unix_state_lock(s);
234 	peer = unix_peer(s);
235 	if (peer)
236 		sock_hold(peer);
237 	unix_state_unlock(s);
238 	return peer;
239 }
240 EXPORT_SYMBOL_GPL(unix_peer_get);
241 
242 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
243 					     int addr_len)
244 {
245 	struct unix_address *addr;
246 
247 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
248 	if (!addr)
249 		return NULL;
250 
251 	refcount_set(&addr->refcnt, 1);
252 	addr->len = addr_len;
253 	memcpy(addr->name, sunaddr, addr_len);
254 
255 	return addr;
256 }
257 
258 static inline void unix_release_addr(struct unix_address *addr)
259 {
260 	if (refcount_dec_and_test(&addr->refcnt))
261 		kfree(addr);
262 }
263 
264 /*
265  *	Check unix socket name:
266  *		- should not be zero length.
267  *		- if it does not start with a zero byte, it should be NUL terminated (FS object)
268  *		- if it starts with a zero byte, it is an abstract name.
269  */
270 
271 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
272 {
273 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
274 	    addr_len > sizeof(*sunaddr))
275 		return -EINVAL;
276 
277 	if (sunaddr->sun_family != AF_UNIX)
278 		return -EINVAL;
279 
280 	return 0;
281 }
282 
283 static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
284 {
285 	/* This may look like an off-by-one error but it is a bit more
286 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
287 	 * sun_path[108] doesn't exist as such.  However, in kernel space
288 	 * we are guaranteed that it is a valid memory location in our
289 	 * kernel address buffer because syscall functions always pass
290 	 * a pointer to a struct sockaddr_storage, which has a bigger
291 	 * buffer than 108 bytes.
292 	 */
293 	((char *)sunaddr)[addr_len] = 0;
294 }
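
/* Illustrative userspace view (not part of this file's build) of the
 * termination done in unix_mkname_bsd() above: a pathname bind works
 * whether or not userspace includes the trailing NUL in addr_len, since
 * the kernel terminates at addr_len and then takes strlen().  The path
 * held in "path" is a placeholder; both lengths below bind the same name.
 *
 *	socklen_t len1 = offsetof(struct sockaddr_un, sun_path) + strlen(path);
 *	socklen_t len2 = offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1;
 *
 *	bind(fd, (struct sockaddr *)&sun, len1);	// len2 works equally well
 */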
295 
296 static void __unix_remove_socket(struct sock *sk)
297 {
298 	sk_del_node_init(sk);
299 }
300 
301 static void __unix_insert_socket(struct net *net, struct sock *sk)
302 {
303 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
304 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
305 }
306 
307 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
308 				 struct unix_address *addr, unsigned int hash)
309 {
310 	__unix_remove_socket(sk);
311 	smp_store_release(&unix_sk(sk)->addr, addr);
312 
313 	sk->sk_hash = hash;
314 	__unix_insert_socket(net, sk);
315 }
316 
317 static void unix_remove_socket(struct net *net, struct sock *sk)
318 {
319 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
320 	__unix_remove_socket(sk);
321 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
322 }
323 
324 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
325 {
326 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
327 	__unix_insert_socket(net, sk);
328 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
329 }
330 
331 static struct sock *__unix_find_socket_byname(struct net *net,
332 					      struct sockaddr_un *sunname,
333 					      int len, unsigned int hash)
334 {
335 	struct sock *s;
336 
337 	sk_for_each(s, &net->unx.table.buckets[hash]) {
338 		struct unix_sock *u = unix_sk(s);
339 
340 		if (u->addr->len == len &&
341 		    !memcmp(u->addr->name, sunname, len))
342 			return s;
343 	}
344 	return NULL;
345 }
346 
347 static inline struct sock *unix_find_socket_byname(struct net *net,
348 						   struct sockaddr_un *sunname,
349 						   int len, unsigned int hash)
350 {
351 	struct sock *s;
352 
353 	spin_lock(&net->unx.table.locks[hash]);
354 	s = __unix_find_socket_byname(net, sunname, len, hash);
355 	if (s)
356 		sock_hold(s);
357 	spin_unlock(&net->unx.table.locks[hash]);
358 	return s;
359 }
360 
361 static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
362 {
363 	unsigned int hash = unix_bsd_hash(i);
364 	struct sock *s;
365 
366 	spin_lock(&net->unx.table.locks[hash]);
367 	sk_for_each(s, &net->unx.table.buckets[hash]) {
368 		struct dentry *dentry = unix_sk(s)->path.dentry;
369 
370 		if (dentry && d_backing_inode(dentry) == i) {
371 			sock_hold(s);
372 			spin_unlock(&net->unx.table.locks[hash]);
373 			return s;
374 		}
375 	}
376 	spin_unlock(&net->unx.table.locks[hash]);
377 	return NULL;
378 }
379 
380 /* Support code for asymmetrically connected dgram sockets
381  *
382  * If a datagram socket is connected to a socket not itself connected
383  * back to the first socket (e.g., /dev/log), clients may only enqueue
384  * more messages if the present receive queue of the server socket is
385  * not "too large". This means there's a second writability condition
386  * that poll and sendmsg need to test. The dgram recv code will do a
387  * wake up on the peer_wait wait queue of a socket upon reception of a
388  * datagram; this wake up needs to be propagated to sleeping would-be
389  * writers since these might not have sent anything so far. This can't
390  * be accomplished via poll_wait because the lifetime of the server
391  * socket might be less than that of its clients if these break their
392  * association with it or if the server socket is closed while clients
393  * are still connected to it, and there's no way to inform "a polling
394  * implementation" that it should let go of a certain wait queue.
395  *
396  * In order to propagate a wake up, a wait_queue_entry_t of the client
397  * socket is enqueued on the peer_wait queue of the server socket,
398  * whose wake function does a wake_up on the ordinary client socket
399  * wait queue. This connection is established whenever a write (or
400  * poll for write) hits the flow-control condition and is broken when
401  * the association to the server socket is dissolved or after a wake
402  * up was relayed.
403  */
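
/* Illustrative userspace sketch (not part of this file's build) of such an
 * asymmetric association: the "server" below never connects back to its
 * clients, so each client's sendmsg()/poll() must also honour the server
 * socket's receive-queue limit as described above.  The path
 * "/tmp/example.sock" is a placeholder and error handling is omitted.
 *
 *	int srv = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	int cli = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	strcpy(sun.sun_path, "/tmp/example.sock");
 *	bind(srv, (struct sockaddr *)&sun, sizeof(sun));
 *	connect(cli, (struct sockaddr *)&sun, sizeof(sun));
 *	send(cli, "x", 1, 0);	// blocks/EAGAINs once srv's queue is full
 */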
404 
405 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
406 				      void *key)
407 {
408 	struct unix_sock *u;
409 	wait_queue_head_t *u_sleep;
410 
411 	u = container_of(q, struct unix_sock, peer_wake);
412 
413 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
414 			    q);
415 	u->peer_wake.private = NULL;
416 
417 	/* relaying can only happen while the wq still exists */
418 	u_sleep = sk_sleep(&u->sk);
419 	if (u_sleep)
420 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
421 
422 	return 0;
423 }
424 
425 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
426 {
427 	struct unix_sock *u, *u_other;
428 	int rc;
429 
430 	u = unix_sk(sk);
431 	u_other = unix_sk(other);
432 	rc = 0;
433 	spin_lock(&u_other->peer_wait.lock);
434 
435 	if (!u->peer_wake.private) {
436 		u->peer_wake.private = other;
437 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
438 
439 		rc = 1;
440 	}
441 
442 	spin_unlock(&u_other->peer_wait.lock);
443 	return rc;
444 }
445 
446 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
447 					    struct sock *other)
448 {
449 	struct unix_sock *u, *u_other;
450 
451 	u = unix_sk(sk);
452 	u_other = unix_sk(other);
453 	spin_lock(&u_other->peer_wait.lock);
454 
455 	if (u->peer_wake.private == other) {
456 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
457 		u->peer_wake.private = NULL;
458 	}
459 
460 	spin_unlock(&u_other->peer_wait.lock);
461 }
462 
463 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
464 						   struct sock *other)
465 {
466 	unix_dgram_peer_wake_disconnect(sk, other);
467 	wake_up_interruptible_poll(sk_sleep(sk),
468 				   EPOLLOUT |
469 				   EPOLLWRNORM |
470 				   EPOLLWRBAND);
471 }
472 
473 /* preconditions:
474  *	- unix_peer(sk) == other
475  *	- association is stable
476  */
477 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
478 {
479 	int connected;
480 
481 	connected = unix_dgram_peer_wake_connect(sk, other);
482 
483 	/* If other is SOCK_DEAD, we want to make sure we signal
484 	 * POLLOUT, such that a subsequent write() can get a
485 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
486 	 * to other and it's full, we will hang waiting for POLLOUT.
487 	 */
488 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
489 		return 1;
490 
491 	if (connected)
492 		unix_dgram_peer_wake_disconnect(sk, other);
493 
494 	return 0;
495 }
496 
497 static int unix_writable(const struct sock *sk)
498 {
499 	return sk->sk_state != TCP_LISTEN &&
500 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
501 }
502 
503 static void unix_write_space(struct sock *sk)
504 {
505 	struct socket_wq *wq;
506 
507 	rcu_read_lock();
508 	if (unix_writable(sk)) {
509 		wq = rcu_dereference(sk->sk_wq);
510 		if (skwq_has_sleeper(wq))
511 			wake_up_interruptible_sync_poll(&wq->wait,
512 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
513 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
514 	}
515 	rcu_read_unlock();
516 }
517 
518 /* When a dgram socket disconnects (or changes its peer), we clear its
519  * receive queue of packets that arrived from the previous peer. First,
520  * this allows flow control based only on wmem_alloc; second, an sk
521  * connected to a peer may receive messages only from that peer. */
522 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
523 {
524 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
525 		skb_queue_purge(&sk->sk_receive_queue);
526 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
527 
528 		/* If one link of a bidirectional dgram pipe is disconnected,
529 		 * we signal an error. Messages are lost. Do not do this when
530 		 * the peer was not connected to us.
531 		 */
532 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
533 			other->sk_err = ECONNRESET;
534 			sk_error_report(other);
535 		}
536 	}
537 	other->sk_state = TCP_CLOSE;
538 }
539 
540 static void unix_sock_destructor(struct sock *sk)
541 {
542 	struct unix_sock *u = unix_sk(sk);
543 
544 	skb_queue_purge(&sk->sk_receive_queue);
545 
546 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
547 	if (u->oob_skb) {
548 		kfree_skb(u->oob_skb);
549 		u->oob_skb = NULL;
550 	}
551 #endif
552 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
553 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
554 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
555 	if (!sock_flag(sk, SOCK_DEAD)) {
556 		pr_info("Attempt to release alive unix socket: %p\n", sk);
557 		return;
558 	}
559 
560 	if (u->addr)
561 		unix_release_addr(u->addr);
562 
563 	atomic_long_dec(&unix_nr_socks);
564 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
565 #ifdef UNIX_REFCNT_DEBUG
566 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
567 		atomic_long_read(&unix_nr_socks));
568 #endif
569 }
570 
571 static void unix_release_sock(struct sock *sk, int embrion)
572 {
573 	struct unix_sock *u = unix_sk(sk);
574 	struct sock *skpair;
575 	struct sk_buff *skb;
576 	struct path path;
577 	int state;
578 
579 	unix_remove_socket(sock_net(sk), sk);
580 
581 	/* Clear state */
582 	unix_state_lock(sk);
583 	sock_orphan(sk);
584 	sk->sk_shutdown = SHUTDOWN_MASK;
585 	path	     = u->path;
586 	u->path.dentry = NULL;
587 	u->path.mnt = NULL;
588 	state = sk->sk_state;
589 	sk->sk_state = TCP_CLOSE;
590 
591 	skpair = unix_peer(sk);
592 	unix_peer(sk) = NULL;
593 
594 	unix_state_unlock(sk);
595 
596 	wake_up_interruptible_all(&u->peer_wait);
597 
598 	if (skpair != NULL) {
599 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
600 			unix_state_lock(skpair);
601 			/* No more writes */
602 			skpair->sk_shutdown = SHUTDOWN_MASK;
603 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
604 				skpair->sk_err = ECONNRESET;
605 			unix_state_unlock(skpair);
606 			skpair->sk_state_change(skpair);
607 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
608 		}
609 
610 		unix_dgram_peer_wake_disconnect(sk, skpair);
611 		sock_put(skpair); /* It may now die */
612 	}
613 
614 	/* Try to flush out this socket. Throw out buffers at least */
615 
616 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
617 		if (state == TCP_LISTEN)
618 			unix_release_sock(skb->sk, 1);
619 		/* passed fds are erased in the kfree_skb hook	      */
620 		UNIXCB(skb).consumed = skb->len;
621 		kfree_skb(skb);
622 	}
623 
624 	if (path.dentry)
625 		path_put(&path);
626 
627 	sock_put(sk);
628 
629 	/* ---- Socket is dead now and most probably destroyed ---- */
630 
631 	/*
632 	 * Fixme: BSD difference: In BSD all sockets connected to us get
633 	 *	  ECONNRESET and we die on the spot. In Linux we behave
634 	 *	  like files and pipes do and wait for the last
635 	 *	  dereference.
636 	 *
637 	 * Can't we simply set sock->err?
638 	 *
639 	 *	  What does the above comment talk about? --ANK(980817)
640 	 */
641 
642 	if (unix_tot_inflight)
643 		unix_gc();		/* Garbage collect fds */
644 }
645 
646 static void init_peercred(struct sock *sk)
647 {
648 	const struct cred *old_cred;
649 	struct pid *old_pid;
650 
651 	spin_lock(&sk->sk_peer_lock);
652 	old_pid = sk->sk_peer_pid;
653 	old_cred = sk->sk_peer_cred;
654 	sk->sk_peer_pid  = get_pid(task_tgid(current));
655 	sk->sk_peer_cred = get_current_cred();
656 	spin_unlock(&sk->sk_peer_lock);
657 
658 	put_pid(old_pid);
659 	put_cred(old_cred);
660 }
661 
662 static void copy_peercred(struct sock *sk, struct sock *peersk)
663 {
664 	const struct cred *old_cred;
665 	struct pid *old_pid;
666 
667 	if (sk < peersk) {
668 		spin_lock(&sk->sk_peer_lock);
669 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
670 	} else {
671 		spin_lock(&peersk->sk_peer_lock);
672 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
673 	}
674 	old_pid = sk->sk_peer_pid;
675 	old_cred = sk->sk_peer_cred;
676 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
677 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
678 
679 	spin_unlock(&sk->sk_peer_lock);
680 	spin_unlock(&peersk->sk_peer_lock);
681 
682 	put_pid(old_pid);
683 	put_cred(old_cred);
684 }
685 
686 static int unix_listen(struct socket *sock, int backlog)
687 {
688 	int err;
689 	struct sock *sk = sock->sk;
690 	struct unix_sock *u = unix_sk(sk);
691 
692 	err = -EOPNOTSUPP;
693 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
694 		goto out;	/* Only stream/seqpacket sockets accept */
695 	err = -EINVAL;
696 	if (!u->addr)
697 		goto out;	/* No listens on an unbound socket */
698 	unix_state_lock(sk);
699 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
700 		goto out_unlock;
701 	if (backlog > sk->sk_max_ack_backlog)
702 		wake_up_interruptible_all(&u->peer_wait);
703 	sk->sk_max_ack_backlog	= backlog;
704 	sk->sk_state		= TCP_LISTEN;
705 	/* set credentials so connect can copy them */
706 	init_peercred(sk);
707 	err = 0;
708 
709 out_unlock:
710 	unix_state_unlock(sk);
711 out:
712 	return err;
713 }
714 
715 static int unix_release(struct socket *);
716 static int unix_bind(struct socket *, struct sockaddr *, int);
717 static int unix_stream_connect(struct socket *, struct sockaddr *,
718 			       int addr_len, int flags);
719 static int unix_socketpair(struct socket *, struct socket *);
720 static int unix_accept(struct socket *, struct socket *, int, bool);
721 static int unix_getname(struct socket *, struct sockaddr *, int);
722 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
723 static __poll_t unix_dgram_poll(struct file *, struct socket *,
724 				    poll_table *);
725 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
726 #ifdef CONFIG_COMPAT
727 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
728 #endif
729 static int unix_shutdown(struct socket *, int);
730 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
731 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
732 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
733 				    size_t size, int flags);
734 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
735 				       struct pipe_inode_info *, size_t size,
736 				       unsigned int flags);
737 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
738 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
739 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
740 			  sk_read_actor_t recv_actor);
741 static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
742 				 sk_read_actor_t recv_actor);
743 static int unix_dgram_connect(struct socket *, struct sockaddr *,
744 			      int, int);
745 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
746 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
747 				  int);
748 
749 static int unix_set_peek_off(struct sock *sk, int val)
750 {
751 	struct unix_sock *u = unix_sk(sk);
752 
753 	if (mutex_lock_interruptible(&u->iolock))
754 		return -EINTR;
755 
756 	sk->sk_peek_off = val;
757 	mutex_unlock(&u->iolock);
758 
759 	return 0;
760 }
761 
762 #ifdef CONFIG_PROC_FS
763 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
764 {
765 	struct sock *sk = sock->sk;
766 	struct unix_sock *u;
767 
768 	if (sk) {
769 		u = unix_sk(sock->sk);
770 		seq_printf(m, "scm_fds: %u\n",
771 			   atomic_read(&u->scm_stat.nr_fds));
772 	}
773 }
774 #else
775 #define unix_show_fdinfo NULL
776 #endif
777 
778 static const struct proto_ops unix_stream_ops = {
779 	.family =	PF_UNIX,
780 	.owner =	THIS_MODULE,
781 	.release =	unix_release,
782 	.bind =		unix_bind,
783 	.connect =	unix_stream_connect,
784 	.socketpair =	unix_socketpair,
785 	.accept =	unix_accept,
786 	.getname =	unix_getname,
787 	.poll =		unix_poll,
788 	.ioctl =	unix_ioctl,
789 #ifdef CONFIG_COMPAT
790 	.compat_ioctl =	unix_compat_ioctl,
791 #endif
792 	.listen =	unix_listen,
793 	.shutdown =	unix_shutdown,
794 	.sendmsg =	unix_stream_sendmsg,
795 	.recvmsg =	unix_stream_recvmsg,
796 	.read_sock =	unix_stream_read_sock,
797 	.mmap =		sock_no_mmap,
798 	.sendpage =	unix_stream_sendpage,
799 	.splice_read =	unix_stream_splice_read,
800 	.set_peek_off =	unix_set_peek_off,
801 	.show_fdinfo =	unix_show_fdinfo,
802 };
803 
804 static const struct proto_ops unix_dgram_ops = {
805 	.family =	PF_UNIX,
806 	.owner =	THIS_MODULE,
807 	.release =	unix_release,
808 	.bind =		unix_bind,
809 	.connect =	unix_dgram_connect,
810 	.socketpair =	unix_socketpair,
811 	.accept =	sock_no_accept,
812 	.getname =	unix_getname,
813 	.poll =		unix_dgram_poll,
814 	.ioctl =	unix_ioctl,
815 #ifdef CONFIG_COMPAT
816 	.compat_ioctl =	unix_compat_ioctl,
817 #endif
818 	.listen =	sock_no_listen,
819 	.shutdown =	unix_shutdown,
820 	.sendmsg =	unix_dgram_sendmsg,
821 	.read_sock =	unix_read_sock,
822 	.recvmsg =	unix_dgram_recvmsg,
823 	.mmap =		sock_no_mmap,
824 	.sendpage =	sock_no_sendpage,
825 	.set_peek_off =	unix_set_peek_off,
826 	.show_fdinfo =	unix_show_fdinfo,
827 };
828 
829 static const struct proto_ops unix_seqpacket_ops = {
830 	.family =	PF_UNIX,
831 	.owner =	THIS_MODULE,
832 	.release =	unix_release,
833 	.bind =		unix_bind,
834 	.connect =	unix_stream_connect,
835 	.socketpair =	unix_socketpair,
836 	.accept =	unix_accept,
837 	.getname =	unix_getname,
838 	.poll =		unix_dgram_poll,
839 	.ioctl =	unix_ioctl,
840 #ifdef CONFIG_COMPAT
841 	.compat_ioctl =	unix_compat_ioctl,
842 #endif
843 	.listen =	unix_listen,
844 	.shutdown =	unix_shutdown,
845 	.sendmsg =	unix_seqpacket_sendmsg,
846 	.recvmsg =	unix_seqpacket_recvmsg,
847 	.mmap =		sock_no_mmap,
848 	.sendpage =	sock_no_sendpage,
849 	.set_peek_off =	unix_set_peek_off,
850 	.show_fdinfo =	unix_show_fdinfo,
851 };
852 
853 static void unix_close(struct sock *sk, long timeout)
854 {
855 	/* Nothing to do here, unix socket does not need a ->close().
856 	 * This is merely for sockmap.
857 	 */
858 }
859 
860 static void unix_unhash(struct sock *sk)
861 {
862 	/* Nothing to do here, unix socket does not need a ->unhash().
863 	 * This is merely for sockmap.
864 	 */
865 }
866 
867 struct proto unix_dgram_proto = {
868 	.name			= "UNIX",
869 	.owner			= THIS_MODULE,
870 	.obj_size		= sizeof(struct unix_sock),
871 	.close			= unix_close,
872 #ifdef CONFIG_BPF_SYSCALL
873 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
874 #endif
875 };
876 
877 struct proto unix_stream_proto = {
878 	.name			= "UNIX-STREAM",
879 	.owner			= THIS_MODULE,
880 	.obj_size		= sizeof(struct unix_sock),
881 	.close			= unix_close,
882 	.unhash			= unix_unhash,
883 #ifdef CONFIG_BPF_SYSCALL
884 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
885 #endif
886 };
887 
888 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
889 {
890 	struct unix_sock *u;
891 	struct sock *sk;
892 	int err;
893 
894 	atomic_long_inc(&unix_nr_socks);
895 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
896 		err = -ENFILE;
897 		goto err;
898 	}
899 
900 	if (type == SOCK_STREAM)
901 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
902 	else /* dgram and seqpacket */
903 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
904 
905 	if (!sk) {
906 		err = -ENOMEM;
907 		goto err;
908 	}
909 
910 	sock_init_data(sock, sk);
911 
912 	sk->sk_hash		= unix_unbound_hash(sk);
913 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
914 	sk->sk_write_space	= unix_write_space;
915 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
916 	sk->sk_destruct		= unix_sock_destructor;
917 	u	  = unix_sk(sk);
918 	u->path.dentry = NULL;
919 	u->path.mnt = NULL;
920 	spin_lock_init(&u->lock);
921 	atomic_long_set(&u->inflight, 0);
922 	INIT_LIST_HEAD(&u->link);
923 	mutex_init(&u->iolock); /* single task reading lock */
924 	mutex_init(&u->bindlock); /* single task binding lock */
925 	init_waitqueue_head(&u->peer_wait);
926 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
927 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
928 	unix_insert_unbound_socket(net, sk);
929 
930 	sock_prot_inuse_add(net, sk->sk_prot, 1);
931 
932 	return sk;
933 
934 err:
935 	atomic_long_dec(&unix_nr_socks);
936 	return ERR_PTR(err);
937 }
938 
939 static int unix_create(struct net *net, struct socket *sock, int protocol,
940 		       int kern)
941 {
942 	struct sock *sk;
943 
944 	if (protocol && protocol != PF_UNIX)
945 		return -EPROTONOSUPPORT;
946 
947 	sock->state = SS_UNCONNECTED;
948 
949 	switch (sock->type) {
950 	case SOCK_STREAM:
951 		sock->ops = &unix_stream_ops;
952 		break;
953 		/*
954 		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW, though
955 		 *	nothing uses it.
956 		 */
957 	case SOCK_RAW:
958 		sock->type = SOCK_DGRAM;
959 		fallthrough;
960 	case SOCK_DGRAM:
961 		sock->ops = &unix_dgram_ops;
962 		break;
963 	case SOCK_SEQPACKET:
964 		sock->ops = &unix_seqpacket_ops;
965 		break;
966 	default:
967 		return -ESOCKTNOSUPPORT;
968 	}
969 
970 	sk = unix_create1(net, sock, kern, sock->type);
971 	if (IS_ERR(sk))
972 		return PTR_ERR(sk);
973 
974 	return 0;
975 }
976 
977 static int unix_release(struct socket *sock)
978 {
979 	struct sock *sk = sock->sk;
980 
981 	if (!sk)
982 		return 0;
983 
984 	sk->sk_prot->close(sk, 0);
985 	unix_release_sock(sk, 0);
986 	sock->sk = NULL;
987 
988 	return 0;
989 }
990 
991 static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
992 				  int addr_len, int type)
993 {
994 	struct inode *inode;
995 	struct path path;
996 	struct sock *sk;
997 	int err;
998 
999 	unix_mkname_bsd(sunaddr, addr_len);
1000 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1001 	if (err)
1002 		goto fail;
1003 
1004 	err = path_permission(&path, MAY_WRITE);
1005 	if (err)
1006 		goto path_put;
1007 
1008 	err = -ECONNREFUSED;
1009 	inode = d_backing_inode(path.dentry);
1010 	if (!S_ISSOCK(inode->i_mode))
1011 		goto path_put;
1012 
1013 	sk = unix_find_socket_byinode(net, inode);
1014 	if (!sk)
1015 		goto path_put;
1016 
1017 	err = -EPROTOTYPE;
1018 	if (sk->sk_type == type)
1019 		touch_atime(&path);
1020 	else
1021 		goto sock_put;
1022 
1023 	path_put(&path);
1024 
1025 	return sk;
1026 
1027 sock_put:
1028 	sock_put(sk);
1029 path_put:
1030 	path_put(&path);
1031 fail:
1032 	return ERR_PTR(err);
1033 }
1034 
1035 static struct sock *unix_find_abstract(struct net *net,
1036 				       struct sockaddr_un *sunaddr,
1037 				       int addr_len, int type)
1038 {
1039 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1040 	struct dentry *dentry;
1041 	struct sock *sk;
1042 
1043 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1044 	if (!sk)
1045 		return ERR_PTR(-ECONNREFUSED);
1046 
1047 	dentry = unix_sk(sk)->path.dentry;
1048 	if (dentry)
1049 		touch_atime(&unix_sk(sk)->path);
1050 
1051 	return sk;
1052 }
1053 
1054 static struct sock *unix_find_other(struct net *net,
1055 				    struct sockaddr_un *sunaddr,
1056 				    int addr_len, int type)
1057 {
1058 	struct sock *sk;
1059 
1060 	if (sunaddr->sun_path[0])
1061 		sk = unix_find_bsd(net, sunaddr, addr_len, type);
1062 	else
1063 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1064 
1065 	return sk;
1066 }
1067 
1068 static int unix_autobind(struct sock *sk)
1069 {
1070 	unsigned int new_hash, old_hash = sk->sk_hash;
1071 	struct unix_sock *u = unix_sk(sk);
1072 	struct net *net = sock_net(sk);
1073 	struct unix_address *addr;
1074 	u32 lastnum, ordernum;
1075 	int err;
1076 
1077 	err = mutex_lock_interruptible(&u->bindlock);
1078 	if (err)
1079 		return err;
1080 
1081 	if (u->addr)
1082 		goto out;
1083 
1084 	err = -ENOMEM;
1085 	addr = kzalloc(sizeof(*addr) +
1086 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1087 	if (!addr)
1088 		goto out;
1089 
1090 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1091 	addr->name->sun_family = AF_UNIX;
1092 	refcount_set(&addr->refcnt, 1);
1093 
1094 	ordernum = prandom_u32();
1095 	lastnum = ordernum & 0xFFFFF;
1096 retry:
1097 	ordernum = (ordernum + 1) & 0xFFFFF;
1098 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1099 
1100 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1101 	unix_table_double_lock(net, old_hash, new_hash);
1102 
1103 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1104 		unix_table_double_unlock(net, old_hash, new_hash);
1105 
1106 		/* __unix_find_socket_byname() may take a long time if many names
1107 		 * are already in use.
1108 		 */
1109 		cond_resched();
1110 
1111 		if (ordernum == lastnum) {
1112 			/* Give up if all names seem to be in use. */
1113 			err = -ENOSPC;
1114 			unix_release_addr(addr);
1115 			goto out;
1116 		}
1117 
1118 		goto retry;
1119 	}
1120 
1121 	__unix_set_addr_hash(net, sk, addr, new_hash);
1122 	unix_table_double_unlock(net, old_hash, new_hash);
1123 	err = 0;
1124 
1125 out:	mutex_unlock(&u->bindlock);
1126 	return err;
1127 }
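
/* Illustrative userspace sketch (not part of this file's build): autobind
 * is triggered by a bind() that passes only the address family (see
 * unix_bind() below); the kernel then picks a free 5-hex-digit abstract
 * name, which getsockname() reveals.  Error handling is omitted.
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	socklen_t len = sizeof(sun);
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *
 *	bind(fd, (struct sockaddr *)&sun, offsetof(struct sockaddr_un, sun_path));
 *	getsockname(fd, (struct sockaddr *)&sun, &len);
 *	// sun.sun_path now holds "\0xxxxx"; len is sun_path offset + 6
 */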
1128 
1129 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1130 			 int addr_len)
1131 {
1132 	umode_t mode = S_IFSOCK |
1133 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1134 	unsigned int new_hash, old_hash = sk->sk_hash;
1135 	struct unix_sock *u = unix_sk(sk);
1136 	struct net *net = sock_net(sk);
1137 	struct user_namespace *ns; // barf...
1138 	struct unix_address *addr;
1139 	struct dentry *dentry;
1140 	struct path parent;
1141 	int err;
1142 
1143 	unix_mkname_bsd(sunaddr, addr_len);
1144 	addr_len = strlen(sunaddr->sun_path) +
1145 		offsetof(struct sockaddr_un, sun_path) + 1;
1146 
1147 	addr = unix_create_addr(sunaddr, addr_len);
1148 	if (!addr)
1149 		return -ENOMEM;
1150 
1151 	/*
1152 	 * Get the parent directory, calculate the hash for the last
1153 	 * component.
1154 	 */
1155 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1156 	if (IS_ERR(dentry)) {
1157 		err = PTR_ERR(dentry);
1158 		goto out;
1159 	}
1160 
1161 	/*
1162 	 * All right, let's create it.
1163 	 */
1164 	ns = mnt_user_ns(parent.mnt);
1165 	err = security_path_mknod(&parent, dentry, mode, 0);
1166 	if (!err)
1167 		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1168 	if (err)
1169 		goto out_path;
1170 	err = mutex_lock_interruptible(&u->bindlock);
1171 	if (err)
1172 		goto out_unlink;
1173 	if (u->addr)
1174 		goto out_unlock;
1175 
1176 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1177 	unix_table_double_lock(net, old_hash, new_hash);
1178 	u->path.mnt = mntget(parent.mnt);
1179 	u->path.dentry = dget(dentry);
1180 	__unix_set_addr_hash(net, sk, addr, new_hash);
1181 	unix_table_double_unlock(net, old_hash, new_hash);
1182 	mutex_unlock(&u->bindlock);
1183 	done_path_create(&parent, dentry);
1184 	return 0;
1185 
1186 out_unlock:
1187 	mutex_unlock(&u->bindlock);
1188 	err = -EINVAL;
1189 out_unlink:
1190 	/* failed after successful mknod?  unlink what we'd created... */
1191 	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1192 out_path:
1193 	done_path_create(&parent, dentry);
1194 out:
1195 	unix_release_addr(addr);
1196 	return err == -EEXIST ? -EADDRINUSE : err;
1197 }
1198 
1199 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1200 			      int addr_len)
1201 {
1202 	unsigned int new_hash, old_hash = sk->sk_hash;
1203 	struct unix_sock *u = unix_sk(sk);
1204 	struct net *net = sock_net(sk);
1205 	struct unix_address *addr;
1206 	int err;
1207 
1208 	addr = unix_create_addr(sunaddr, addr_len);
1209 	if (!addr)
1210 		return -ENOMEM;
1211 
1212 	err = mutex_lock_interruptible(&u->bindlock);
1213 	if (err)
1214 		goto out;
1215 
1216 	if (u->addr) {
1217 		err = -EINVAL;
1218 		goto out_mutex;
1219 	}
1220 
1221 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1222 	unix_table_double_lock(net, old_hash, new_hash);
1223 
1224 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1225 		goto out_spin;
1226 
1227 	__unix_set_addr_hash(net, sk, addr, new_hash);
1228 	unix_table_double_unlock(net, old_hash, new_hash);
1229 	mutex_unlock(&u->bindlock);
1230 	return 0;
1231 
1232 out_spin:
1233 	unix_table_double_unlock(net, old_hash, new_hash);
1234 	err = -EADDRINUSE;
1235 out_mutex:
1236 	mutex_unlock(&u->bindlock);
1237 out:
1238 	unix_release_addr(addr);
1239 	return err;
1240 }
1241 
1242 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1243 {
1244 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1245 	struct sock *sk = sock->sk;
1246 	int err;
1247 
1248 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1249 	    sunaddr->sun_family == AF_UNIX)
1250 		return unix_autobind(sk);
1251 
1252 	err = unix_validate_addr(sunaddr, addr_len);
1253 	if (err)
1254 		return err;
1255 
1256 	if (sunaddr->sun_path[0])
1257 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1258 	else
1259 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1260 
1261 	return err;
1262 }
1263 
1264 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1265 {
1266 	if (unlikely(sk1 == sk2) || !sk2) {
1267 		unix_state_lock(sk1);
1268 		return;
1269 	}
1270 	if (sk1 < sk2) {
1271 		unix_state_lock(sk1);
1272 		unix_state_lock_nested(sk2);
1273 	} else {
1274 		unix_state_lock(sk2);
1275 		unix_state_lock_nested(sk1);
1276 	}
1277 }
1278 
1279 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1280 {
1281 	if (unlikely(sk1 == sk2) || !sk2) {
1282 		unix_state_unlock(sk1);
1283 		return;
1284 	}
1285 	unix_state_unlock(sk1);
1286 	unix_state_unlock(sk2);
1287 }
1288 
1289 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1290 			      int alen, int flags)
1291 {
1292 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1293 	struct sock *sk = sock->sk;
1294 	struct sock *other;
1295 	int err;
1296 
1297 	err = -EINVAL;
1298 	if (alen < offsetofend(struct sockaddr, sa_family))
1299 		goto out;
1300 
1301 	if (addr->sa_family != AF_UNSPEC) {
1302 		err = unix_validate_addr(sunaddr, alen);
1303 		if (err)
1304 			goto out;
1305 
1306 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1307 		    !unix_sk(sk)->addr) {
1308 			err = unix_autobind(sk);
1309 			if (err)
1310 				goto out;
1311 		}
1312 
1313 restart:
1314 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1315 		if (IS_ERR(other)) {
1316 			err = PTR_ERR(other);
1317 			goto out;
1318 		}
1319 
1320 		unix_state_double_lock(sk, other);
1321 
1322 		/* Apparently VFS overslept socket death. Retry. */
1323 		if (sock_flag(other, SOCK_DEAD)) {
1324 			unix_state_double_unlock(sk, other);
1325 			sock_put(other);
1326 			goto restart;
1327 		}
1328 
1329 		err = -EPERM;
1330 		if (!unix_may_send(sk, other))
1331 			goto out_unlock;
1332 
1333 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1334 		if (err)
1335 			goto out_unlock;
1336 
1337 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1338 	} else {
1339 		/*
1340 		 *	1003.1g breaking connected state with AF_UNSPEC
1341 		 */
1342 		other = NULL;
1343 		unix_state_double_lock(sk, other);
1344 	}
1345 
1346 	/*
1347 	 * If it was connected, reconnect.
1348 	 */
1349 	if (unix_peer(sk)) {
1350 		struct sock *old_peer = unix_peer(sk);
1351 
1352 		unix_peer(sk) = other;
1353 		if (!other)
1354 			sk->sk_state = TCP_CLOSE;
1355 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1356 
1357 		unix_state_double_unlock(sk, other);
1358 
1359 		if (other != old_peer)
1360 			unix_dgram_disconnected(sk, old_peer);
1361 		sock_put(old_peer);
1362 	} else {
1363 		unix_peer(sk) = other;
1364 		unix_state_double_unlock(sk, other);
1365 	}
1366 
1367 	return 0;
1368 
1369 out_unlock:
1370 	unix_state_double_unlock(sk, other);
1371 	sock_put(other);
1372 out:
1373 	return err;
1374 }
1375 
1376 static long unix_wait_for_peer(struct sock *other, long timeo)
1377 	__releases(&unix_sk(other)->lock)
1378 {
1379 	struct unix_sock *u = unix_sk(other);
1380 	int sched;
1381 	DEFINE_WAIT(wait);
1382 
1383 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1384 
1385 	sched = !sock_flag(other, SOCK_DEAD) &&
1386 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1387 		unix_recvq_full(other);
1388 
1389 	unix_state_unlock(other);
1390 
1391 	if (sched)
1392 		timeo = schedule_timeout(timeo);
1393 
1394 	finish_wait(&u->peer_wait, &wait);
1395 	return timeo;
1396 }
1397 
1398 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1399 			       int addr_len, int flags)
1400 {
1401 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1402 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1403 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1404 	struct net *net = sock_net(sk);
1405 	struct sk_buff *skb = NULL;
1406 	long timeo;
1407 	int err;
1408 	int st;
1409 
1410 	err = unix_validate_addr(sunaddr, addr_len);
1411 	if (err)
1412 		goto out;
1413 
1414 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1415 		err = unix_autobind(sk);
1416 		if (err)
1417 			goto out;
1418 	}
1419 
1420 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1421 
1422 	/* First of all allocate resources.
1423 	   If we do it after the state is locked,
1424 	   we will have to recheck everything again in any case.
1425 	 */
1426 
1427 	/* create new sock for complete connection */
1428 	newsk = unix_create1(net, NULL, 0, sock->type);
1429 	if (IS_ERR(newsk)) {
1430 		err = PTR_ERR(newsk);
1431 		newsk = NULL;
1432 		goto out;
1433 	}
1434 
1435 	err = -ENOMEM;
1436 
1437 	/* Allocate skb for sending to listening sock */
1438 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1439 	if (skb == NULL)
1440 		goto out;
1441 
1442 restart:
1443 	/*  Find listening sock. */
1444 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1445 	if (IS_ERR(other)) {
1446 		err = PTR_ERR(other);
1447 		other = NULL;
1448 		goto out;
1449 	}
1450 
1451 	/* Latch state of peer */
1452 	unix_state_lock(other);
1453 
1454 	/* Apparently VFS overslept socket death. Retry. */
1455 	if (sock_flag(other, SOCK_DEAD)) {
1456 		unix_state_unlock(other);
1457 		sock_put(other);
1458 		goto restart;
1459 	}
1460 
1461 	err = -ECONNREFUSED;
1462 	if (other->sk_state != TCP_LISTEN)
1463 		goto out_unlock;
1464 	if (other->sk_shutdown & RCV_SHUTDOWN)
1465 		goto out_unlock;
1466 
1467 	if (unix_recvq_full(other)) {
1468 		err = -EAGAIN;
1469 		if (!timeo)
1470 			goto out_unlock;
1471 
1472 		timeo = unix_wait_for_peer(other, timeo);
1473 
1474 		err = sock_intr_errno(timeo);
1475 		if (signal_pending(current))
1476 			goto out;
1477 		sock_put(other);
1478 		goto restart;
1479 	}
1480 
1481 	/* Latch our state.
1482 
1483 	   It is a tricky place. We need to grab our state lock and cannot
1484 	   drop the lock on the peer. It is dangerous because a deadlock is
1485 	   possible. The connect-to-self case and simultaneous attempts to
1486 	   connect are eliminated by checking the socket state. other is
1487 	   TCP_LISTEN; if sk is TCP_LISTEN, we check this before attempting
1488 	   to grab the lock.
1489 
1490 	   Well, and we have to recheck the state after the socket is locked.
1491 	 */
1492 	st = sk->sk_state;
1493 
1494 	switch (st) {
1495 	case TCP_CLOSE:
1496 		/* This is ok... continue with connect */
1497 		break;
1498 	case TCP_ESTABLISHED:
1499 		/* Socket is already connected */
1500 		err = -EISCONN;
1501 		goto out_unlock;
1502 	default:
1503 		err = -EINVAL;
1504 		goto out_unlock;
1505 	}
1506 
1507 	unix_state_lock_nested(sk);
1508 
1509 	if (sk->sk_state != st) {
1510 		unix_state_unlock(sk);
1511 		unix_state_unlock(other);
1512 		sock_put(other);
1513 		goto restart;
1514 	}
1515 
1516 	err = security_unix_stream_connect(sk, other, newsk);
1517 	if (err) {
1518 		unix_state_unlock(sk);
1519 		goto out_unlock;
1520 	}
1521 
1522 	/* The way is open! Quickly set all the necessary fields... */
1523 
1524 	sock_hold(sk);
1525 	unix_peer(newsk)	= sk;
1526 	newsk->sk_state		= TCP_ESTABLISHED;
1527 	newsk->sk_type		= sk->sk_type;
1528 	init_peercred(newsk);
1529 	newu = unix_sk(newsk);
1530 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1531 	otheru = unix_sk(other);
1532 
1533 	/* copy address information from listening to new sock
1534 	 *
1535 	 * The contents of *(otheru->addr) and otheru->path
1536 	 * are seen fully set up here, since we have found
1537 	 * otheru in hash under its lock.  Insertion into the
1538 	 * hash chain we'd found it in had been done in an
1539 	 * earlier critical area protected by the chain's lock,
1540 	 * the same one where we'd set *(otheru->addr) contents,
1541 	 * as well as otheru->path and otheru->addr itself.
1542 	 *
1543 	 * Using smp_store_release() here to set newu->addr
1544 	 * is enough to make those stores, as well as stores
1545 	 * to newu->path visible to anyone who gets newu->addr
1546 	 * by smp_load_acquire().  IOW, the same warranties
1547 	 * as for unix_sock instances bound in unix_bind() or
1548 	 * in unix_autobind().
1549 	 */
1550 	if (otheru->path.dentry) {
1551 		path_get(&otheru->path);
1552 		newu->path = otheru->path;
1553 	}
1554 	refcount_inc(&otheru->addr->refcnt);
1555 	smp_store_release(&newu->addr, otheru->addr);
1556 
1557 	/* Set credentials */
1558 	copy_peercred(sk, other);
1559 
1560 	sock->state	= SS_CONNECTED;
1561 	sk->sk_state	= TCP_ESTABLISHED;
1562 	sock_hold(newsk);
1563 
1564 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1565 	unix_peer(sk)	= newsk;
1566 
1567 	unix_state_unlock(sk);
1568 
1569 	/* take it and send info to the listening sock */
1570 	spin_lock(&other->sk_receive_queue.lock);
1571 	__skb_queue_tail(&other->sk_receive_queue, skb);
1572 	spin_unlock(&other->sk_receive_queue.lock);
1573 	unix_state_unlock(other);
1574 	other->sk_data_ready(other);
1575 	sock_put(other);
1576 	return 0;
1577 
1578 out_unlock:
1579 	if (other)
1580 		unix_state_unlock(other);
1581 
1582 out:
1583 	kfree_skb(skb);
1584 	if (newsk)
1585 		unix_release_sock(newsk, 0);
1586 	if (other)
1587 		sock_put(other);
1588 	return err;
1589 }
1590 
1591 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1592 {
1593 	struct sock *ska = socka->sk, *skb = sockb->sk;
1594 
1595 	/* Join our sockets back to back */
1596 	sock_hold(ska);
1597 	sock_hold(skb);
1598 	unix_peer(ska) = skb;
1599 	unix_peer(skb) = ska;
1600 	init_peercred(ska);
1601 	init_peercred(skb);
1602 
1603 	ska->sk_state = TCP_ESTABLISHED;
1604 	skb->sk_state = TCP_ESTABLISHED;
1605 	socka->state  = SS_CONNECTED;
1606 	sockb->state  = SS_CONNECTED;
1607 	return 0;
1608 }
1609 
1610 static void unix_sock_inherit_flags(const struct socket *old,
1611 				    struct socket *new)
1612 {
1613 	if (test_bit(SOCK_PASSCRED, &old->flags))
1614 		set_bit(SOCK_PASSCRED, &new->flags);
1615 	if (test_bit(SOCK_PASSSEC, &old->flags))
1616 		set_bit(SOCK_PASSSEC, &new->flags);
1617 }
1618 
1619 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1620 		       bool kern)
1621 {
1622 	struct sock *sk = sock->sk;
1623 	struct sock *tsk;
1624 	struct sk_buff *skb;
1625 	int err;
1626 
1627 	err = -EOPNOTSUPP;
1628 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1629 		goto out;
1630 
1631 	err = -EINVAL;
1632 	if (sk->sk_state != TCP_LISTEN)
1633 		goto out;
1634 
1635 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1636 	 * so no locks are necessary.
1637 	 */
1638 
1639 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1640 				&err);
1641 	if (!skb) {
1642 		/* This means receive shutdown. */
1643 		if (err == 0)
1644 			err = -EINVAL;
1645 		goto out;
1646 	}
1647 
1648 	tsk = skb->sk;
1649 	skb_free_datagram(sk, skb);
1650 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1651 
1652 	/* attach accepted sock to socket */
1653 	unix_state_lock(tsk);
1654 	newsock->state = SS_CONNECTED;
1655 	unix_sock_inherit_flags(sock, newsock);
1656 	sock_graft(tsk, newsock);
1657 	unix_state_unlock(tsk);
1658 	return 0;
1659 
1660 out:
1661 	return err;
1662 }
1663 
1664 
1665 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1666 {
1667 	struct sock *sk = sock->sk;
1668 	struct unix_address *addr;
1669 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1670 	int err = 0;
1671 
1672 	if (peer) {
1673 		sk = unix_peer_get(sk);
1674 
1675 		err = -ENOTCONN;
1676 		if (!sk)
1677 			goto out;
1678 		err = 0;
1679 	} else {
1680 		sock_hold(sk);
1681 	}
1682 
1683 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1684 	if (!addr) {
1685 		sunaddr->sun_family = AF_UNIX;
1686 		sunaddr->sun_path[0] = 0;
1687 		err = offsetof(struct sockaddr_un, sun_path);
1688 	} else {
1689 		err = addr->len;
1690 		memcpy(sunaddr, addr->name, addr->len);
1691 	}
1692 	sock_put(sk);
1693 out:
1694 	return err;
1695 }
1696 
1697 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1698 {
1699 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1700 
1701 	/*
1702 	 * Garbage collection of unix sockets starts by selecting a set of
1703 	 * candidate sockets which have a reference only from being in flight
1704 	 * (total_refs == inflight_refs).  This condition is checked once during
1705 	 * the candidate collection phase, and candidates are marked as such, so
1706 	 * that non-candidates can later be ignored.  While inflight_refs is
1707 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1708 	 * is an instantaneous decision.
1709 	 *
1710 	 * Once a candidate, however, the socket must not be reinstalled into a
1711 	 * file descriptor while the garbage collection is in progress.
1712 	 *
1713 	 * If the above conditions are met, then the directed graph of
1714 	 * candidates (*) does not change while unix_gc_lock is held.
1715 	 *
1716 	 * Any operation that changes the file count through file descriptors
1717 	 * (dup, close, sendmsg) does not change the graph since candidates are
1718 	 * not installed in fds.
1719 	 *
1720 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1721 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1722 	 * serialized with garbage collection.
1723 	 *
1724 	 * MSG_PEEK is special in that it does not change the inflight count,
1725 	 * yet does install the socket into an fd.  The following lock/unlock
1726 	 * pair is to ensure serialization with garbage collection.  It must be
1727 	 * done between incrementing the file count and installing the file into
1728 	 * an fd.
1729 	 *
1730 	 * If garbage collection starts after the barrier provided by the
1731 	 * lock/unlock, then it will see the elevated refcount and not mark this
1732 	 * as a candidate.  If a garbage collection is already in progress
1733 	 * before the file count was incremented, then the lock/unlock pair will
1734 	 * ensure that garbage collection is finished before progressing to
1735 	 * installing the fd.
1736 	 *
1737 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1738 	 * which is on the queue of listening socket A.
1739 	 */
1740 	spin_lock(&unix_gc_lock);
1741 	spin_unlock(&unix_gc_lock);
1742 }
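
/* Illustrative userspace sketch (not part of this file's build): receiving
 * SCM_RIGHTS with MSG_PEEK installs fresh duplicates of the passed
 * descriptors without dequeuing the message, which is exactly the case the
 * lock/unlock pair above serializes against unix_gc().  "sock" is assumed
 * to be a connected AF_UNIX socket; error handling is omitted.
 *
 *	char data[1];
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cmsg;
 *	int peeked_fd = -1;
 *
 *	recvmsg(sock, &msg, MSG_PEEK);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_RIGHTS)
 *			memcpy(&peeked_fd, CMSG_DATA(cmsg), sizeof(int));
 *	// peeked_fd is a new descriptor; the original message (and its
 *	// in-flight reference) stays queued until a real recvmsg().
 */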
1743 
1744 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1745 {
1746 	int err = 0;
1747 
1748 	UNIXCB(skb).pid  = get_pid(scm->pid);
1749 	UNIXCB(skb).uid = scm->creds.uid;
1750 	UNIXCB(skb).gid = scm->creds.gid;
1751 	UNIXCB(skb).fp = NULL;
1752 	unix_get_secdata(scm, skb);
1753 	if (scm->fp && send_fds)
1754 		err = unix_attach_fds(scm, skb);
1755 
1756 	skb->destructor = unix_destruct_scm;
1757 	return err;
1758 }
1759 
1760 static bool unix_passcred_enabled(const struct socket *sock,
1761 				  const struct sock *other)
1762 {
1763 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1764 	       !other->sk_socket ||
1765 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1766 }
1767 
1768 /*
1769  * Some apps rely on write() giving SCM_CREDENTIALS.
1770  * We include credentials if the source or destination socket
1771  * asserted SOCK_PASSCRED.
1772  */
1773 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1774 			    const struct sock *other)
1775 {
1776 	if (UNIXCB(skb).pid)
1777 		return;
1778 	if (unix_passcred_enabled(sock, other)) {
1779 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1780 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1781 	}
1782 }
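
/* Illustrative userspace sketch (not part of this file's build): a receiver
 * that sets SO_PASSCRED gets SCM_CREDENTIALS ancillary data even when the
 * sender used a plain write(), which is the case handled above.  The cmsg
 * is parsed like SCM_RIGHTS in the sketch after unix_peek_fds() above.
 *
 *	int on = 1;
 *
 *	setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	// recvmsg() now yields a cmsg with cmsg_level == SOL_SOCKET,
 *	// cmsg_type == SCM_CREDENTIALS and a struct ucred payload
 *	// giving the sender's pid/uid/gid.
 */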
1783 
1784 static int maybe_init_creds(struct scm_cookie *scm,
1785 			    struct socket *socket,
1786 			    const struct sock *other)
1787 {
1788 	int err;
1789 	struct msghdr msg = { .msg_controllen = 0 };
1790 
1791 	err = scm_send(socket, &msg, scm, false);
1792 	if (err)
1793 		return err;
1794 
1795 	if (unix_passcred_enabled(socket, other)) {
1796 		scm->pid = get_pid(task_tgid(current));
1797 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1798 	}
1799 	return err;
1800 }
1801 
1802 static bool unix_skb_scm_eq(struct sk_buff *skb,
1803 			    struct scm_cookie *scm)
1804 {
1805 	return UNIXCB(skb).pid == scm->pid &&
1806 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1807 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1808 	       unix_secdata_eq(scm, skb);
1809 }
1810 
1811 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1812 {
1813 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1814 	struct unix_sock *u = unix_sk(sk);
1815 
1816 	if (unlikely(fp && fp->count))
1817 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1818 }
1819 
1820 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1821 {
1822 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1823 	struct unix_sock *u = unix_sk(sk);
1824 
1825 	if (unlikely(fp && fp->count))
1826 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1827 }
1828 
1829 /*
1830  *	Send AF_UNIX data.
1831  */
1832 
1833 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1834 			      size_t len)
1835 {
1836 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1837 	struct sock *sk = sock->sk, *other = NULL;
1838 	struct unix_sock *u = unix_sk(sk);
1839 	struct scm_cookie scm;
1840 	struct sk_buff *skb;
1841 	int data_len = 0;
1842 	int sk_locked;
1843 	long timeo;
1844 	int err;
1845 
1846 	wait_for_unix_gc();
1847 	err = scm_send(sock, msg, &scm, false);
1848 	if (err < 0)
1849 		return err;
1850 
1851 	err = -EOPNOTSUPP;
1852 	if (msg->msg_flags&MSG_OOB)
1853 		goto out;
1854 
1855 	if (msg->msg_namelen) {
1856 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1857 		if (err)
1858 			goto out;
1859 	} else {
1860 		sunaddr = NULL;
1861 		err = -ENOTCONN;
1862 		other = unix_peer_get(sk);
1863 		if (!other)
1864 			goto out;
1865 	}
1866 
1867 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1868 		err = unix_autobind(sk);
1869 		if (err)
1870 			goto out;
1871 	}
1872 
1873 	err = -EMSGSIZE;
1874 	if (len > sk->sk_sndbuf - 32)
1875 		goto out;
1876 
1877 	if (len > SKB_MAX_ALLOC) {
1878 		data_len = min_t(size_t,
1879 				 len - SKB_MAX_ALLOC,
1880 				 MAX_SKB_FRAGS * PAGE_SIZE);
1881 		data_len = PAGE_ALIGN(data_len);
1882 
1883 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1884 	}
1885 
1886 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1887 				   msg->msg_flags & MSG_DONTWAIT, &err,
1888 				   PAGE_ALLOC_COSTLY_ORDER);
1889 	if (skb == NULL)
1890 		goto out;
1891 
1892 	err = unix_scm_to_skb(&scm, skb, true);
1893 	if (err < 0)
1894 		goto out_free;
1895 
1896 	skb_put(skb, len - data_len);
1897 	skb->data_len = data_len;
1898 	skb->len = len;
1899 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1900 	if (err)
1901 		goto out_free;
1902 
1903 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1904 
1905 restart:
1906 	if (!other) {
1907 		err = -ECONNRESET;
1908 		if (sunaddr == NULL)
1909 			goto out_free;
1910 
1911 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1912 					sk->sk_type);
1913 		if (IS_ERR(other)) {
1914 			err = PTR_ERR(other);
1915 			other = NULL;
1916 			goto out_free;
1917 		}
1918 	}
1919 
1920 	if (sk_filter(other, skb) < 0) {
1921 		/* Toss the packet but do not return any error to the sender */
1922 		err = len;
1923 		goto out_free;
1924 	}
1925 
1926 	sk_locked = 0;
1927 	unix_state_lock(other);
1928 restart_locked:
1929 	err = -EPERM;
1930 	if (!unix_may_send(sk, other))
1931 		goto out_unlock;
1932 
1933 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1934 		/*
1935 		 *	Check with 1003.1g - what should
1936 		 *	a datagram error do here?
1937 		 */
1938 		unix_state_unlock(other);
1939 		sock_put(other);
1940 
1941 		if (!sk_locked)
1942 			unix_state_lock(sk);
1943 
1944 		err = 0;
1945 		if (unix_peer(sk) == other) {
1946 			unix_peer(sk) = NULL;
1947 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1948 
1949 			unix_state_unlock(sk);
1950 
1951 			sk->sk_state = TCP_CLOSE;
1952 			unix_dgram_disconnected(sk, other);
1953 			sock_put(other);
1954 			err = -ECONNREFUSED;
1955 		} else {
1956 			unix_state_unlock(sk);
1957 		}
1958 
1959 		other = NULL;
1960 		if (err)
1961 			goto out_free;
1962 		goto restart;
1963 	}
1964 
1965 	err = -EPIPE;
1966 	if (other->sk_shutdown & RCV_SHUTDOWN)
1967 		goto out_unlock;
1968 
1969 	if (sk->sk_type != SOCK_SEQPACKET) {
1970 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1971 		if (err)
1972 			goto out_unlock;
1973 	}
1974 
1975 	/* other == sk && unix_peer(other) != sk if
1976 	 * - unix_peer(sk) == NULL, destination address bound to sk
1977 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
1978 	 */
1979 	if (other != sk &&
1980 	    unlikely(unix_peer(other) != sk &&
1981 	    unix_recvq_full_lockless(other))) {
1982 		if (timeo) {
1983 			timeo = unix_wait_for_peer(other, timeo);
1984 
1985 			err = sock_intr_errno(timeo);
1986 			if (signal_pending(current))
1987 				goto out_free;
1988 
1989 			goto restart;
1990 		}
1991 
1992 		if (!sk_locked) {
1993 			unix_state_unlock(other);
1994 			unix_state_double_lock(sk, other);
1995 		}
1996 
1997 		if (unix_peer(sk) != other ||
1998 		    unix_dgram_peer_wake_me(sk, other)) {
1999 			err = -EAGAIN;
2000 			sk_locked = 1;
2001 			goto out_unlock;
2002 		}
2003 
2004 		if (!sk_locked) {
2005 			sk_locked = 1;
2006 			goto restart_locked;
2007 		}
2008 	}
2009 
2010 	if (unlikely(sk_locked))
2011 		unix_state_unlock(sk);
2012 
2013 	if (sock_flag(other, SOCK_RCVTSTAMP))
2014 		__net_timestamp(skb);
2015 	maybe_add_creds(skb, sock, other);
2016 	scm_stat_add(other, skb);
2017 	skb_queue_tail(&other->sk_receive_queue, skb);
2018 	unix_state_unlock(other);
2019 	other->sk_data_ready(other);
2020 	sock_put(other);
2021 	scm_destroy(&scm);
2022 	return len;
2023 
2024 out_unlock:
2025 	if (sk_locked)
2026 		unix_state_unlock(sk);
2027 	unix_state_unlock(other);
2028 out_free:
2029 	kfree_skb(skb);
2030 out:
2031 	if (other)
2032 		sock_put(other);
2033 	scm_destroy(&scm);
2034 	return err;
2035 }
2036 
2037 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2038  * bytes with a minimum of a full page.
2039  */
2040 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2041 
2042 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
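/*
 * Queue a single out-of-band byte to the peer: the byte travels in its own
 * one-byte skb, which is both appended to the receive queue and remembered
 * as ousk->oob_skb (with an extra reference) so that it can later be found
 * by MSG_OOB reads and SIOCATMARK; SIGURG is raised on the receiver.
 */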
2043 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
2044 {
2045 	struct unix_sock *ousk = unix_sk(other);
2046 	struct sk_buff *skb;
2047 	int err = 0;
2048 
2049 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2050 
2051 	if (!skb)
2052 		return err;
2053 
2054 	skb_put(skb, 1);
2055 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2056 
2057 	if (err) {
2058 		kfree_skb(skb);
2059 		return err;
2060 	}
2061 
2062 	unix_state_lock(other);
2063 
2064 	if (sock_flag(other, SOCK_DEAD) ||
2065 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2066 		unix_state_unlock(other);
2067 		kfree_skb(skb);
2068 		return -EPIPE;
2069 	}
2070 
2071 	maybe_add_creds(skb, sock, other);
2072 	skb_get(skb);
2073 
2074 	if (ousk->oob_skb)
2075 		consume_skb(ousk->oob_skb);
2076 
2077 	WRITE_ONCE(ousk->oob_skb, skb);
2078 
2079 	scm_stat_add(other, skb);
2080 	skb_queue_tail(&other->sk_receive_queue, skb);
2081 	sk_send_sigurg(other);
2082 	unix_state_unlock(other);
2083 	other->sk_data_ready(other);
2084 
2085 	return err;
2086 }
2087 #endif
2088 
2089 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2090 			       size_t len)
2091 {
2092 	struct sock *sk = sock->sk;
2093 	struct sock *other = NULL;
2094 	int err, size;
2095 	struct sk_buff *skb;
2096 	int sent = 0;
2097 	struct scm_cookie scm;
2098 	bool fds_sent = false;
2099 	int data_len;
2100 
2101 	wait_for_unix_gc();
2102 	err = scm_send(sock, msg, &scm, false);
2103 	if (err < 0)
2104 		return err;
2105 
2106 	err = -EOPNOTSUPP;
2107 	if (msg->msg_flags & MSG_OOB) {
2108 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2109 		if (len)
2110 			len--;
2111 		else
2112 #endif
2113 			goto out_err;
2114 	}
2115 
2116 	if (msg->msg_namelen) {
2117 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2118 		goto out_err;
2119 	} else {
2120 		err = -ENOTCONN;
2121 		other = unix_peer(sk);
2122 		if (!other)
2123 			goto out_err;
2124 	}
2125 
2126 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2127 		goto pipe_err;
2128 
2129 	while (sent < len) {
2130 		size = len - sent;
2131 
2132 		/* Keep two messages in the pipe so it schedules better */
2133 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2134 
2135 		/* allow fallback to order-0 allocations */
2136 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2137 
2138 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2139 
2140 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2141 
2142 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2143 					   msg->msg_flags & MSG_DONTWAIT, &err,
2144 					   get_order(UNIX_SKB_FRAGS_SZ));
2145 		if (!skb)
2146 			goto out_err;
2147 
2148 		/* Only send the fds in the first buffer */
2149 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2150 		if (err < 0) {
2151 			kfree_skb(skb);
2152 			goto out_err;
2153 		}
2154 		fds_sent = true;
2155 
2156 		skb_put(skb, size - data_len);
2157 		skb->data_len = data_len;
2158 		skb->len = size;
2159 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2160 		if (err) {
2161 			kfree_skb(skb);
2162 			goto out_err;
2163 		}
2164 
2165 		unix_state_lock(other);
2166 
2167 		if (sock_flag(other, SOCK_DEAD) ||
2168 		    (other->sk_shutdown & RCV_SHUTDOWN))
2169 			goto pipe_err_free;
2170 
2171 		maybe_add_creds(skb, sock, other);
2172 		scm_stat_add(other, skb);
2173 		skb_queue_tail(&other->sk_receive_queue, skb);
2174 		unix_state_unlock(other);
2175 		other->sk_data_ready(other);
2176 		sent += size;
2177 	}
2178 
2179 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2180 	if (msg->msg_flags & MSG_OOB) {
2181 		err = queue_oob(sock, msg, other);
2182 		if (err)
2183 			goto out_err;
2184 		sent++;
2185 	}
2186 #endif
2187 
2188 	scm_destroy(&scm);
2189 
2190 	return sent;
2191 
2192 pipe_err_free:
2193 	unix_state_unlock(other);
2194 	kfree_skb(skb);
2195 pipe_err:
2196 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2197 		send_sig(SIGPIPE, current, 0);
2198 	err = -EPIPE;
2199 out_err:
2200 	scm_destroy(&scm);
2201 	return sent ? : err;
2202 }
2203 
2204 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2205 				    int offset, size_t size, int flags)
2206 {
2207 	int err;
2208 	bool send_sigpipe = false;
2209 	bool init_scm = true;
2210 	struct scm_cookie scm;
2211 	struct sock *other, *sk = socket->sk;
2212 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2213 
2214 	if (flags & MSG_OOB)
2215 		return -EOPNOTSUPP;
2216 
2217 	other = unix_peer(sk);
2218 	if (!other || sk->sk_state != TCP_ESTABLISHED)
2219 		return -ENOTCONN;
2220 
2221 	if (false) {
2222 alloc_skb:
2223 		unix_state_unlock(other);
2224 		mutex_unlock(&unix_sk(other)->iolock);
2225 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2226 					      &err, 0);
2227 		if (!newskb)
2228 			goto err;
2229 	}
2230 
2231 	/* We must acquire the iolock because we modify skbs already
2232 	 * present in the sk_receive_queue and adjust skb->len.
2233 	 */
2234 	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2235 	if (err) {
2236 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2237 		goto err;
2238 	}
2239 
2240 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
2241 		err = -EPIPE;
2242 		send_sigpipe = true;
2243 		goto err_unlock;
2244 	}
2245 
2246 	unix_state_lock(other);
2247 
2248 	if (sock_flag(other, SOCK_DEAD) ||
2249 	    other->sk_shutdown & RCV_SHUTDOWN) {
2250 		err = -EPIPE;
2251 		send_sigpipe = true;
2252 		goto err_state_unlock;
2253 	}
2254 
2255 	if (init_scm) {
2256 		err = maybe_init_creds(&scm, socket, other);
2257 		if (err)
2258 			goto err_state_unlock;
2259 		init_scm = false;
2260 	}
2261 
2262 	skb = skb_peek_tail(&other->sk_receive_queue);
2263 	if (tail && tail == skb) {
2264 		skb = newskb;
2265 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2266 		if (newskb) {
2267 			skb = newskb;
2268 		} else {
2269 			tail = skb;
2270 			goto alloc_skb;
2271 		}
2272 	} else if (newskb) {
2273 		/* Fast path: the existing tail skb can take the data, so
2274 		 * the speculatively allocated newskb is not needed;
2275 		 * consume_skb() does no harm with newskb == NULL anyway.
2276 		 */
2277 		consume_skb(newskb);
2278 		newskb = NULL;
2279 	}
2280 
2281 	if (skb_append_pagefrags(skb, page, offset, size)) {
2282 		tail = skb;
2283 		goto alloc_skb;
2284 	}
2285 
2286 	skb->len += size;
2287 	skb->data_len += size;
2288 	skb->truesize += size;
2289 	refcount_add(size, &sk->sk_wmem_alloc);
2290 
2291 	if (newskb) {
2292 		err = unix_scm_to_skb(&scm, skb, false);
2293 		if (err)
2294 			goto err_state_unlock;
2295 		spin_lock(&other->sk_receive_queue.lock);
2296 		__skb_queue_tail(&other->sk_receive_queue, newskb);
2297 		spin_unlock(&other->sk_receive_queue.lock);
2298 	}
2299 
2300 	unix_state_unlock(other);
2301 	mutex_unlock(&unix_sk(other)->iolock);
2302 
2303 	other->sk_data_ready(other);
2304 	scm_destroy(&scm);
2305 	return size;
2306 
2307 err_state_unlock:
2308 	unix_state_unlock(other);
2309 err_unlock:
2310 	mutex_unlock(&unix_sk(other)->iolock);
2311 err:
2312 	kfree_skb(newskb);
2313 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2314 		send_sig(SIGPIPE, current, 0);
2315 	if (!init_scm)
2316 		scm_destroy(&scm);
2317 	return err;
2318 }
2319 
2320 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2321 				  size_t len)
2322 {
2323 	int err;
2324 	struct sock *sk = sock->sk;
2325 
2326 	err = sock_error(sk);
2327 	if (err)
2328 		return err;
2329 
2330 	if (sk->sk_state != TCP_ESTABLISHED)
2331 		return -ENOTCONN;
2332 
2333 	if (msg->msg_namelen)
2334 		msg->msg_namelen = 0;
2335 
2336 	return unix_dgram_sendmsg(sock, msg, len);
2337 }
2338 
2339 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2340 				  size_t size, int flags)
2341 {
2342 	struct sock *sk = sock->sk;
2343 
2344 	if (sk->sk_state != TCP_ESTABLISHED)
2345 		return -ENOTCONN;
2346 
2347 	return unix_dgram_recvmsg(sock, msg, size, flags);
2348 }
2349 
2350 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2351 {
2352 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2353 
2354 	if (addr) {
2355 		msg->msg_namelen = addr->len;
2356 		memcpy(msg->msg_name, addr->name, addr->len);
2357 	}
2358 }
2359 
2360 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2361 			 int flags)
2362 {
2363 	struct scm_cookie scm;
2364 	struct socket *sock = sk->sk_socket;
2365 	struct unix_sock *u = unix_sk(sk);
2366 	struct sk_buff *skb, *last;
2367 	long timeo;
2368 	int skip;
2369 	int err;
2370 
2371 	err = -EOPNOTSUPP;
2372 	if (flags&MSG_OOB)
2373 		goto out;
2374 
2375 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2376 
2377 	do {
2378 		mutex_lock(&u->iolock);
2379 
2380 		skip = sk_peek_offset(sk, flags);
2381 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2382 					      &skip, &err, &last);
2383 		if (skb) {
2384 			if (!(flags & MSG_PEEK))
2385 				scm_stat_del(sk, skb);
2386 			break;
2387 		}
2388 
2389 		mutex_unlock(&u->iolock);
2390 
2391 		if (err != -EAGAIN)
2392 			break;
2393 	} while (timeo &&
2394 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2395 					      &err, &timeo, last));
2396 
2397 	if (!skb) { /* implies iolock unlocked */
2398 		unix_state_lock(sk);
2399 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2400 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2401 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2402 			err = 0;
2403 		unix_state_unlock(sk);
2404 		goto out;
2405 	}
2406 
2407 	if (wq_has_sleeper(&u->peer_wait))
2408 		wake_up_interruptible_sync_poll(&u->peer_wait,
2409 						EPOLLOUT | EPOLLWRNORM |
2410 						EPOLLWRBAND);
2411 
2412 	if (msg->msg_name)
2413 		unix_copy_addr(msg, skb->sk);
2414 
2415 	if (size > skb->len - skip)
2416 		size = skb->len - skip;
2417 	else if (size < skb->len - skip)
2418 		msg->msg_flags |= MSG_TRUNC;
2419 
2420 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2421 	if (err)
2422 		goto out_free;
2423 
2424 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2425 		__sock_recv_timestamp(msg, sk, skb);
2426 
2427 	memset(&scm, 0, sizeof(scm));
2428 
2429 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2430 	unix_set_secdata(&scm, skb);
2431 
2432 	if (!(flags & MSG_PEEK)) {
2433 		if (UNIXCB(skb).fp)
2434 			unix_detach_fds(&scm, skb);
2435 
2436 		sk_peek_offset_bwd(sk, skb->len);
2437 	} else {
2438 		/* It is questionable: on PEEK we could:
2439 		   - not return fds - good, but too simple 8)
2440 		   - return fds, and not return them on read (old strategy,
2441 		     apparently wrong)
2442 		   - clone fds (I chose it for now, it is the most universal
2443 		     solution)
2444 
2445 		   POSIX 1003.1g does not actually define this clearly
2446 		   at all. POSIX 1003.1g doesn't define a lot of things
2447 		   clearly however!
2448 
2449 		*/
2450 
2451 		sk_peek_offset_fwd(sk, size);
2452 
2453 		if (UNIXCB(skb).fp)
2454 			unix_peek_fds(&scm, skb);
2455 	}
2456 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2457 
2458 	scm_recv(sock, msg, &scm, flags);
2459 
2460 out_free:
2461 	skb_free_datagram(sk, skb);
2462 	mutex_unlock(&u->iolock);
2463 out:
2464 	return err;
2465 }
2466 
2467 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2468 			      int flags)
2469 {
2470 	struct sock *sk = sock->sk;
2471 
2472 #ifdef CONFIG_BPF_SYSCALL
2473 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2474 
2475 	if (prot != &unix_dgram_proto)
2476 		return prot->recvmsg(sk, msg, size, flags, NULL);
2477 #endif
2478 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2479 }
2480 
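/*
 * Datagram read_sock() callback: pull skbs off the receive queue one at a
 * time (non-blocking) and feed each to recv_actor until the actor stops
 * consuming data, desc->count reaches zero, or no more skbs are queued.
 */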
2481 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
2482 			  sk_read_actor_t recv_actor)
2483 {
2484 	int copied = 0;
2485 
2486 	while (1) {
2487 		struct unix_sock *u = unix_sk(sk);
2488 		struct sk_buff *skb;
2489 		int used, err;
2490 
2491 		mutex_lock(&u->iolock);
2492 		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2493 		mutex_unlock(&u->iolock);
2494 		if (!skb)
2495 			return err;
2496 
2497 		used = recv_actor(desc, skb, 0, skb->len);
2498 		if (used <= 0) {
2499 			if (!copied)
2500 				copied = used;
2501 			kfree_skb(skb);
2502 			break;
2503 		} else if (used <= skb->len) {
2504 			copied += used;
2505 		}
2506 
2507 		kfree_skb(skb);
2508 		if (!desc->count)
2509 			break;
2510 	}
2511 
2512 	return copied;
2513 }
2514 
2515 /*
2516  *	Sleep until more data has arrived, but check for races.
2517  */
2518 static long unix_stream_data_wait(struct sock *sk, long timeo,
2519 				  struct sk_buff *last, unsigned int last_len,
2520 				  bool freezable)
2521 {
2522 	struct sk_buff *tail;
2523 	DEFINE_WAIT(wait);
2524 
2525 	unix_state_lock(sk);
2526 
2527 	for (;;) {
2528 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2529 
2530 		tail = skb_peek_tail(&sk->sk_receive_queue);
2531 		if (tail != last ||
2532 		    (tail && tail->len != last_len) ||
2533 		    sk->sk_err ||
2534 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2535 		    signal_pending(current) ||
2536 		    !timeo)
2537 			break;
2538 
2539 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2540 		unix_state_unlock(sk);
2541 		if (freezable)
2542 			timeo = freezable_schedule_timeout(timeo);
2543 		else
2544 			timeo = schedule_timeout(timeo);
2545 		unix_state_lock(sk);
2546 
2547 		if (sock_flag(sk, SOCK_DEAD))
2548 			break;
2549 
2550 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2551 	}
2552 
2553 	finish_wait(sk_sleep(sk), &wait);
2554 	unix_state_unlock(sk);
2555 	return timeo;
2556 }
2557 
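/*
 * Number of not-yet-consumed bytes in a queued stream skb: stream reads
 * advance UNIXCB(skb).consumed rather than trimming the skb itself.
 */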
2558 static unsigned int unix_skb_len(const struct sk_buff *skb)
2559 {
2560 	return skb->len - UNIXCB(skb).consumed;
2561 }
2562 
2563 struct unix_stream_read_state {
2564 	int (*recv_actor)(struct sk_buff *, int, int,
2565 			  struct unix_stream_read_state *);
2566 	struct socket *socket;
2567 	struct msghdr *msg;
2568 	struct pipe_inode_info *pipe;
2569 	size_t size;
2570 	int flags;
2571 	unsigned int splice_flags;
2572 };
2573 
2574 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
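/*
 * MSG_OOB read: hand the pending OOB byte (if any) to the caller via the
 * recv_actor.  Without MSG_PEEK the mark is cleared, so a second MSG_OOB
 * read fails with -EINVAL; with SOCK_URGINLINE set, MSG_OOB reads are
 * refused as well.
 */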
2575 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2576 {
2577 	struct socket *sock = state->socket;
2578 	struct sock *sk = sock->sk;
2579 	struct unix_sock *u = unix_sk(sk);
2580 	int chunk = 1;
2581 	struct sk_buff *oob_skb;
2582 
2583 	mutex_lock(&u->iolock);
2584 	unix_state_lock(sk);
2585 
2586 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2587 		unix_state_unlock(sk);
2588 		mutex_unlock(&u->iolock);
2589 		return -EINVAL;
2590 	}
2591 
2592 	oob_skb = u->oob_skb;
2593 
2594 	if (!(state->flags & MSG_PEEK))
2595 		WRITE_ONCE(u->oob_skb, NULL);
2596 
2597 	unix_state_unlock(sk);
2598 
2599 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2600 
2601 	if (!(state->flags & MSG_PEEK)) {
2602 		UNIXCB(oob_skb).consumed += 1;
2603 		kfree_skb(oob_skb);
2604 	}
2605 
2606 	mutex_unlock(&u->iolock);
2607 
2608 	if (chunk < 0)
2609 		return -EFAULT;
2610 
2611 	state->msg->msg_flags |= MSG_OOB;
2612 	return 1;
2613 }
2614 
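/*
 * Inspect the skb at the head of the queue during a stream read and decide
 * how to treat a pending OOB byte: an already-consumed skb is unlinked; if
 * some data has been copied we stop in front of the OOB skb; with
 * SOCK_URGINLINE the byte is returned inline (clearing the mark on a
 * non-peeking read); otherwise a non-peeking read unlinks the OOB skb and
 * moves on, leaving the byte to be fetched with MSG_OOB.
 */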
2615 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2616 				  int flags, int copied)
2617 {
2618 	struct unix_sock *u = unix_sk(sk);
2619 
2620 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2621 		skb_unlink(skb, &sk->sk_receive_queue);
2622 		consume_skb(skb);
2623 		skb = NULL;
2624 	} else {
2625 		if (skb == u->oob_skb) {
2626 			if (copied) {
2627 				skb = NULL;
2628 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2629 				if (!(flags & MSG_PEEK)) {
2630 					WRITE_ONCE(u->oob_skb, NULL);
2631 					consume_skb(skb);
2632 				}
2633 			} else if (!(flags & MSG_PEEK)) {
2634 				skb_unlink(skb, &sk->sk_receive_queue);
2635 				consume_skb(skb);
2636 				skb = skb_peek(&sk->sk_receive_queue);
2637 			}
2638 		}
2639 	}
2640 	return skb;
2641 }
2642 #endif
2643 
2644 static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
2645 				 sk_read_actor_t recv_actor)
2646 {
2647 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2648 		return -ENOTCONN;
2649 
2650 	return unix_read_sock(sk, desc, recv_actor);
2651 }
2652 
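/*
 * Common receive path for stream sockets, shared by recvmsg() and
 * splice_read(): the state->recv_actor callback decides where each chunk of
 * data is copied (user iovec or pipe), while this function walks the receive
 * queue and handles credentials, fd passing, MSG_PEEK and blocking.
 */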
2653 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2654 				    bool freezable)
2655 {
2656 	struct scm_cookie scm;
2657 	struct socket *sock = state->socket;
2658 	struct sock *sk = sock->sk;
2659 	struct unix_sock *u = unix_sk(sk);
2660 	int copied = 0;
2661 	int flags = state->flags;
2662 	int noblock = flags & MSG_DONTWAIT;
2663 	bool check_creds = false;
2664 	int target;
2665 	int err = 0;
2666 	long timeo;
2667 	int skip;
2668 	size_t size = state->size;
2669 	unsigned int last_len;
2670 
2671 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2672 		err = -EINVAL;
2673 		goto out;
2674 	}
2675 
2676 	if (unlikely(flags & MSG_OOB)) {
2677 		err = -EOPNOTSUPP;
2678 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2679 		err = unix_stream_recv_urg(state);
2680 #endif
2681 		goto out;
2682 	}
2683 
2684 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2685 	timeo = sock_rcvtimeo(sk, noblock);
2686 
2687 	memset(&scm, 0, sizeof(scm));
2688 
2689 	/* Lock the socket to prevent queue reordering while we
2690 	 * sleep copying data out to the caller.
2691 	 */
2692 	mutex_lock(&u->iolock);
2693 
2694 	skip = max(sk_peek_offset(sk, flags), 0);
2695 
2696 	do {
2697 		int chunk;
2698 		bool drop_skb;
2699 		struct sk_buff *skb, *last;
2700 
2701 redo:
2702 		unix_state_lock(sk);
2703 		if (sock_flag(sk, SOCK_DEAD)) {
2704 			err = -ECONNRESET;
2705 			goto unlock;
2706 		}
2707 		last = skb = skb_peek(&sk->sk_receive_queue);
2708 		last_len = last ? last->len : 0;
2709 
2710 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2711 		if (skb) {
2712 			skb = manage_oob(skb, sk, flags, copied);
2713 			if (!skb) {
2714 				unix_state_unlock(sk);
2715 				if (copied)
2716 					break;
2717 				goto redo;
2718 			}
2719 		}
2720 #endif
2721 again:
2722 		if (skb == NULL) {
2723 			if (copied >= target)
2724 				goto unlock;
2725 
2726 			/*
2727 			 *	POSIX 1003.1g mandates this order.
2728 			 */
2729 
2730 			err = sock_error(sk);
2731 			if (err)
2732 				goto unlock;
2733 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2734 				goto unlock;
2735 
2736 			unix_state_unlock(sk);
2737 			if (!timeo) {
2738 				err = -EAGAIN;
2739 				break;
2740 			}
2741 
2742 			mutex_unlock(&u->iolock);
2743 
2744 			timeo = unix_stream_data_wait(sk, timeo, last,
2745 						      last_len, freezable);
2746 
2747 			if (signal_pending(current)) {
2748 				err = sock_intr_errno(timeo);
2749 				scm_destroy(&scm);
2750 				goto out;
2751 			}
2752 
2753 			mutex_lock(&u->iolock);
2754 			goto redo;
2755 unlock:
2756 			unix_state_unlock(sk);
2757 			break;
2758 		}
2759 
2760 		while (skip >= unix_skb_len(skb)) {
2761 			skip -= unix_skb_len(skb);
2762 			last = skb;
2763 			last_len = skb->len;
2764 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2765 			if (!skb)
2766 				goto again;
2767 		}
2768 
2769 		unix_state_unlock(sk);
2770 
2771 		if (check_creds) {
2772 			/* Never glue messages from different writers */
2773 			if (!unix_skb_scm_eq(skb, &scm))
2774 				break;
2775 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2776 			/* Copy credentials */
2777 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2778 			unix_set_secdata(&scm, skb);
2779 			check_creds = true;
2780 		}
2781 
2782 		/* Copy address just once */
2783 		if (state->msg && state->msg->msg_name) {
2784 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2785 					 state->msg->msg_name);
2786 			unix_copy_addr(state->msg, skb->sk);
2787 			sunaddr = NULL;
2788 		}
2789 
2790 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2791 		skb_get(skb);
2792 		chunk = state->recv_actor(skb, skip, chunk, state);
2793 		drop_skb = !unix_skb_len(skb);
2794 		/* skb is only safe to use if !drop_skb */
2795 		consume_skb(skb);
2796 		if (chunk < 0) {
2797 			if (copied == 0)
2798 				copied = -EFAULT;
2799 			break;
2800 		}
2801 		copied += chunk;
2802 		size -= chunk;
2803 
2804 		if (drop_skb) {
2805 			/* The skb was consumed by a concurrent reader; do not
2806 			 * expect anything more from it and treat it as
2807 			 * invalid - it has certainly been dropped from the
2808 			 * socket queue.
2809 			 *
2810 			 * Report a short read instead.
2811 			 */
2812 			err = 0;
2813 			break;
2814 		}
2815 
2816 		/* Mark read part of skb as used */
2817 		if (!(flags & MSG_PEEK)) {
2818 			UNIXCB(skb).consumed += chunk;
2819 
2820 			sk_peek_offset_bwd(sk, chunk);
2821 
2822 			if (UNIXCB(skb).fp) {
2823 				scm_stat_del(sk, skb);
2824 				unix_detach_fds(&scm, skb);
2825 			}
2826 
2827 			if (unix_skb_len(skb))
2828 				break;
2829 
2830 			skb_unlink(skb, &sk->sk_receive_queue);
2831 			consume_skb(skb);
2832 
2833 			if (scm.fp)
2834 				break;
2835 		} else {
2836 			/* It is questionable, see note in unix_dgram_recvmsg.
2837 			 */
2838 			if (UNIXCB(skb).fp)
2839 				unix_peek_fds(&scm, skb);
2840 
2841 			sk_peek_offset_fwd(sk, chunk);
2842 
2843 			if (UNIXCB(skb).fp)
2844 				break;
2845 
2846 			skip = 0;
2847 			last = skb;
2848 			last_len = skb->len;
2849 			unix_state_lock(sk);
2850 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2851 			if (skb)
2852 				goto again;
2853 			unix_state_unlock(sk);
2854 			break;
2855 		}
2856 	} while (size);
2857 
2858 	mutex_unlock(&u->iolock);
2859 	if (state->msg)
2860 		scm_recv(sock, state->msg, &scm, flags);
2861 	else
2862 		scm_destroy(&scm);
2863 out:
2864 	return copied ? : err;
2865 }
2866 
2867 static int unix_stream_read_actor(struct sk_buff *skb,
2868 				  int skip, int chunk,
2869 				  struct unix_stream_read_state *state)
2870 {
2871 	int ret;
2872 
2873 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2874 				    state->msg, chunk);
2875 	return ret ?: chunk;
2876 }
2877 
2878 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2879 			  size_t size, int flags)
2880 {
2881 	struct unix_stream_read_state state = {
2882 		.recv_actor = unix_stream_read_actor,
2883 		.socket = sk->sk_socket,
2884 		.msg = msg,
2885 		.size = size,
2886 		.flags = flags
2887 	};
2888 
2889 	return unix_stream_read_generic(&state, true);
2890 }
2891 
2892 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2893 			       size_t size, int flags)
2894 {
2895 	struct unix_stream_read_state state = {
2896 		.recv_actor = unix_stream_read_actor,
2897 		.socket = sock,
2898 		.msg = msg,
2899 		.size = size,
2900 		.flags = flags
2901 	};
2902 
2903 #ifdef CONFIG_BPF_SYSCALL
2904 	struct sock *sk = sock->sk;
2905 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2906 
2907 	if (prot != &unix_stream_proto)
2908 		return prot->recvmsg(sk, msg, size, flags, NULL);
2909 #endif
2910 	return unix_stream_read_generic(&state, true);
2911 }
2912 
2913 static int unix_stream_splice_actor(struct sk_buff *skb,
2914 				    int skip, int chunk,
2915 				    struct unix_stream_read_state *state)
2916 {
2917 	return skb_splice_bits(skb, state->socket->sk,
2918 			       UNIXCB(skb).consumed + skip,
2919 			       state->pipe, chunk, state->splice_flags);
2920 }
2921 
2922 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2923 				       struct pipe_inode_info *pipe,
2924 				       size_t size, unsigned int flags)
2925 {
2926 	struct unix_stream_read_state state = {
2927 		.recv_actor = unix_stream_splice_actor,
2928 		.socket = sock,
2929 		.pipe = pipe,
2930 		.size = size,
2931 		.splice_flags = flags,
2932 	};
2933 
2934 	if (unlikely(*ppos))
2935 		return -ESPIPE;
2936 
2937 	if (sock->file->f_flags & O_NONBLOCK ||
2938 	    flags & SPLICE_F_NONBLOCK)
2939 		state.flags = MSG_DONTWAIT;
2940 
2941 	return unix_stream_read_generic(&state, false);
2942 }
2943 
2944 static int unix_shutdown(struct socket *sock, int mode)
2945 {
2946 	struct sock *sk = sock->sk;
2947 	struct sock *other;
2948 
2949 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2950 		return -EINVAL;
2951 	/* This maps:
2952 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2953 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2954 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2955 	 */
2956 	++mode;
2957 
2958 	unix_state_lock(sk);
2959 	sk->sk_shutdown |= mode;
2960 	other = unix_peer(sk);
2961 	if (other)
2962 		sock_hold(other);
2963 	unix_state_unlock(sk);
2964 	sk->sk_state_change(sk);
2965 
2966 	if (other &&
2967 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2968 
2969 		int peer_mode = 0;
2970 		const struct proto *prot = READ_ONCE(other->sk_prot);
2971 
2972 		if (prot->unhash)
2973 			prot->unhash(other);
2974 		if (mode&RCV_SHUTDOWN)
2975 			peer_mode |= SEND_SHUTDOWN;
2976 		if (mode&SEND_SHUTDOWN)
2977 			peer_mode |= RCV_SHUTDOWN;
2978 		unix_state_lock(other);
2979 		other->sk_shutdown |= peer_mode;
2980 		unix_state_unlock(other);
2981 		other->sk_state_change(other);
2982 		if (peer_mode == SHUTDOWN_MASK)
2983 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2984 		else if (peer_mode & RCV_SHUTDOWN)
2985 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2986 	}
2987 	if (other)
2988 		sock_put(other);
2989 
2990 	return 0;
2991 }
2992 
2993 long unix_inq_len(struct sock *sk)
2994 {
2995 	struct sk_buff *skb;
2996 	long amount = 0;
2997 
2998 	if (sk->sk_state == TCP_LISTEN)
2999 		return -EINVAL;
3000 
3001 	spin_lock(&sk->sk_receive_queue.lock);
3002 	if (sk->sk_type == SOCK_STREAM ||
3003 	    sk->sk_type == SOCK_SEQPACKET) {
3004 		skb_queue_walk(&sk->sk_receive_queue, skb)
3005 			amount += unix_skb_len(skb);
3006 	} else {
3007 		skb = skb_peek(&sk->sk_receive_queue);
3008 		if (skb)
3009 			amount = skb->len;
3010 	}
3011 	spin_unlock(&sk->sk_receive_queue.lock);
3012 
3013 	return amount;
3014 }
3015 EXPORT_SYMBOL_GPL(unix_inq_len);
3016 
3017 long unix_outq_len(struct sock *sk)
3018 {
3019 	return sk_wmem_alloc_get(sk);
3020 }
3021 EXPORT_SYMBOL_GPL(unix_outq_len);
3022 
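/*
 * SIOCUNIXFILE: give the caller an O_PATH file descriptor referring to the
 * filesystem object this socket is bound to.  Requires CAP_NET_ADMIN
 * (checked against the socket's network namespace) and fails for unbound or
 * abstract sockets.
 */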
3023 static int unix_open_file(struct sock *sk)
3024 {
3025 	struct path path;
3026 	struct file *f;
3027 	int fd;
3028 
3029 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3030 		return -EPERM;
3031 
3032 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3033 		return -ENOENT;
3034 
3035 	path = unix_sk(sk)->path;
3036 	if (!path.dentry)
3037 		return -ENOENT;
3038 
3039 	path_get(&path);
3040 
3041 	fd = get_unused_fd_flags(O_CLOEXEC);
3042 	if (fd < 0)
3043 		goto out;
3044 
3045 	f = dentry_open(&path, O_PATH, current_cred());
3046 	if (IS_ERR(f)) {
3047 		put_unused_fd(fd);
3048 		fd = PTR_ERR(f);
3049 		goto out;
3050 	}
3051 
3052 	fd_install(fd, f);
3053 out:
3054 	path_put(&path);
3055 
3056 	return fd;
3057 }
3058 
3059 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3060 {
3061 	struct sock *sk = sock->sk;
3062 	long amount = 0;
3063 	int err;
3064 
3065 	switch (cmd) {
3066 	case SIOCOUTQ:
3067 		amount = unix_outq_len(sk);
3068 		err = put_user(amount, (int __user *)arg);
3069 		break;
3070 	case SIOCINQ:
3071 		amount = unix_inq_len(sk);
3072 		if (amount < 0)
3073 			err = amount;
3074 		else
3075 			err = put_user(amount, (int __user *)arg);
3076 		break;
3077 	case SIOCUNIXFILE:
3078 		err = unix_open_file(sk);
3079 		break;
3080 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
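	/* SIOCATMARK: report 1 when the skb at the head of the receive
	 * queue is the pending OOB skb, i.e. the next read stops at the mark.
	 */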
3081 	case SIOCATMARK:
3082 		{
3083 			struct sk_buff *skb;
3084 			int answ = 0;
3085 
3086 			skb = skb_peek(&sk->sk_receive_queue);
3087 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3088 				answ = 1;
3089 			err = put_user(answ, (int __user *)arg);
3090 		}
3091 		break;
3092 #endif
3093 	default:
3094 		err = -ENOIOCTLCMD;
3095 		break;
3096 	}
3097 	return err;
3098 }
3099 
3100 #ifdef CONFIG_COMPAT
3101 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3102 {
3103 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3104 }
3105 #endif
3106 
3107 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3108 {
3109 	struct sock *sk = sock->sk;
3110 	__poll_t mask;
3111 
3112 	sock_poll_wait(file, sock, wait);
3113 	mask = 0;
3114 
3115 	/* exceptional events? */
3116 	if (sk->sk_err)
3117 		mask |= EPOLLERR;
3118 	if (sk->sk_shutdown == SHUTDOWN_MASK)
3119 		mask |= EPOLLHUP;
3120 	if (sk->sk_shutdown & RCV_SHUTDOWN)
3121 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3122 
3123 	/* readable? */
3124 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3125 		mask |= EPOLLIN | EPOLLRDNORM;
3126 	if (sk_is_readable(sk))
3127 		mask |= EPOLLIN | EPOLLRDNORM;
3128 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3129 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3130 		mask |= EPOLLPRI;
3131 #endif
3132 
3133 	/* Connection-based sockets need to check for termination and startup */
3134 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3135 	    sk->sk_state == TCP_CLOSE)
3136 		mask |= EPOLLHUP;
3137 
3138 	/*
3139 	 * we set writable also when the other side has shut down the
3140 	 * connection. This prevents stuck sockets.
3141 	 */
3142 	if (unix_writable(sk))
3143 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3144 
3145 	return mask;
3146 }
3147 
3148 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3149 				    poll_table *wait)
3150 {
3151 	struct sock *sk = sock->sk, *other;
3152 	unsigned int writable;
3153 	__poll_t mask;
3154 
3155 	sock_poll_wait(file, sock, wait);
3156 	mask = 0;
3157 
3158 	/* exceptional events? */
3159 	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3160 		mask |= EPOLLERR |
3161 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3162 
3163 	if (sk->sk_shutdown & RCV_SHUTDOWN)
3164 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3165 	if (sk->sk_shutdown == SHUTDOWN_MASK)
3166 		mask |= EPOLLHUP;
3167 
3168 	/* readable? */
3169 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3170 		mask |= EPOLLIN | EPOLLRDNORM;
3171 	if (sk_is_readable(sk))
3172 		mask |= EPOLLIN | EPOLLRDNORM;
3173 
3174 	/* Connection-based sockets need to check for termination and startup */
3175 	if (sk->sk_type == SOCK_SEQPACKET) {
3176 		if (sk->sk_state == TCP_CLOSE)
3177 			mask |= EPOLLHUP;
3178 		/* connection hasn't started yet? */
3179 		if (sk->sk_state == TCP_SYN_SENT)
3180 			return mask;
3181 	}
3182 
3183 	/* No write status requested, avoid expensive OUT tests. */
3184 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3185 		return mask;
3186 
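	/* A connected datagram socket is also reported as not writable when
	 * the peer's receive queue is full; unix_dgram_peer_wake_me()
	 * registers us on the peer's wait queue so we are woken up once
	 * space becomes available.
	 */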
3187 	writable = unix_writable(sk);
3188 	if (writable) {
3189 		unix_state_lock(sk);
3190 
3191 		other = unix_peer(sk);
3192 		if (other && unix_peer(other) != sk &&
3193 		    unix_recvq_full_lockless(other) &&
3194 		    unix_dgram_peer_wake_me(sk, other))
3195 			writable = 0;
3196 
3197 		unix_state_unlock(sk);
3198 	}
3199 
3200 	if (writable)
3201 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3202 	else
3203 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3204 
3205 	return mask;
3206 }
3207 
3208 #ifdef CONFIG_PROC_FS
3209 
3210 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3211 
3212 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3213 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3214 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
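
/*
 * The seq_file position encodes both the hash bucket and a 1-based offset
 * within that bucket: the low BUCKET_SPACE bits hold the offset, the
 * remaining high bits hold the bucket number.
 */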
3215 
3216 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3217 {
3218 	unsigned long offset = get_offset(*pos);
3219 	unsigned long bucket = get_bucket(*pos);
3220 	unsigned long count = 0;
3221 	struct sock *sk;
3222 
3223 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3224 	     sk; sk = sk_next(sk)) {
3225 		if (++count == offset)
3226 			break;
3227 	}
3228 
3229 	return sk;
3230 }
3231 
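/*
 * Find the first socket at or after *pos.  On success the bucket spinlock
 * is left held; it is released by unix_get_next() when moving to the next
 * bucket, or by unix_seq_stop().
 */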
3232 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3233 {
3234 	unsigned long bucket = get_bucket(*pos);
3235 	struct net *net = seq_file_net(seq);
3236 	struct sock *sk;
3237 
3238 	while (bucket < UNIX_HASH_SIZE) {
3239 		spin_lock(&net->unx.table.locks[bucket]);
3240 
3241 		sk = unix_from_bucket(seq, pos);
3242 		if (sk)
3243 			return sk;
3244 
3245 		spin_unlock(&net->unx.table.locks[bucket]);
3246 
3247 		*pos = set_bucket_offset(++bucket, 1);
3248 	}
3249 
3250 	return NULL;
3251 }
3252 
3253 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3254 				  loff_t *pos)
3255 {
3256 	unsigned long bucket = get_bucket(*pos);
3257 
3258 	sk = sk_next(sk);
3259 	if (sk)
3260 		return sk;
3261 
3262 
3263 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3264 
3265 	*pos = set_bucket_offset(++bucket, 1);
3266 
3267 	return unix_get_first(seq, pos);
3268 }
3269 
3270 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3271 {
3272 	if (!*pos)
3273 		return SEQ_START_TOKEN;
3274 
3275 	return unix_get_first(seq, pos);
3276 }
3277 
3278 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3279 {
3280 	++*pos;
3281 
3282 	if (v == SEQ_START_TOKEN)
3283 		return unix_get_first(seq, pos);
3284 
3285 	return unix_get_next(seq, v, pos);
3286 }
3287 
3288 static void unix_seq_stop(struct seq_file *seq, void *v)
3289 {
3290 	struct sock *sk = v;
3291 
3292 	if (sk)
3293 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3294 }
3295 
3296 static int unix_seq_show(struct seq_file *seq, void *v)
3297 {
3298 
3299 	if (v == SEQ_START_TOKEN)
3300 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3301 			 "Inode Path\n");
3302 	else {
3303 		struct sock *s = v;
3304 		struct unix_sock *u = unix_sk(s);
3305 		unix_state_lock(s);
3306 
3307 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3308 			s,
3309 			refcount_read(&s->sk_refcnt),
3310 			0,
3311 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3312 			s->sk_type,
3313 			s->sk_socket ?
3314 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3315 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3316 			sock_i_ino(s));
3317 
3318 		if (u->addr) {	/* under a hash table lock here */
3319 			int i, len;
3320 			seq_putc(seq, ' ');
3321 
3322 			i = 0;
3323 			len = u->addr->len -
3324 				offsetof(struct sockaddr_un, sun_path);
3325 			if (u->addr->name->sun_path[0]) {
3326 				len--;
3327 			} else {
3328 				seq_putc(seq, '@');
3329 				i++;
3330 			}
3331 			for ( ; i < len; i++)
3332 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3333 					 '@');
3334 		}
3335 		unix_state_unlock(s);
3336 		seq_putc(seq, '\n');
3337 	}
3338 
3339 	return 0;
3340 }
3341 
3342 static const struct seq_operations unix_seq_ops = {
3343 	.start  = unix_seq_start,
3344 	.next   = unix_seq_next,
3345 	.stop   = unix_seq_stop,
3346 	.show   = unix_seq_show,
3347 };
3348 
3349 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3350 struct bpf_unix_iter_state {
3351 	struct seq_net_private p;
3352 	unsigned int cur_sk;
3353 	unsigned int end_sk;
3354 	unsigned int max_sk;
3355 	struct sock **batch;
3356 	bool st_bucket_done;
3357 };
3358 
3359 struct bpf_iter__unix {
3360 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3361 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3362 	uid_t uid __aligned(8);
3363 };
3364 
3365 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3366 			      struct unix_sock *unix_sk, uid_t uid)
3367 {
3368 	struct bpf_iter__unix ctx;
3369 
3370 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3371 	ctx.meta = meta;
3372 	ctx.unix_sk = unix_sk;
3373 	ctx.uid = uid;
3374 	return bpf_iter_run_prog(prog, &ctx);
3375 }
3376 
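/*
 * Grab references to as many sockets from the current bucket as fit in
 * iter->batch (starting at start_sk), then drop the bucket lock.  The
 * return value is the number of sockets actually found in the bucket, so
 * the caller can resize the batch and retry if it was too small.
 */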
3377 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3378 
3379 {
3380 	struct bpf_unix_iter_state *iter = seq->private;
3381 	unsigned int expected = 1;
3382 	struct sock *sk;
3383 
3384 	sock_hold(start_sk);
3385 	iter->batch[iter->end_sk++] = start_sk;
3386 
3387 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3388 		if (iter->end_sk < iter->max_sk) {
3389 			sock_hold(sk);
3390 			iter->batch[iter->end_sk++] = sk;
3391 		}
3392 
3393 		expected++;
3394 	}
3395 
3396 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3397 
3398 	return expected;
3399 }
3400 
3401 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3402 {
3403 	while (iter->cur_sk < iter->end_sk)
3404 		sock_put(iter->batch[iter->cur_sk++]);
3405 }
3406 
3407 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3408 				       unsigned int new_batch_sz)
3409 {
3410 	struct sock **new_batch;
3411 
3412 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3413 			     GFP_USER | __GFP_NOWARN);
3414 	if (!new_batch)
3415 		return -ENOMEM;
3416 
3417 	bpf_iter_unix_put_batch(iter);
3418 	kvfree(iter->batch);
3419 	iter->batch = new_batch;
3420 	iter->max_sk = new_batch_sz;
3421 
3422 	return 0;
3423 }
3424 
3425 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3426 					loff_t *pos)
3427 {
3428 	struct bpf_unix_iter_state *iter = seq->private;
3429 	unsigned int expected;
3430 	bool resized = false;
3431 	struct sock *sk;
3432 
3433 	if (iter->st_bucket_done)
3434 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3435 
3436 again:
3437 	/* Get a new batch */
3438 	iter->cur_sk = 0;
3439 	iter->end_sk = 0;
3440 
3441 	sk = unix_get_first(seq, pos);
3442 	if (!sk)
3443 		return NULL; /* Done */
3444 
3445 	expected = bpf_iter_unix_hold_batch(seq, sk);
3446 
3447 	if (iter->end_sk == expected) {
3448 		iter->st_bucket_done = true;
3449 		return sk;
3450 	}
3451 
3452 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3453 		resized = true;
3454 		goto again;
3455 	}
3456 
3457 	return sk;
3458 }
3459 
3460 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3461 {
3462 	if (!*pos)
3463 		return SEQ_START_TOKEN;
3464 
3465 	/* bpf iter does not support lseek, so it always
3466 	 * continues from where it was stop()-ped.
3467 	 */
3468 	return bpf_iter_unix_batch(seq, pos);
3469 }
3470 
3471 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3472 {
3473 	struct bpf_unix_iter_state *iter = seq->private;
3474 	struct sock *sk;
3475 
3476 	/* Whenever seq_next() is called, the socket at iter->cur_sk
3477 	 * is done with seq_show(), so release it and advance to the
3478 	 * next sk in the batch.
3479 	 */
3480 	if (iter->cur_sk < iter->end_sk)
3481 		sock_put(iter->batch[iter->cur_sk++]);
3482 
3483 	++*pos;
3484 
3485 	if (iter->cur_sk < iter->end_sk)
3486 		sk = iter->batch[iter->cur_sk];
3487 	else
3488 		sk = bpf_iter_unix_batch(seq, pos);
3489 
3490 	return sk;
3491 }
3492 
3493 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3494 {
3495 	struct bpf_iter_meta meta;
3496 	struct bpf_prog *prog;
3497 	struct sock *sk = v;
3498 	uid_t uid;
3499 	bool slow;
3500 	int ret;
3501 
3502 	if (v == SEQ_START_TOKEN)
3503 		return 0;
3504 
3505 	slow = lock_sock_fast(sk);
3506 
3507 	if (unlikely(sk_unhashed(sk))) {
3508 		ret = SEQ_SKIP;
3509 		goto unlock;
3510 	}
3511 
3512 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3513 	meta.seq = seq;
3514 	prog = bpf_iter_get_info(&meta, false);
3515 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3516 unlock:
3517 	unlock_sock_fast(sk, slow);
3518 	return ret;
3519 }
3520 
3521 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3522 {
3523 	struct bpf_unix_iter_state *iter = seq->private;
3524 	struct bpf_iter_meta meta;
3525 	struct bpf_prog *prog;
3526 
3527 	if (!v) {
3528 		meta.seq = seq;
3529 		prog = bpf_iter_get_info(&meta, true);
3530 		if (prog)
3531 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3532 	}
3533 
3534 	if (iter->cur_sk < iter->end_sk)
3535 		bpf_iter_unix_put_batch(iter);
3536 }
3537 
3538 static const struct seq_operations bpf_iter_unix_seq_ops = {
3539 	.start	= bpf_iter_unix_seq_start,
3540 	.next	= bpf_iter_unix_seq_next,
3541 	.stop	= bpf_iter_unix_seq_stop,
3542 	.show	= bpf_iter_unix_seq_show,
3543 };
3544 #endif
3545 #endif
3546 
3547 static const struct net_proto_family unix_family_ops = {
3548 	.family = PF_UNIX,
3549 	.create = unix_create,
3550 	.owner	= THIS_MODULE,
3551 };
3552 
3553 
3554 static int __net_init unix_net_init(struct net *net)
3555 {
3556 	int i;
3557 
3558 	net->unx.sysctl_max_dgram_qlen = 10;
3559 	if (unix_sysctl_register(net))
3560 		goto out;
3561 
3562 #ifdef CONFIG_PROC_FS
3563 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3564 			     sizeof(struct seq_net_private)))
3565 		goto err_sysctl;
3566 #endif
3567 
3568 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3569 					      sizeof(spinlock_t), GFP_KERNEL);
3570 	if (!net->unx.table.locks)
3571 		goto err_proc;
3572 
3573 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3574 						sizeof(struct hlist_head),
3575 						GFP_KERNEL);
3576 	if (!net->unx.table.buckets)
3577 		goto free_locks;
3578 
3579 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3580 		spin_lock_init(&net->unx.table.locks[i]);
3581 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3582 	}
3583 
3584 	return 0;
3585 
3586 free_locks:
3587 	kvfree(net->unx.table.locks);
3588 err_proc:
3589 #ifdef CONFIG_PROC_FS
3590 	remove_proc_entry("unix", net->proc_net);
3591 err_sysctl:
3592 #endif
3593 	unix_sysctl_unregister(net);
3594 out:
3595 	return -ENOMEM;
3596 }
3597 
3598 static void __net_exit unix_net_exit(struct net *net)
3599 {
3600 	kvfree(net->unx.table.buckets);
3601 	kvfree(net->unx.table.locks);
3602 	unix_sysctl_unregister(net);
3603 	remove_proc_entry("unix", net->proc_net);
3604 }
3605 
3606 static struct pernet_operations unix_net_ops = {
3607 	.init = unix_net_init,
3608 	.exit = unix_net_exit,
3609 };
3610 
3611 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3612 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3613 		     struct unix_sock *unix_sk, uid_t uid)
3614 
3615 #define INIT_BATCH_SZ 16
3616 
3617 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3618 {
3619 	struct bpf_unix_iter_state *iter = priv_data;
3620 	int err;
3621 
3622 	err = bpf_iter_init_seq_net(priv_data, aux);
3623 	if (err)
3624 		return err;
3625 
3626 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3627 	if (err) {
3628 		bpf_iter_fini_seq_net(priv_data);
3629 		return err;
3630 	}
3631 
3632 	return 0;
3633 }
3634 
3635 static void bpf_iter_fini_unix(void *priv_data)
3636 {
3637 	struct bpf_unix_iter_state *iter = priv_data;
3638 
3639 	bpf_iter_fini_seq_net(priv_data);
3640 	kvfree(iter->batch);
3641 }
3642 
3643 static const struct bpf_iter_seq_info unix_seq_info = {
3644 	.seq_ops		= &bpf_iter_unix_seq_ops,
3645 	.init_seq_private	= bpf_iter_init_unix,
3646 	.fini_seq_private	= bpf_iter_fini_unix,
3647 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3648 };
3649 
3650 static const struct bpf_func_proto *
3651 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3652 			     const struct bpf_prog *prog)
3653 {
3654 	switch (func_id) {
3655 	case BPF_FUNC_setsockopt:
3656 		return &bpf_sk_setsockopt_proto;
3657 	case BPF_FUNC_getsockopt:
3658 		return &bpf_sk_getsockopt_proto;
3659 	default:
3660 		return NULL;
3661 	}
3662 }
3663 
3664 static struct bpf_iter_reg unix_reg_info = {
3665 	.target			= "unix",
3666 	.ctx_arg_info_size	= 1,
3667 	.ctx_arg_info		= {
3668 		{ offsetof(struct bpf_iter__unix, unix_sk),
3669 		  PTR_TO_BTF_ID_OR_NULL },
3670 	},
3671 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3672 	.seq_info		= &unix_seq_info,
3673 };
3674 
3675 static void __init bpf_iter_register(void)
3676 {
3677 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3678 	if (bpf_iter_reg_target(&unix_reg_info))
3679 		pr_warn("Warning: could not register bpf iterator unix\n");
3680 }
3681 #endif
3682 
3683 static int __init af_unix_init(void)
3684 {
3685 	int rc = -1;
3686 
3687 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3688 
3689 	rc = proto_register(&unix_dgram_proto, 1);
3690 	if (rc != 0) {
3691 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3692 		goto out;
3693 	}
3694 
3695 	rc = proto_register(&unix_stream_proto, 1);
3696 	if (rc != 0) {
3697 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3698 		goto out;
3699 	}
3700 
3701 	sock_register(&unix_family_ops);
3702 	register_pernet_subsys(&unix_net_ops);
3703 	unix_bpf_build_proto();
3704 
3705 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3706 	bpf_iter_register();
3707 #endif
3708 
3709 out:
3710 	return rc;
3711 }
3712 
3713 static void __exit af_unix_exit(void)
3714 {
3715 	sock_unregister(PF_UNIX);
3716 	proto_unregister(&unix_dgram_proto);
3717 	proto_unregister(&unix_stream_proto);
3718 	unregister_pernet_subsys(&unix_net_ops);
3719 }
3720 
3721 /* Earlier than device_initcall() so that other drivers invoking
3722    request_module() don't end up in a loop when modprobe tries
3723    to use a UNIX socket. But later than subsys_initcall() because
3724    we depend on infrastructure initialised there. */
3725 fs_initcall(af_unix_init);
3726 module_exit(af_unix_exit);
3727 
3728 MODULE_LICENSE("GPL");
3729 MODULE_ALIAS_NETPROTO(PF_UNIX);
3730