xref: /linux/net/unix/af_unix.c (revision e95ab1d852897a0b697cd0fb609d496ce97fff3a)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listening socket
32  *					has been reached. This won't break
33  *					old apps and it avoids a huge number
34  *					of socks being hashed (for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skbs queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and gives the blksize as a high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with a 0 byte, so that this name space does not
75  *		  intersect with BSD names.
76  */
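/* A minimal userspace sketch (illustrative only, not part of this file;
 * it assumes the usual libc headers) of the abstract namespace described
 * above: the address starts with a 0 byte and the address length covers
 * exactly the bytes of the name.  The name "example.sock" is a placeholder.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int bind_abstract(int fd)
 *	{
 *		struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *		static const char name[] = "example.sock";
 *
 *		sun.sun_path[0] = '\0';		// leading 0 byte: abstract, not FS based
 *		memcpy(sun.sun_path + 1, name, sizeof(name) - 1);
 *		return bind(fd, (struct sockaddr *)&sun,
 *			    offsetof(struct sockaddr_un, sun_path) + 1 + sizeof(name) - 1);
 *	}
 */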
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/freezer.h>
116 #include <linux/file.h>
117 #include <linux/btf_ids.h>
118 
119 #include "scm.h"
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return hash & UNIX_HASH_MOD;
157 }
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	/* hash1 and hash2 are never the same because
163 	 * one is between 0 and UNIX_HASH_MOD, and
164 	 * the other is between UNIX_HASH_MOD + 1 and UNIX_HASH_SIZE - 1.
165 	 */
166 	if (hash1 > hash2)
167 		swap(hash1, hash2);
168 
169 	spin_lock(&net->unx.table.locks[hash1]);
170 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
171 }
172 
173 static void unix_table_double_unlock(struct net *net,
174 				     unsigned int hash1, unsigned int hash2)
175 {
176 	spin_unlock(&net->unx.table.locks[hash1]);
177 	spin_unlock(&net->unx.table.locks[hash2]);
178 }
179 
180 #ifdef CONFIG_SECURITY_NETWORK
181 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
182 {
183 	UNIXCB(skb).secid = scm->secid;
184 }
185 
186 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
187 {
188 	scm->secid = UNIXCB(skb).secid;
189 }
190 
191 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
192 {
193 	return (scm->secid == UNIXCB(skb).secid);
194 }
195 #else
196 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
197 { }
198 
199 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
200 { }
201 
202 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
203 {
204 	return true;
205 }
206 #endif /* CONFIG_SECURITY_NETWORK */
207 
208 #define unix_peer(sk) (unix_sk(sk)->peer)
209 
210 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
211 {
212 	return unix_peer(osk) == sk;
213 }
214 
215 static inline int unix_may_send(struct sock *sk, struct sock *osk)
216 {
217 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
218 }
219 
220 static inline int unix_recvq_full(const struct sock *sk)
221 {
222 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
223 }
224 
225 static inline int unix_recvq_full_lockless(const struct sock *sk)
226 {
227 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
228 		READ_ONCE(sk->sk_max_ack_backlog);
229 }
230 
231 struct sock *unix_peer_get(struct sock *s)
232 {
233 	struct sock *peer;
234 
235 	unix_state_lock(s);
236 	peer = unix_peer(s);
237 	if (peer)
238 		sock_hold(peer);
239 	unix_state_unlock(s);
240 	return peer;
241 }
242 EXPORT_SYMBOL_GPL(unix_peer_get);
243 
244 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
245 					     int addr_len)
246 {
247 	struct unix_address *addr;
248 
249 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
250 	if (!addr)
251 		return NULL;
252 
253 	refcount_set(&addr->refcnt, 1);
254 	addr->len = addr_len;
255 	memcpy(addr->name, sunaddr, addr_len);
256 
257 	return addr;
258 }
259 
260 static inline void unix_release_addr(struct unix_address *addr)
261 {
262 	if (refcount_dec_and_test(&addr->refcnt))
263 		kfree(addr);
264 }
265 
266 /*
267  *	Check unix socket name:
268  *		- it should not be zero length.
269  *	        - if it does not start with a zero byte, it should be NUL terminated (FS object)
270  *		- if it starts with a zero byte, it is an abstract name.
271  */
272 
273 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
274 {
275 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
276 	    addr_len > sizeof(*sunaddr))
277 		return -EINVAL;
278 
279 	if (sunaddr->sun_family != AF_UNIX)
280 		return -EINVAL;
281 
282 	return 0;
283 }
284 
285 static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
286 {
287 	/* This may look like an off by one error but it is a bit more
288 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
289 	 * sun_path[108] doesn't as such exist.  However in kernel space
290 	 * we are guaranteed that it is a valid memory location in our
291 	 * kernel address buffer because syscall functions always pass
292 	 * a pointer to a struct sockaddr_storage, which has a bigger buffer
293 	 * than 108 bytes.
294 	 */
295 	((char *)sunaddr)[addr_len] = 0;
296 }
297 
298 static void __unix_remove_socket(struct sock *sk)
299 {
300 	sk_del_node_init(sk);
301 }
302 
303 static void __unix_insert_socket(struct net *net, struct sock *sk)
304 {
305 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
306 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
307 }
308 
309 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
310 				 struct unix_address *addr, unsigned int hash)
311 {
312 	__unix_remove_socket(sk);
313 	smp_store_release(&unix_sk(sk)->addr, addr);
314 
315 	sk->sk_hash = hash;
316 	__unix_insert_socket(net, sk);
317 }
318 
319 static void unix_remove_socket(struct net *net, struct sock *sk)
320 {
321 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
322 	__unix_remove_socket(sk);
323 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
324 }
325 
326 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
327 {
328 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
329 	__unix_insert_socket(net, sk);
330 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
331 }
332 
333 static void unix_insert_bsd_socket(struct sock *sk)
334 {
335 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
336 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
337 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
338 }
339 
340 static void unix_remove_bsd_socket(struct sock *sk)
341 {
342 	if (!hlist_unhashed(&sk->sk_bind_node)) {
343 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
344 		__sk_del_bind_node(sk);
345 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
346 
347 		sk_node_init(&sk->sk_bind_node);
348 	}
349 }
350 
351 static struct sock *__unix_find_socket_byname(struct net *net,
352 					      struct sockaddr_un *sunname,
353 					      int len, unsigned int hash)
354 {
355 	struct sock *s;
356 
357 	sk_for_each(s, &net->unx.table.buckets[hash]) {
358 		struct unix_sock *u = unix_sk(s);
359 
360 		if (u->addr->len == len &&
361 		    !memcmp(u->addr->name, sunname, len))
362 			return s;
363 	}
364 	return NULL;
365 }
366 
367 static inline struct sock *unix_find_socket_byname(struct net *net,
368 						   struct sockaddr_un *sunname,
369 						   int len, unsigned int hash)
370 {
371 	struct sock *s;
372 
373 	spin_lock(&net->unx.table.locks[hash]);
374 	s = __unix_find_socket_byname(net, sunname, len, hash);
375 	if (s)
376 		sock_hold(s);
377 	spin_unlock(&net->unx.table.locks[hash]);
378 	return s;
379 }
380 
381 static struct sock *unix_find_socket_byinode(struct inode *i)
382 {
383 	unsigned int hash = unix_bsd_hash(i);
384 	struct sock *s;
385 
386 	spin_lock(&bsd_socket_locks[hash]);
387 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
388 		struct dentry *dentry = unix_sk(s)->path.dentry;
389 
390 		if (dentry && d_backing_inode(dentry) == i) {
391 			sock_hold(s);
392 			spin_unlock(&bsd_socket_locks[hash]);
393 			return s;
394 		}
395 	}
396 	spin_unlock(&bsd_socket_locks[hash]);
397 	return NULL;
398 }
399 
400 /* Support code for asymmetrically connected dgram sockets
401  *
402  * If a datagram socket is connected to a socket not itself connected
403  * to the first socket (eg, /dev/log), clients may only enqueue more
404  * messages if the present receive queue of the server socket is not
405  * "too large". This means there's a second writeability condition
406  * poll and sendmsg need to test. The dgram recv code will do a wake
407  * up on the peer_wait wait queue of a socket upon reception of a
408  * datagram which needs to be propagated to sleeping would-be writers
409  * since these might not have sent anything so far. This can't be
410  * accomplished via poll_wait because the lifetime of the server
411  * socket might be less than that of its clients if they break their
412  * association with it, or if the server socket is closed while clients
413  * are still connected to it, and there's no way to inform "a polling
414  * implementation" that it should let go of a certain wait queue.
415  *
416  * In order to propagate a wake up, a wait_queue_entry_t of the client
417  * socket is enqueued on the peer_wait queue of the server socket
418  * whose wake function does a wake_up on the ordinary client socket
419  * wait queue. This connection is established whenever a write (or
420  * poll for write) hits the flow control condition and is broken when the
421  * association to the server socket is dissolved or after a wake up
422  * was relayed.
423  */
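/* A userspace sketch (illustrative only) of the situation served by the
 * machinery below; "/dev/log" is the example receiver mentioned above and
 * stands in for any datagram socket with many writers.  A client connects
 * its SOCK_DGRAM socket and polls for POLLOUT, which must become true
 * again once the receiver drains its queue; that is what the peer_wait
 * relay provides.
 *
 *	#include <poll.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int connect_log(int fd)
 *	{
 *		struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *		strncpy(sun.sun_path, "/dev/log", sizeof(sun.sun_path) - 1);
 *		return connect(fd, (struct sockaddr *)&sun, sizeof(sun));
 *	}
 *
 *	static int wait_writable(int fd, int timeout_ms)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *		return poll(&pfd, 1, timeout_ms);
 *	}
 */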
424 
425 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
426 				      void *key)
427 {
428 	struct unix_sock *u;
429 	wait_queue_head_t *u_sleep;
430 
431 	u = container_of(q, struct unix_sock, peer_wake);
432 
433 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
434 			    q);
435 	u->peer_wake.private = NULL;
436 
437 	/* relaying can only happen while the wq still exists */
438 	u_sleep = sk_sleep(&u->sk);
439 	if (u_sleep)
440 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
441 
442 	return 0;
443 }
444 
445 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
446 {
447 	struct unix_sock *u, *u_other;
448 	int rc;
449 
450 	u = unix_sk(sk);
451 	u_other = unix_sk(other);
452 	rc = 0;
453 	spin_lock(&u_other->peer_wait.lock);
454 
455 	if (!u->peer_wake.private) {
456 		u->peer_wake.private = other;
457 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
458 
459 		rc = 1;
460 	}
461 
462 	spin_unlock(&u_other->peer_wait.lock);
463 	return rc;
464 }
465 
466 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
467 					    struct sock *other)
468 {
469 	struct unix_sock *u, *u_other;
470 
471 	u = unix_sk(sk);
472 	u_other = unix_sk(other);
473 	spin_lock(&u_other->peer_wait.lock);
474 
475 	if (u->peer_wake.private == other) {
476 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
477 		u->peer_wake.private = NULL;
478 	}
479 
480 	spin_unlock(&u_other->peer_wait.lock);
481 }
482 
483 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
484 						   struct sock *other)
485 {
486 	unix_dgram_peer_wake_disconnect(sk, other);
487 	wake_up_interruptible_poll(sk_sleep(sk),
488 				   EPOLLOUT |
489 				   EPOLLWRNORM |
490 				   EPOLLWRBAND);
491 }
492 
493 /* preconditions:
494  *	- unix_peer(sk) == other
495  *	- association is stable
496  */
497 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
498 {
499 	int connected;
500 
501 	connected = unix_dgram_peer_wake_connect(sk, other);
502 
503 	/* If other is SOCK_DEAD, we want to make sure we signal
504 	 * POLLOUT, such that a subsequent write() can get a
505 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
506 	 * to other and it's full, we will hang waiting for POLLOUT.
507 	 */
508 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
509 		return 1;
510 
511 	if (connected)
512 		unix_dgram_peer_wake_disconnect(sk, other);
513 
514 	return 0;
515 }
516 
517 static int unix_writable(const struct sock *sk)
518 {
519 	return sk->sk_state != TCP_LISTEN &&
520 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
521 }
522 
523 static void unix_write_space(struct sock *sk)
524 {
525 	struct socket_wq *wq;
526 
527 	rcu_read_lock();
528 	if (unix_writable(sk)) {
529 		wq = rcu_dereference(sk->sk_wq);
530 		if (skwq_has_sleeper(wq))
531 			wake_up_interruptible_sync_poll(&wq->wait,
532 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
533 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
534 	}
535 	rcu_read_unlock();
536 }
537 
538 /* When a dgram socket disconnects (or changes its peer), we clear its receive
539  * queue of packets that arrived from the previous peer. First, this allows
540  * flow control based only on wmem_alloc; second, an sk connected to a peer
541  * may receive messages only from that peer. */
542 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
543 {
544 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
545 		skb_queue_purge(&sk->sk_receive_queue);
546 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
547 
548 		/* If one link of a bidirectional dgram pipe is disconnected,
549 		 * we signal an error. Messages are lost. Do not do this
550 		 * when the peer was not connected to us.
551 		 */
552 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
553 			other->sk_err = ECONNRESET;
554 			sk_error_report(other);
555 		}
556 	}
557 	other->sk_state = TCP_CLOSE;
558 }
559 
560 static void unix_sock_destructor(struct sock *sk)
561 {
562 	struct unix_sock *u = unix_sk(sk);
563 
564 	skb_queue_purge(&sk->sk_receive_queue);
565 
566 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
567 	if (u->oob_skb) {
568 		kfree_skb(u->oob_skb);
569 		u->oob_skb = NULL;
570 	}
571 #endif
572 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
573 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
574 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
575 	if (!sock_flag(sk, SOCK_DEAD)) {
576 		pr_info("Attempt to release alive unix socket: %p\n", sk);
577 		return;
578 	}
579 
580 	if (u->addr)
581 		unix_release_addr(u->addr);
582 
583 	atomic_long_dec(&unix_nr_socks);
584 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
585 #ifdef UNIX_REFCNT_DEBUG
586 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
587 		atomic_long_read(&unix_nr_socks));
588 #endif
589 }
590 
591 static void unix_release_sock(struct sock *sk, int embrion)
592 {
593 	struct unix_sock *u = unix_sk(sk);
594 	struct sock *skpair;
595 	struct sk_buff *skb;
596 	struct path path;
597 	int state;
598 
599 	unix_remove_socket(sock_net(sk), sk);
600 	unix_remove_bsd_socket(sk);
601 
602 	/* Clear state */
603 	unix_state_lock(sk);
604 	sock_orphan(sk);
605 	sk->sk_shutdown = SHUTDOWN_MASK;
606 	path	     = u->path;
607 	u->path.dentry = NULL;
608 	u->path.mnt = NULL;
609 	state = sk->sk_state;
610 	sk->sk_state = TCP_CLOSE;
611 
612 	skpair = unix_peer(sk);
613 	unix_peer(sk) = NULL;
614 
615 	unix_state_unlock(sk);
616 
617 	wake_up_interruptible_all(&u->peer_wait);
618 
619 	if (skpair != NULL) {
620 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
621 			unix_state_lock(skpair);
622 			/* No more writes */
623 			skpair->sk_shutdown = SHUTDOWN_MASK;
624 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
625 				skpair->sk_err = ECONNRESET;
626 			unix_state_unlock(skpair);
627 			skpair->sk_state_change(skpair);
628 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
629 		}
630 
631 		unix_dgram_peer_wake_disconnect(sk, skpair);
632 		sock_put(skpair); /* It may now die */
633 	}
634 
635 	/* Try to flush out this socket. Throw out buffers at least */
636 
637 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
638 		if (state == TCP_LISTEN)
639 			unix_release_sock(skb->sk, 1);
640 		/* passed fds are erased in the kfree_skb hook	      */
641 		UNIXCB(skb).consumed = skb->len;
642 		kfree_skb(skb);
643 	}
644 
645 	if (path.dentry)
646 		path_put(&path);
647 
648 	sock_put(sk);
649 
650 	/* ---- Socket is dead now and most probably destroyed ---- */
651 
652 	/*
653 	 * Fixme: BSD difference: In BSD all sockets connected to us get
654 	 *	  ECONNRESET and we die on the spot. In Linux we behave
655 	 *	  like files and pipes do and wait for the last
656 	 *	  dereference.
657 	 *
658 	 * Can't we simply set sock->err?
659 	 *
660 	 *	  What does the above comment talk about? --ANK(980817)
661 	 */
662 
663 	if (unix_tot_inflight)
664 		unix_gc();		/* Garbage collect fds */
665 }
666 
667 static void init_peercred(struct sock *sk)
668 {
669 	const struct cred *old_cred;
670 	struct pid *old_pid;
671 
672 	spin_lock(&sk->sk_peer_lock);
673 	old_pid = sk->sk_peer_pid;
674 	old_cred = sk->sk_peer_cred;
675 	sk->sk_peer_pid  = get_pid(task_tgid(current));
676 	sk->sk_peer_cred = get_current_cred();
677 	spin_unlock(&sk->sk_peer_lock);
678 
679 	put_pid(old_pid);
680 	put_cred(old_cred);
681 }
682 
683 static void copy_peercred(struct sock *sk, struct sock *peersk)
684 {
685 	const struct cred *old_cred;
686 	struct pid *old_pid;
687 
688 	if (sk < peersk) {
689 		spin_lock(&sk->sk_peer_lock);
690 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
691 	} else {
692 		spin_lock(&peersk->sk_peer_lock);
693 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
694 	}
695 	old_pid = sk->sk_peer_pid;
696 	old_cred = sk->sk_peer_cred;
697 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
698 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
699 
700 	spin_unlock(&sk->sk_peer_lock);
701 	spin_unlock(&peersk->sk_peer_lock);
702 
703 	put_pid(old_pid);
704 	put_cred(old_cred);
705 }
706 
707 static int unix_listen(struct socket *sock, int backlog)
708 {
709 	int err;
710 	struct sock *sk = sock->sk;
711 	struct unix_sock *u = unix_sk(sk);
712 
713 	err = -EOPNOTSUPP;
714 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
715 		goto out;	/* Only stream/seqpacket sockets accept */
716 	err = -EINVAL;
717 	if (!u->addr)
718 		goto out;	/* No listens on an unbound socket */
719 	unix_state_lock(sk);
720 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
721 		goto out_unlock;
722 	if (backlog > sk->sk_max_ack_backlog)
723 		wake_up_interruptible_all(&u->peer_wait);
724 	sk->sk_max_ack_backlog	= backlog;
725 	sk->sk_state		= TCP_LISTEN;
726 	/* set credentials so connect can copy them */
727 	init_peercred(sk);
728 	err = 0;
729 
730 out_unlock:
731 	unix_state_unlock(sk);
732 out:
733 	return err;
734 }
735 
736 static int unix_release(struct socket *);
737 static int unix_bind(struct socket *, struct sockaddr *, int);
738 static int unix_stream_connect(struct socket *, struct sockaddr *,
739 			       int addr_len, int flags);
740 static int unix_socketpair(struct socket *, struct socket *);
741 static int unix_accept(struct socket *, struct socket *, int, bool);
742 static int unix_getname(struct socket *, struct sockaddr *, int);
743 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
744 static __poll_t unix_dgram_poll(struct file *, struct socket *,
745 				    poll_table *);
746 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
747 #ifdef CONFIG_COMPAT
748 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
749 #endif
750 static int unix_shutdown(struct socket *, int);
751 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
752 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
753 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
754 				    size_t size, int flags);
755 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
756 				       struct pipe_inode_info *, size_t size,
757 				       unsigned int flags);
758 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
759 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
760 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
761 			  sk_read_actor_t recv_actor);
762 static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
763 				 sk_read_actor_t recv_actor);
764 static int unix_dgram_connect(struct socket *, struct sockaddr *,
765 			      int, int);
766 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
767 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
768 				  int);
769 
770 static int unix_set_peek_off(struct sock *sk, int val)
771 {
772 	struct unix_sock *u = unix_sk(sk);
773 
774 	if (mutex_lock_interruptible(&u->iolock))
775 		return -EINTR;
776 
777 	sk->sk_peek_off = val;
778 	mutex_unlock(&u->iolock);
779 
780 	return 0;
781 }
782 
783 #ifdef CONFIG_PROC_FS
784 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
785 {
786 	struct sock *sk = sock->sk;
787 	struct unix_sock *u;
788 
789 	if (sk) {
790 		u = unix_sk(sock->sk);
791 		seq_printf(m, "scm_fds: %u\n",
792 			   atomic_read(&u->scm_stat.nr_fds));
793 	}
794 }
795 #else
796 #define unix_show_fdinfo NULL
797 #endif
798 
799 static const struct proto_ops unix_stream_ops = {
800 	.family =	PF_UNIX,
801 	.owner =	THIS_MODULE,
802 	.release =	unix_release,
803 	.bind =		unix_bind,
804 	.connect =	unix_stream_connect,
805 	.socketpair =	unix_socketpair,
806 	.accept =	unix_accept,
807 	.getname =	unix_getname,
808 	.poll =		unix_poll,
809 	.ioctl =	unix_ioctl,
810 #ifdef CONFIG_COMPAT
811 	.compat_ioctl =	unix_compat_ioctl,
812 #endif
813 	.listen =	unix_listen,
814 	.shutdown =	unix_shutdown,
815 	.sendmsg =	unix_stream_sendmsg,
816 	.recvmsg =	unix_stream_recvmsg,
817 	.read_sock =	unix_stream_read_sock,
818 	.mmap =		sock_no_mmap,
819 	.sendpage =	unix_stream_sendpage,
820 	.splice_read =	unix_stream_splice_read,
821 	.set_peek_off =	unix_set_peek_off,
822 	.show_fdinfo =	unix_show_fdinfo,
823 };
824 
825 static const struct proto_ops unix_dgram_ops = {
826 	.family =	PF_UNIX,
827 	.owner =	THIS_MODULE,
828 	.release =	unix_release,
829 	.bind =		unix_bind,
830 	.connect =	unix_dgram_connect,
831 	.socketpair =	unix_socketpair,
832 	.accept =	sock_no_accept,
833 	.getname =	unix_getname,
834 	.poll =		unix_dgram_poll,
835 	.ioctl =	unix_ioctl,
836 #ifdef CONFIG_COMPAT
837 	.compat_ioctl =	unix_compat_ioctl,
838 #endif
839 	.listen =	sock_no_listen,
840 	.shutdown =	unix_shutdown,
841 	.sendmsg =	unix_dgram_sendmsg,
842 	.read_sock =	unix_read_sock,
843 	.recvmsg =	unix_dgram_recvmsg,
844 	.mmap =		sock_no_mmap,
845 	.sendpage =	sock_no_sendpage,
846 	.set_peek_off =	unix_set_peek_off,
847 	.show_fdinfo =	unix_show_fdinfo,
848 };
849 
850 static const struct proto_ops unix_seqpacket_ops = {
851 	.family =	PF_UNIX,
852 	.owner =	THIS_MODULE,
853 	.release =	unix_release,
854 	.bind =		unix_bind,
855 	.connect =	unix_stream_connect,
856 	.socketpair =	unix_socketpair,
857 	.accept =	unix_accept,
858 	.getname =	unix_getname,
859 	.poll =		unix_dgram_poll,
860 	.ioctl =	unix_ioctl,
861 #ifdef CONFIG_COMPAT
862 	.compat_ioctl =	unix_compat_ioctl,
863 #endif
864 	.listen =	unix_listen,
865 	.shutdown =	unix_shutdown,
866 	.sendmsg =	unix_seqpacket_sendmsg,
867 	.recvmsg =	unix_seqpacket_recvmsg,
868 	.mmap =		sock_no_mmap,
869 	.sendpage =	sock_no_sendpage,
870 	.set_peek_off =	unix_set_peek_off,
871 	.show_fdinfo =	unix_show_fdinfo,
872 };
873 
874 static void unix_close(struct sock *sk, long timeout)
875 {
876 	/* Nothing to do here, unix socket does not need a ->close().
877 	 * This is merely for sockmap.
878 	 */
879 }
880 
881 static void unix_unhash(struct sock *sk)
882 {
883 	/* Nothing to do here, unix socket does not need a ->unhash().
884 	 * This is merely for sockmap.
885 	 */
886 }
887 
888 struct proto unix_dgram_proto = {
889 	.name			= "UNIX",
890 	.owner			= THIS_MODULE,
891 	.obj_size		= sizeof(struct unix_sock),
892 	.close			= unix_close,
893 #ifdef CONFIG_BPF_SYSCALL
894 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
895 #endif
896 };
897 
898 struct proto unix_stream_proto = {
899 	.name			= "UNIX-STREAM",
900 	.owner			= THIS_MODULE,
901 	.obj_size		= sizeof(struct unix_sock),
902 	.close			= unix_close,
903 	.unhash			= unix_unhash,
904 #ifdef CONFIG_BPF_SYSCALL
905 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
906 #endif
907 };
908 
909 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
910 {
911 	struct unix_sock *u;
912 	struct sock *sk;
913 	int err;
914 
915 	atomic_long_inc(&unix_nr_socks);
916 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
917 		err = -ENFILE;
918 		goto err;
919 	}
920 
921 	if (type == SOCK_STREAM)
922 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
923 	else /* dgram and seqpacket */
924 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
925 
926 	if (!sk) {
927 		err = -ENOMEM;
928 		goto err;
929 	}
930 
931 	sock_init_data(sock, sk);
932 
933 	sk->sk_hash		= unix_unbound_hash(sk);
934 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
935 	sk->sk_write_space	= unix_write_space;
936 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
937 	sk->sk_destruct		= unix_sock_destructor;
938 	u	  = unix_sk(sk);
939 	u->path.dentry = NULL;
940 	u->path.mnt = NULL;
941 	spin_lock_init(&u->lock);
942 	atomic_long_set(&u->inflight, 0);
943 	INIT_LIST_HEAD(&u->link);
944 	mutex_init(&u->iolock); /* single task reading lock */
945 	mutex_init(&u->bindlock); /* single task binding lock */
946 	init_waitqueue_head(&u->peer_wait);
947 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
948 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
949 	unix_insert_unbound_socket(net, sk);
950 
951 	sock_prot_inuse_add(net, sk->sk_prot, 1);
952 
953 	return sk;
954 
955 err:
956 	atomic_long_dec(&unix_nr_socks);
957 	return ERR_PTR(err);
958 }
959 
960 static int unix_create(struct net *net, struct socket *sock, int protocol,
961 		       int kern)
962 {
963 	struct sock *sk;
964 
965 	if (protocol && protocol != PF_UNIX)
966 		return -EPROTONOSUPPORT;
967 
968 	sock->state = SS_UNCONNECTED;
969 
970 	switch (sock->type) {
971 	case SOCK_STREAM:
972 		sock->ops = &unix_stream_ops;
973 		break;
974 		/*
975 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
976 		 *	nothing uses it.
977 		 */
978 	case SOCK_RAW:
979 		sock->type = SOCK_DGRAM;
980 		fallthrough;
981 	case SOCK_DGRAM:
982 		sock->ops = &unix_dgram_ops;
983 		break;
984 	case SOCK_SEQPACKET:
985 		sock->ops = &unix_seqpacket_ops;
986 		break;
987 	default:
988 		return -ESOCKTNOSUPPORT;
989 	}
990 
991 	sk = unix_create1(net, sock, kern, sock->type);
992 	if (IS_ERR(sk))
993 		return PTR_ERR(sk);
994 
995 	return 0;
996 }
997 
998 static int unix_release(struct socket *sock)
999 {
1000 	struct sock *sk = sock->sk;
1001 
1002 	if (!sk)
1003 		return 0;
1004 
1005 	sk->sk_prot->close(sk, 0);
1006 	unix_release_sock(sk, 0);
1007 	sock->sk = NULL;
1008 
1009 	return 0;
1010 }
1011 
1012 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1013 				  int type)
1014 {
1015 	struct inode *inode;
1016 	struct path path;
1017 	struct sock *sk;
1018 	int err;
1019 
1020 	unix_mkname_bsd(sunaddr, addr_len);
1021 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1022 	if (err)
1023 		goto fail;
1024 
1025 	err = path_permission(&path, MAY_WRITE);
1026 	if (err)
1027 		goto path_put;
1028 
1029 	err = -ECONNREFUSED;
1030 	inode = d_backing_inode(path.dentry);
1031 	if (!S_ISSOCK(inode->i_mode))
1032 		goto path_put;
1033 
1034 	sk = unix_find_socket_byinode(inode);
1035 	if (!sk)
1036 		goto path_put;
1037 
1038 	err = -EPROTOTYPE;
1039 	if (sk->sk_type == type)
1040 		touch_atime(&path);
1041 	else
1042 		goto sock_put;
1043 
1044 	path_put(&path);
1045 
1046 	return sk;
1047 
1048 sock_put:
1049 	sock_put(sk);
1050 path_put:
1051 	path_put(&path);
1052 fail:
1053 	return ERR_PTR(err);
1054 }
1055 
1056 static struct sock *unix_find_abstract(struct net *net,
1057 				       struct sockaddr_un *sunaddr,
1058 				       int addr_len, int type)
1059 {
1060 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1061 	struct dentry *dentry;
1062 	struct sock *sk;
1063 
1064 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1065 	if (!sk)
1066 		return ERR_PTR(-ECONNREFUSED);
1067 
1068 	dentry = unix_sk(sk)->path.dentry;
1069 	if (dentry)
1070 		touch_atime(&unix_sk(sk)->path);
1071 
1072 	return sk;
1073 }
1074 
1075 static struct sock *unix_find_other(struct net *net,
1076 				    struct sockaddr_un *sunaddr,
1077 				    int addr_len, int type)
1078 {
1079 	struct sock *sk;
1080 
1081 	if (sunaddr->sun_path[0])
1082 		sk = unix_find_bsd(sunaddr, addr_len, type);
1083 	else
1084 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1085 
1086 	return sk;
1087 }
1088 
1089 static int unix_autobind(struct sock *sk)
1090 {
1091 	unsigned int new_hash, old_hash = sk->sk_hash;
1092 	struct unix_sock *u = unix_sk(sk);
1093 	struct net *net = sock_net(sk);
1094 	struct unix_address *addr;
1095 	u32 lastnum, ordernum;
1096 	int err;
1097 
1098 	err = mutex_lock_interruptible(&u->bindlock);
1099 	if (err)
1100 		return err;
1101 
1102 	if (u->addr)
1103 		goto out;
1104 
1105 	err = -ENOMEM;
1106 	addr = kzalloc(sizeof(*addr) +
1107 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1108 	if (!addr)
1109 		goto out;
1110 
1111 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1112 	addr->name->sun_family = AF_UNIX;
1113 	refcount_set(&addr->refcnt, 1);
1114 
1115 	ordernum = prandom_u32();
1116 	lastnum = ordernum & 0xFFFFF;
1117 retry:
1118 	ordernum = (ordernum + 1) & 0xFFFFF;
1119 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1120 
1121 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1122 	unix_table_double_lock(net, old_hash, new_hash);
1123 
1124 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1125 		unix_table_double_unlock(net, old_hash, new_hash);
1126 
1127 		/* __unix_find_socket_byname() may take a long time if many names
1128 		 * are already in use.
1129 		 */
1130 		cond_resched();
1131 
1132 		if (ordernum == lastnum) {
1133 			/* Give up if all names seem to be in use. */
1134 			err = -ENOSPC;
1135 			unix_release_addr(addr);
1136 			goto out;
1137 		}
1138 
1139 		goto retry;
1140 	}
1141 
1142 	__unix_set_addr_hash(net, sk, addr, new_hash);
1143 	unix_table_double_unlock(net, old_hash, new_hash);
1144 	err = 0;
1145 
1146 out:	mutex_unlock(&u->bindlock);
1147 	return err;
1148 }
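/* A userspace sketch (illustrative only) of triggering the autobind path
 * above explicitly: passing only the address family to bind() selects
 * unix_autobind(), and getsockname() then reports a kernel-chosen abstract
 * name consisting of a leading 0 byte followed by five hex digits.
 *
 *	#include <stddef.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int show_autobound_name(int fd)
 *	{
 *		struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *		socklen_t len = sizeof(sun);
 *
 *		if (bind(fd, (struct sockaddr *)&sun,
 *			 offsetof(struct sockaddr_un, sun_path)))
 *			return -1;
 *		if (getsockname(fd, (struct sockaddr *)&sun, &len))
 *			return -1;
 *		printf("autobound, address length %u\n", (unsigned int)len);
 *		return 0;
 *	}
 */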
1149 
1150 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1151 			 int addr_len)
1152 {
1153 	umode_t mode = S_IFSOCK |
1154 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1155 	unsigned int new_hash, old_hash = sk->sk_hash;
1156 	struct unix_sock *u = unix_sk(sk);
1157 	struct net *net = sock_net(sk);
1158 	struct user_namespace *ns; // barf...
1159 	struct unix_address *addr;
1160 	struct dentry *dentry;
1161 	struct path parent;
1162 	int err;
1163 
1164 	unix_mkname_bsd(sunaddr, addr_len);
1165 	addr_len = strlen(sunaddr->sun_path) +
1166 		offsetof(struct sockaddr_un, sun_path) + 1;
1167 
1168 	addr = unix_create_addr(sunaddr, addr_len);
1169 	if (!addr)
1170 		return -ENOMEM;
1171 
1172 	/*
1173 	 * Get the parent directory, calculate the hash for last
1174 	 * component.
1175 	 */
1176 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1177 	if (IS_ERR(dentry)) {
1178 		err = PTR_ERR(dentry);
1179 		goto out;
1180 	}
1181 
1182 	/*
1183 	 * All right, let's create it.
1184 	 */
1185 	ns = mnt_user_ns(parent.mnt);
1186 	err = security_path_mknod(&parent, dentry, mode, 0);
1187 	if (!err)
1188 		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1189 	if (err)
1190 		goto out_path;
1191 	err = mutex_lock_interruptible(&u->bindlock);
1192 	if (err)
1193 		goto out_unlink;
1194 	if (u->addr)
1195 		goto out_unlock;
1196 
1197 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1198 	unix_table_double_lock(net, old_hash, new_hash);
1199 	u->path.mnt = mntget(parent.mnt);
1200 	u->path.dentry = dget(dentry);
1201 	__unix_set_addr_hash(net, sk, addr, new_hash);
1202 	unix_table_double_unlock(net, old_hash, new_hash);
1203 	unix_insert_bsd_socket(sk);
1204 	mutex_unlock(&u->bindlock);
1205 	done_path_create(&parent, dentry);
1206 	return 0;
1207 
1208 out_unlock:
1209 	mutex_unlock(&u->bindlock);
1210 	err = -EINVAL;
1211 out_unlink:
1212 	/* failed after successful mknod?  unlink what we'd created... */
1213 	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1214 out_path:
1215 	done_path_create(&parent, dentry);
1216 out:
1217 	unix_release_addr(addr);
1218 	return err == -EEXIST ? -EADDRINUSE : err;
1219 }
1220 
1221 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1222 			      int addr_len)
1223 {
1224 	unsigned int new_hash, old_hash = sk->sk_hash;
1225 	struct unix_sock *u = unix_sk(sk);
1226 	struct net *net = sock_net(sk);
1227 	struct unix_address *addr;
1228 	int err;
1229 
1230 	addr = unix_create_addr(sunaddr, addr_len);
1231 	if (!addr)
1232 		return -ENOMEM;
1233 
1234 	err = mutex_lock_interruptible(&u->bindlock);
1235 	if (err)
1236 		goto out;
1237 
1238 	if (u->addr) {
1239 		err = -EINVAL;
1240 		goto out_mutex;
1241 	}
1242 
1243 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1244 	unix_table_double_lock(net, old_hash, new_hash);
1245 
1246 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1247 		goto out_spin;
1248 
1249 	__unix_set_addr_hash(net, sk, addr, new_hash);
1250 	unix_table_double_unlock(net, old_hash, new_hash);
1251 	mutex_unlock(&u->bindlock);
1252 	return 0;
1253 
1254 out_spin:
1255 	unix_table_double_unlock(net, old_hash, new_hash);
1256 	err = -EADDRINUSE;
1257 out_mutex:
1258 	mutex_unlock(&u->bindlock);
1259 out:
1260 	unix_release_addr(addr);
1261 	return err;
1262 }
1263 
1264 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1265 {
1266 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1267 	struct sock *sk = sock->sk;
1268 	int err;
1269 
1270 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1271 	    sunaddr->sun_family == AF_UNIX)
1272 		return unix_autobind(sk);
1273 
1274 	err = unix_validate_addr(sunaddr, addr_len);
1275 	if (err)
1276 		return err;
1277 
1278 	if (sunaddr->sun_path[0])
1279 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1280 	else
1281 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1282 
1283 	return err;
1284 }
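/* A userspace sketch (illustrative only; the path is a placeholder) of the
 * filesystem branch above: bind() creates the socket inode via mknod, so a
 * path left over from a previous run makes the mknod fail with EEXIST,
 * which unix_bind_bsd() reports as -EADDRINUSE unless the file is unlinked
 * first.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <unistd.h>
 *
 *	static int bind_path(int fd, const char *path)
 *	{
 *		struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *		strncpy(sun.sun_path, path, sizeof(sun.sun_path) - 1);
 *		unlink(path);	// remove a stale socket inode, if any
 *		return bind(fd, (struct sockaddr *)&sun, sizeof(sun));
 *	}
 *
 *	// e.g. bind_path(fd, "/tmp/example.sock");
 */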
1285 
1286 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1287 {
1288 	if (unlikely(sk1 == sk2) || !sk2) {
1289 		unix_state_lock(sk1);
1290 		return;
1291 	}
1292 	if (sk1 < sk2) {
1293 		unix_state_lock(sk1);
1294 		unix_state_lock_nested(sk2);
1295 	} else {
1296 		unix_state_lock(sk2);
1297 		unix_state_lock_nested(sk1);
1298 	}
1299 }
1300 
1301 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1302 {
1303 	if (unlikely(sk1 == sk2) || !sk2) {
1304 		unix_state_unlock(sk1);
1305 		return;
1306 	}
1307 	unix_state_unlock(sk1);
1308 	unix_state_unlock(sk2);
1309 }
1310 
1311 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1312 			      int alen, int flags)
1313 {
1314 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1315 	struct sock *sk = sock->sk;
1316 	struct sock *other;
1317 	int err;
1318 
1319 	err = -EINVAL;
1320 	if (alen < offsetofend(struct sockaddr, sa_family))
1321 		goto out;
1322 
1323 	if (addr->sa_family != AF_UNSPEC) {
1324 		err = unix_validate_addr(sunaddr, alen);
1325 		if (err)
1326 			goto out;
1327 
1328 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1329 		    !unix_sk(sk)->addr) {
1330 			err = unix_autobind(sk);
1331 			if (err)
1332 				goto out;
1333 		}
1334 
1335 restart:
1336 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1337 		if (IS_ERR(other)) {
1338 			err = PTR_ERR(other);
1339 			goto out;
1340 		}
1341 
1342 		unix_state_double_lock(sk, other);
1343 
1344 		/* Apparently VFS overslept socket death. Retry. */
1345 		if (sock_flag(other, SOCK_DEAD)) {
1346 			unix_state_double_unlock(sk, other);
1347 			sock_put(other);
1348 			goto restart;
1349 		}
1350 
1351 		err = -EPERM;
1352 		if (!unix_may_send(sk, other))
1353 			goto out_unlock;
1354 
1355 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1356 		if (err)
1357 			goto out_unlock;
1358 
1359 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1360 	} else {
1361 		/*
1362 		 *	1003.1g breaking connected state with AF_UNSPEC
1363 		 */
1364 		other = NULL;
1365 		unix_state_double_lock(sk, other);
1366 	}
1367 
1368 	/*
1369 	 * If it was connected, reconnect.
1370 	 */
1371 	if (unix_peer(sk)) {
1372 		struct sock *old_peer = unix_peer(sk);
1373 
1374 		unix_peer(sk) = other;
1375 		if (!other)
1376 			sk->sk_state = TCP_CLOSE;
1377 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1378 
1379 		unix_state_double_unlock(sk, other);
1380 
1381 		if (other != old_peer)
1382 			unix_dgram_disconnected(sk, old_peer);
1383 		sock_put(old_peer);
1384 	} else {
1385 		unix_peer(sk) = other;
1386 		unix_state_double_unlock(sk, other);
1387 	}
1388 
1389 	return 0;
1390 
1391 out_unlock:
1392 	unix_state_double_unlock(sk, other);
1393 	sock_put(other);
1394 out:
1395 	return err;
1396 }
1397 
1398 static long unix_wait_for_peer(struct sock *other, long timeo)
1399 	__releases(&unix_sk(other)->lock)
1400 {
1401 	struct unix_sock *u = unix_sk(other);
1402 	int sched;
1403 	DEFINE_WAIT(wait);
1404 
1405 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1406 
1407 	sched = !sock_flag(other, SOCK_DEAD) &&
1408 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1409 		unix_recvq_full(other);
1410 
1411 	unix_state_unlock(other);
1412 
1413 	if (sched)
1414 		timeo = schedule_timeout(timeo);
1415 
1416 	finish_wait(&u->peer_wait, &wait);
1417 	return timeo;
1418 }
1419 
1420 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1421 			       int addr_len, int flags)
1422 {
1423 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1424 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1425 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1426 	struct net *net = sock_net(sk);
1427 	struct sk_buff *skb = NULL;
1428 	long timeo;
1429 	int err;
1430 	int st;
1431 
1432 	err = unix_validate_addr(sunaddr, addr_len);
1433 	if (err)
1434 		goto out;
1435 
1436 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1437 		err = unix_autobind(sk);
1438 		if (err)
1439 			goto out;
1440 	}
1441 
1442 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1443 
1444 	/* First of all allocate resources.
1445 	   If we did it after the state was locked,
1446 	   we would have to recheck everything again in any case.
1447 	 */
1448 
1449 	/* create new sock for complete connection */
1450 	newsk = unix_create1(net, NULL, 0, sock->type);
1451 	if (IS_ERR(newsk)) {
1452 		err = PTR_ERR(newsk);
1453 		newsk = NULL;
1454 		goto out;
1455 	}
1456 
1457 	err = -ENOMEM;
1458 
1459 	/* Allocate skb for sending to listening sock */
1460 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1461 	if (skb == NULL)
1462 		goto out;
1463 
1464 restart:
1465 	/*  Find listening sock. */
1466 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1467 	if (IS_ERR(other)) {
1468 		err = PTR_ERR(other);
1469 		other = NULL;
1470 		goto out;
1471 	}
1472 
1473 	/* Latch state of peer */
1474 	unix_state_lock(other);
1475 
1476 	/* Apparently VFS overslept socket death. Retry. */
1477 	if (sock_flag(other, SOCK_DEAD)) {
1478 		unix_state_unlock(other);
1479 		sock_put(other);
1480 		goto restart;
1481 	}
1482 
1483 	err = -ECONNREFUSED;
1484 	if (other->sk_state != TCP_LISTEN)
1485 		goto out_unlock;
1486 	if (other->sk_shutdown & RCV_SHUTDOWN)
1487 		goto out_unlock;
1488 
1489 	if (unix_recvq_full(other)) {
1490 		err = -EAGAIN;
1491 		if (!timeo)
1492 			goto out_unlock;
1493 
1494 		timeo = unix_wait_for_peer(other, timeo);
1495 
1496 		err = sock_intr_errno(timeo);
1497 		if (signal_pending(current))
1498 			goto out;
1499 		sock_put(other);
1500 		goto restart;
1501 	}
1502 
1503 	/* Latch our state.
1504 
1505 	   It is a tricky place. We need to grab our state lock and cannot
1506 	   drop the lock on the peer. It is dangerous because a deadlock is
1507 	   possible. The connect-to-self case and simultaneous
1508 	   connect attempts are eliminated by checking the socket
1509 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
1510 	   check this before attempting to grab the lock.
1511 
1512 	   Well, and we have to recheck the state after the socket is locked.
1513 	 */
1514 	st = sk->sk_state;
1515 
1516 	switch (st) {
1517 	case TCP_CLOSE:
1518 		/* This is ok... continue with connect */
1519 		break;
1520 	case TCP_ESTABLISHED:
1521 		/* Socket is already connected */
1522 		err = -EISCONN;
1523 		goto out_unlock;
1524 	default:
1525 		err = -EINVAL;
1526 		goto out_unlock;
1527 	}
1528 
1529 	unix_state_lock_nested(sk);
1530 
1531 	if (sk->sk_state != st) {
1532 		unix_state_unlock(sk);
1533 		unix_state_unlock(other);
1534 		sock_put(other);
1535 		goto restart;
1536 	}
1537 
1538 	err = security_unix_stream_connect(sk, other, newsk);
1539 	if (err) {
1540 		unix_state_unlock(sk);
1541 		goto out_unlock;
1542 	}
1543 
1544 	/* The way is open! Quickly set all the necessary fields... */
1545 
1546 	sock_hold(sk);
1547 	unix_peer(newsk)	= sk;
1548 	newsk->sk_state		= TCP_ESTABLISHED;
1549 	newsk->sk_type		= sk->sk_type;
1550 	init_peercred(newsk);
1551 	newu = unix_sk(newsk);
1552 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1553 	otheru = unix_sk(other);
1554 
1555 	/* copy address information from listening to new sock
1556 	 *
1557 	 * The contents of *(otheru->addr) and otheru->path
1558 	 * are seen fully set up here, since we have found
1559 	 * otheru in hash under its lock.  Insertion into the
1560 	 * hash chain we'd found it in had been done in an
1561 	 * earlier critical area protected by the chain's lock,
1562 	 * the same one where we'd set *(otheru->addr) contents,
1563 	 * as well as otheru->path and otheru->addr itself.
1564 	 *
1565 	 * Using smp_store_release() here to set newu->addr
1566 	 * is enough to make those stores, as well as stores
1567 	 * to newu->path visible to anyone who gets newu->addr
1568 	 * by smp_load_acquire().  IOW, the same guarantees
1569 	 * as for unix_sock instances bound in unix_bind() or
1570 	 * in unix_autobind().
1571 	 */
1572 	if (otheru->path.dentry) {
1573 		path_get(&otheru->path);
1574 		newu->path = otheru->path;
1575 	}
1576 	refcount_inc(&otheru->addr->refcnt);
1577 	smp_store_release(&newu->addr, otheru->addr);
1578 
1579 	/* Set credentials */
1580 	copy_peercred(sk, other);
1581 
1582 	sock->state	= SS_CONNECTED;
1583 	sk->sk_state	= TCP_ESTABLISHED;
1584 	sock_hold(newsk);
1585 
1586 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1587 	unix_peer(sk)	= newsk;
1588 
1589 	unix_state_unlock(sk);
1590 
1591 	/* take ten and send info to listening sock */
1592 	spin_lock(&other->sk_receive_queue.lock);
1593 	__skb_queue_tail(&other->sk_receive_queue, skb);
1594 	spin_unlock(&other->sk_receive_queue.lock);
1595 	unix_state_unlock(other);
1596 	other->sk_data_ready(other);
1597 	sock_put(other);
1598 	return 0;
1599 
1600 out_unlock:
1601 	if (other)
1602 		unix_state_unlock(other);
1603 
1604 out:
1605 	kfree_skb(skb);
1606 	if (newsk)
1607 		unix_release_sock(newsk, 0);
1608 	if (other)
1609 		sock_put(other);
1610 	return err;
1611 }
1612 
1613 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1614 {
1615 	struct sock *ska = socka->sk, *skb = sockb->sk;
1616 
1617 	/* Join our sockets back to back */
1618 	sock_hold(ska);
1619 	sock_hold(skb);
1620 	unix_peer(ska) = skb;
1621 	unix_peer(skb) = ska;
1622 	init_peercred(ska);
1623 	init_peercred(skb);
1624 
1625 	ska->sk_state = TCP_ESTABLISHED;
1626 	skb->sk_state = TCP_ESTABLISHED;
1627 	socka->state  = SS_CONNECTED;
1628 	sockb->state  = SS_CONNECTED;
1629 	return 0;
1630 }
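/* A userspace sketch (illustrative only) of what the init_peercred() calls
 * above make visible: either end of a socketpair() can read the other
 * end's credentials with SO_PEERCRED.
 *
 *	#define _GNU_SOURCE		// for struct ucred
 *	#include <sys/socket.h>
 *
 *	static int peer_pid(int fd)
 *	{
 *		struct ucred cred;
 *		socklen_t len = sizeof(cred);
 *
 *		if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len))
 *			return -1;
 *		return cred.pid;
 *	}
 *
 *	// e.g.: int sv[2]; socketpair(AF_UNIX, SOCK_STREAM, 0, sv); peer_pid(sv[0]);
 */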
1631 
1632 static void unix_sock_inherit_flags(const struct socket *old,
1633 				    struct socket *new)
1634 {
1635 	if (test_bit(SOCK_PASSCRED, &old->flags))
1636 		set_bit(SOCK_PASSCRED, &new->flags);
1637 	if (test_bit(SOCK_PASSSEC, &old->flags))
1638 		set_bit(SOCK_PASSSEC, &new->flags);
1639 }
1640 
1641 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1642 		       bool kern)
1643 {
1644 	struct sock *sk = sock->sk;
1645 	struct sock *tsk;
1646 	struct sk_buff *skb;
1647 	int err;
1648 
1649 	err = -EOPNOTSUPP;
1650 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1651 		goto out;
1652 
1653 	err = -EINVAL;
1654 	if (sk->sk_state != TCP_LISTEN)
1655 		goto out;
1656 
1657 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1658 	 * so that no locks are necessary.
1659 	 */
1660 
1661 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1662 				&err);
1663 	if (!skb) {
1664 		/* This means receive shutdown. */
1665 		if (err == 0)
1666 			err = -EINVAL;
1667 		goto out;
1668 	}
1669 
1670 	tsk = skb->sk;
1671 	skb_free_datagram(sk, skb);
1672 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1673 
1674 	/* attach accepted sock to socket */
1675 	unix_state_lock(tsk);
1676 	newsock->state = SS_CONNECTED;
1677 	unix_sock_inherit_flags(sock, newsock);
1678 	sock_graft(tsk, newsock);
1679 	unix_state_unlock(tsk);
1680 	return 0;
1681 
1682 out:
1683 	return err;
1684 }
1685 
1686 
1687 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1688 {
1689 	struct sock *sk = sock->sk;
1690 	struct unix_address *addr;
1691 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1692 	int err = 0;
1693 
1694 	if (peer) {
1695 		sk = unix_peer_get(sk);
1696 
1697 		err = -ENOTCONN;
1698 		if (!sk)
1699 			goto out;
1700 		err = 0;
1701 	} else {
1702 		sock_hold(sk);
1703 	}
1704 
1705 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1706 	if (!addr) {
1707 		sunaddr->sun_family = AF_UNIX;
1708 		sunaddr->sun_path[0] = 0;
1709 		err = offsetof(struct sockaddr_un, sun_path);
1710 	} else {
1711 		err = addr->len;
1712 		memcpy(sunaddr, addr->name, addr->len);
1713 	}
1714 	sock_put(sk);
1715 out:
1716 	return err;
1717 }
1718 
1719 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1720 {
1721 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1722 
1723 	/*
1724 	 * Garbage collection of unix sockets starts by selecting a set of
1725 	 * candidate sockets which have reference only from being in flight
1726 	 * (total_refs == inflight_refs).  This condition is checked once during
1727 	 * the candidate collection phase, and candidates are marked as such, so
1728 	 * that non-candidates can later be ignored.  While inflight_refs is
1729 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1730 	 * is an instantaneous decision.
1731 	 *
1732 	 * Once a candidate, however, the socket must not be reinstalled into a
1733 	 * file descriptor while the garbage collection is in progress.
1734 	 *
1735 	 * If the above conditions are met, then the directed graph of
1736 	 * candidates (*) does not change while unix_gc_lock is held.
1737 	 *
1738 	 * Any operation that changes the file count through file descriptors
1739 	 * (dup, close, sendmsg) does not change the graph since candidates are
1740 	 * not installed in fds.
1741 	 *
1742 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1743 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1744 	 * serialized with garbage collection.
1745 	 *
1746 	 * MSG_PEEK is special in that it does not change the inflight count,
1747 	 * yet does install the socket into an fd.  The following lock/unlock
1748 	 * pair is to ensure serialization with garbage collection.  It must be
1749 	 * done between incrementing the file count and installing the file into
1750 	 * an fd.
1751 	 *
1752 	 * If garbage collection starts after the barrier provided by the
1753 	 * lock/unlock, then it will see the elevated refcount and not mark this
1754 	 * as a candidate.  If a garbage collection is already in progress
1755 	 * before the file count was incremented, then the lock/unlock pair will
1756 	 * ensure that garbage collection is finished before progressing to
1757 	 * installing the fd.
1758 	 *
1759 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1760 	 * which is on the queue of listening socket A.
1761 	 */
1762 	spin_lock(&unix_gc_lock);
1763 	spin_unlock(&unix_gc_lock);
1764 }
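/* A userspace sketch (illustrative only) of the MSG_PEEK case discussed
 * above: the peeked copy installs any passed fds into the receiving
 * process without dequeuing the message, which is why unix_peek_fds()
 * must serialize with the garbage collector.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	static int peek_first_fd(int sock)
 *	{
 *		char data[1], cbuf[CMSG_SPACE(sizeof(int))];
 *		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg;
 *		int fd = -1;
 *
 *		if (recvmsg(sock, &msg, MSG_PEEK) < 0)
 *			return -1;
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			if (cmsg->cmsg_level == SOL_SOCKET &&
 *			    cmsg->cmsg_type == SCM_RIGHTS)
 *				memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
 *		return fd;	// the message itself stays queued
 *	}
 */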
1765 
1766 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1767 {
1768 	int err = 0;
1769 
1770 	UNIXCB(skb).pid  = get_pid(scm->pid);
1771 	UNIXCB(skb).uid = scm->creds.uid;
1772 	UNIXCB(skb).gid = scm->creds.gid;
1773 	UNIXCB(skb).fp = NULL;
1774 	unix_get_secdata(scm, skb);
1775 	if (scm->fp && send_fds)
1776 		err = unix_attach_fds(scm, skb);
1777 
1778 	skb->destructor = unix_destruct_scm;
1779 	return err;
1780 }
1781 
1782 static bool unix_passcred_enabled(const struct socket *sock,
1783 				  const struct sock *other)
1784 {
1785 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1786 	       !other->sk_socket ||
1787 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1788 }
1789 
1790 /*
1791  * Some apps rely on write() giving SCM_CREDENTIALS.
1792  * We include credentials if source or destination socket
1793  * asserted SOCK_PASSCRED.
1794  */
1795 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1796 			    const struct sock *other)
1797 {
1798 	if (UNIXCB(skb).pid)
1799 		return;
1800 	if (unix_passcred_enabled(sock, other)) {
1801 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1802 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1803 	}
1804 }
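/* A userspace sketch (illustrative only) of the receiving side these
 * credentials are for: with SO_PASSCRED enabled, each received message
 * carries an SCM_CREDENTIALS control message holding a struct ucred.
 *
 *	#define _GNU_SOURCE		// for struct ucred
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *
 *	// Enable credential passing, ideally right after creating the socket.
 *	static int enable_passcred(int sock)
 *	{
 *		int on = 1;
 *
 *		return setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	}
 *
 *	// Receive one message and copy out its SCM_CREDENTIALS, if present.
 *	static int recv_creds(int sock, struct ucred *out)
 *	{
 *		char data[128], cbuf[CMSG_SPACE(sizeof(struct ucred))];
 *		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg;
 *
 *		if (recvmsg(sock, &msg, 0) < 0)
 *			return -1;
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			if (cmsg->cmsg_level == SOL_SOCKET &&
 *			    cmsg->cmsg_type == SCM_CREDENTIALS)
 *				memcpy(out, CMSG_DATA(cmsg), sizeof(*out));
 *		return 0;
 *	}
 */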
1805 
1806 static int maybe_init_creds(struct scm_cookie *scm,
1807 			    struct socket *socket,
1808 			    const struct sock *other)
1809 {
1810 	int err;
1811 	struct msghdr msg = { .msg_controllen = 0 };
1812 
1813 	err = scm_send(socket, &msg, scm, false);
1814 	if (err)
1815 		return err;
1816 
1817 	if (unix_passcred_enabled(socket, other)) {
1818 		scm->pid = get_pid(task_tgid(current));
1819 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1820 	}
1821 	return err;
1822 }
1823 
1824 static bool unix_skb_scm_eq(struct sk_buff *skb,
1825 			    struct scm_cookie *scm)
1826 {
1827 	return UNIXCB(skb).pid == scm->pid &&
1828 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1829 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1830 	       unix_secdata_eq(scm, skb);
1831 }
1832 
1833 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1834 {
1835 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1836 	struct unix_sock *u = unix_sk(sk);
1837 
1838 	if (unlikely(fp && fp->count))
1839 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1840 }
1841 
1842 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1843 {
1844 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1845 	struct unix_sock *u = unix_sk(sk);
1846 
1847 	if (unlikely(fp && fp->count))
1848 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1849 }
1850 
1851 /*
1852  *	Send AF_UNIX data.
1853  */
1854 
1855 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1856 			      size_t len)
1857 {
1858 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1859 	struct sock *sk = sock->sk, *other = NULL;
1860 	struct unix_sock *u = unix_sk(sk);
1861 	struct scm_cookie scm;
1862 	struct sk_buff *skb;
1863 	int data_len = 0;
1864 	int sk_locked;
1865 	long timeo;
1866 	int err;
1867 
1868 	wait_for_unix_gc();
1869 	err = scm_send(sock, msg, &scm, false);
1870 	if (err < 0)
1871 		return err;
1872 
1873 	err = -EOPNOTSUPP;
1874 	if (msg->msg_flags&MSG_OOB)
1875 		goto out;
1876 
1877 	if (msg->msg_namelen) {
1878 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1879 		if (err)
1880 			goto out;
1881 	} else {
1882 		sunaddr = NULL;
1883 		err = -ENOTCONN;
1884 		other = unix_peer_get(sk);
1885 		if (!other)
1886 			goto out;
1887 	}
1888 
1889 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1890 		err = unix_autobind(sk);
1891 		if (err)
1892 			goto out;
1893 	}
1894 
1895 	err = -EMSGSIZE;
1896 	if (len > sk->sk_sndbuf - 32)
1897 		goto out;
1898 
1899 	if (len > SKB_MAX_ALLOC) {
1900 		data_len = min_t(size_t,
1901 				 len - SKB_MAX_ALLOC,
1902 				 MAX_SKB_FRAGS * PAGE_SIZE);
1903 		data_len = PAGE_ALIGN(data_len);
1904 
1905 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1906 	}
1907 
1908 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1909 				   msg->msg_flags & MSG_DONTWAIT, &err,
1910 				   PAGE_ALLOC_COSTLY_ORDER);
1911 	if (skb == NULL)
1912 		goto out;
1913 
1914 	err = unix_scm_to_skb(&scm, skb, true);
1915 	if (err < 0)
1916 		goto out_free;
1917 
1918 	skb_put(skb, len - data_len);
1919 	skb->data_len = data_len;
1920 	skb->len = len;
1921 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1922 	if (err)
1923 		goto out_free;
1924 
1925 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1926 
1927 restart:
1928 	if (!other) {
1929 		err = -ECONNRESET;
1930 		if (sunaddr == NULL)
1931 			goto out_free;
1932 
1933 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1934 					sk->sk_type);
1935 		if (IS_ERR(other)) {
1936 			err = PTR_ERR(other);
1937 			other = NULL;
1938 			goto out_free;
1939 		}
1940 	}
1941 
1942 	if (sk_filter(other, skb) < 0) {
1943 		/* Toss the packet but do not return any error to the sender */
1944 		err = len;
1945 		goto out_free;
1946 	}
1947 
1948 	sk_locked = 0;
1949 	unix_state_lock(other);
1950 restart_locked:
1951 	err = -EPERM;
1952 	if (!unix_may_send(sk, other))
1953 		goto out_unlock;
1954 
1955 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1956 		/*
1957 		 *	Check with 1003.1g - what should
1958 		 *	a datagram error return here?
1959 		 */
1960 		unix_state_unlock(other);
1961 		sock_put(other);
1962 
1963 		if (!sk_locked)
1964 			unix_state_lock(sk);
1965 
1966 		err = 0;
1967 		if (unix_peer(sk) == other) {
1968 			unix_peer(sk) = NULL;
1969 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1970 
1971 			unix_state_unlock(sk);
1972 
1973 			sk->sk_state = TCP_CLOSE;
1974 			unix_dgram_disconnected(sk, other);
1975 			sock_put(other);
1976 			err = -ECONNREFUSED;
1977 		} else {
1978 			unix_state_unlock(sk);
1979 		}
1980 
1981 		other = NULL;
1982 		if (err)
1983 			goto out_free;
1984 		goto restart;
1985 	}
1986 
1987 	err = -EPIPE;
1988 	if (other->sk_shutdown & RCV_SHUTDOWN)
1989 		goto out_unlock;
1990 
1991 	if (sk->sk_type != SOCK_SEQPACKET) {
1992 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1993 		if (err)
1994 			goto out_unlock;
1995 	}
1996 
1997 	/* other == sk && unix_peer(other) != sk if
1998 	 * - unix_peer(sk) == NULL, destination address bound to sk
1999 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2000 	 */
2001 	if (other != sk &&
2002 	    unlikely(unix_peer(other) != sk &&
2003 	    unix_recvq_full_lockless(other))) {
2004 		if (timeo) {
2005 			timeo = unix_wait_for_peer(other, timeo);
2006 
2007 			err = sock_intr_errno(timeo);
2008 			if (signal_pending(current))
2009 				goto out_free;
2010 
2011 			goto restart;
2012 		}
2013 
2014 		if (!sk_locked) {
2015 			unix_state_unlock(other);
2016 			unix_state_double_lock(sk, other);
2017 		}
2018 
2019 		if (unix_peer(sk) != other ||
2020 		    unix_dgram_peer_wake_me(sk, other)) {
2021 			err = -EAGAIN;
2022 			sk_locked = 1;
2023 			goto out_unlock;
2024 		}
2025 
2026 		if (!sk_locked) {
2027 			sk_locked = 1;
2028 			goto restart_locked;
2029 		}
2030 	}
2031 
2032 	if (unlikely(sk_locked))
2033 		unix_state_unlock(sk);
2034 
2035 	if (sock_flag(other, SOCK_RCVTSTAMP))
2036 		__net_timestamp(skb);
2037 	maybe_add_creds(skb, sock, other);
2038 	scm_stat_add(other, skb);
2039 	skb_queue_tail(&other->sk_receive_queue, skb);
2040 	unix_state_unlock(other);
2041 	other->sk_data_ready(other);
2042 	sock_put(other);
2043 	scm_destroy(&scm);
2044 	return len;
2045 
2046 out_unlock:
2047 	if (sk_locked)
2048 		unix_state_unlock(sk);
2049 	unix_state_unlock(other);
2050 out_free:
2051 	kfree_skb(skb);
2052 out:
2053 	if (other)
2054 		sock_put(other);
2055 	scm_destroy(&scm);
2056 	return err;
2057 }
2058 
2059 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2060  * bytes, with a minimum of one full page.
2061  */
2062 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2063 
2064 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
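/* Queue a single out-of-band byte on the peer and record it in
 * ousk->oob_skb so that MSG_OOB reads and the SIOCATMARK ioctl can find
 * it; the peer is signalled with SIGURG.
 */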
2065 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
2066 {
2067 	struct unix_sock *ousk = unix_sk(other);
2068 	struct sk_buff *skb;
2069 	int err = 0;
2070 
2071 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2072 
2073 	if (!skb)
2074 		return err;
2075 
2076 	skb_put(skb, 1);
2077 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2078 
2079 	if (err) {
2080 		kfree_skb(skb);
2081 		return err;
2082 	}
2083 
2084 	unix_state_lock(other);
2085 
2086 	if (sock_flag(other, SOCK_DEAD) ||
2087 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2088 		unix_state_unlock(other);
2089 		kfree_skb(skb);
2090 		return -EPIPE;
2091 	}
2092 
2093 	maybe_add_creds(skb, sock, other);
2094 	skb_get(skb);
2095 
2096 	if (ousk->oob_skb)
2097 		consume_skb(ousk->oob_skb);
2098 
2099 	WRITE_ONCE(ousk->oob_skb, skb);
2100 
2101 	scm_stat_add(other, skb);
2102 	skb_queue_tail(&other->sk_receive_queue, skb);
2103 	sk_send_sigurg(other);
2104 	unix_state_unlock(other);
2105 	other->sk_data_ready(other);
2106 
2107 	return err;
2108 }
2109 #endif
2110 
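/* Stream sendmsg: split the payload into multiple paged skbs, attach
 * credentials and any fds to the first skb only, and queue each skb on
 * the peer's receive queue.  When CONFIG_AF_UNIX_OOB is enabled and
 * MSG_OOB is set, the final byte is queued separately via queue_oob().
 */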
2111 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2112 			       size_t len)
2113 {
2114 	struct sock *sk = sock->sk;
2115 	struct sock *other = NULL;
2116 	int err, size;
2117 	struct sk_buff *skb;
2118 	int sent = 0;
2119 	struct scm_cookie scm;
2120 	bool fds_sent = false;
2121 	int data_len;
2122 
2123 	wait_for_unix_gc();
2124 	err = scm_send(sock, msg, &scm, false);
2125 	if (err < 0)
2126 		return err;
2127 
2128 	err = -EOPNOTSUPP;
2129 	if (msg->msg_flags & MSG_OOB) {
2130 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2131 		if (len)
2132 			len--;
2133 		else
2134 #endif
2135 			goto out_err;
2136 	}
2137 
2138 	if (msg->msg_namelen) {
2139 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2140 		goto out_err;
2141 	} else {
2142 		err = -ENOTCONN;
2143 		other = unix_peer(sk);
2144 		if (!other)
2145 			goto out_err;
2146 	}
2147 
2148 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2149 		goto pipe_err;
2150 
2151 	while (sent < len) {
2152 		size = len - sent;
2153 
2154 		/* Keep two messages in the pipe so it schedules better */
2155 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2156 
2157 		/* allow fallback to order-0 allocations */
2158 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2159 
2160 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2161 
2162 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2163 
2164 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2165 					   msg->msg_flags & MSG_DONTWAIT, &err,
2166 					   get_order(UNIX_SKB_FRAGS_SZ));
2167 		if (!skb)
2168 			goto out_err;
2169 
2170 		/* Only send the fds in the first buffer */
2171 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2172 		if (err < 0) {
2173 			kfree_skb(skb);
2174 			goto out_err;
2175 		}
2176 		fds_sent = true;
2177 
2178 		skb_put(skb, size - data_len);
2179 		skb->data_len = data_len;
2180 		skb->len = size;
2181 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2182 		if (err) {
2183 			kfree_skb(skb);
2184 			goto out_err;
2185 		}
2186 
2187 		unix_state_lock(other);
2188 
2189 		if (sock_flag(other, SOCK_DEAD) ||
2190 		    (other->sk_shutdown & RCV_SHUTDOWN))
2191 			goto pipe_err_free;
2192 
2193 		maybe_add_creds(skb, sock, other);
2194 		scm_stat_add(other, skb);
2195 		skb_queue_tail(&other->sk_receive_queue, skb);
2196 		unix_state_unlock(other);
2197 		other->sk_data_ready(other);
2198 		sent += size;
2199 	}
2200 
2201 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2202 	if (msg->msg_flags & MSG_OOB) {
2203 		err = queue_oob(sock, msg, other);
2204 		if (err)
2205 			goto out_err;
2206 		sent++;
2207 	}
2208 #endif
2209 
2210 	scm_destroy(&scm);
2211 
2212 	return sent;
2213 
2214 pipe_err_free:
2215 	unix_state_unlock(other);
2216 	kfree_skb(skb);
2217 pipe_err:
2218 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2219 		send_sig(SIGPIPE, current, 0);
2220 	err = -EPIPE;
2221 out_err:
2222 	scm_destroy(&scm);
2223 	return sent ? : err;
2224 }
2225 
2226 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2227 				    int offset, size_t size, int flags)
2228 {
2229 	int err;
2230 	bool send_sigpipe = false;
2231 	bool init_scm = true;
2232 	struct scm_cookie scm;
2233 	struct sock *other, *sk = socket->sk;
2234 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2235 
2236 	if (flags & MSG_OOB)
2237 		return -EOPNOTSUPP;
2238 
2239 	other = unix_peer(sk);
2240 	if (!other || sk->sk_state != TCP_ESTABLISHED)
2241 		return -ENOTCONN;
2242 
2243 	if (false) {
2244 alloc_skb:
2245 		unix_state_unlock(other);
2246 		mutex_unlock(&unix_sk(other)->iolock);
2247 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2248 					      &err, 0);
2249 		if (!newskb)
2250 			goto err;
2251 	}
2252 
2253 	/* we must acquire iolock as we modify skbs already present
2254 	 * in the sk_receive_queue and mess with skb->len
2255 	 */
2256 	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2257 	if (err) {
2258 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2259 		goto err;
2260 	}
2261 
2262 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
2263 		err = -EPIPE;
2264 		send_sigpipe = true;
2265 		goto err_unlock;
2266 	}
2267 
2268 	unix_state_lock(other);
2269 
2270 	if (sock_flag(other, SOCK_DEAD) ||
2271 	    other->sk_shutdown & RCV_SHUTDOWN) {
2272 		err = -EPIPE;
2273 		send_sigpipe = true;
2274 		goto err_state_unlock;
2275 	}
2276 
2277 	if (init_scm) {
2278 		err = maybe_init_creds(&scm, socket, other);
2279 		if (err)
2280 			goto err_state_unlock;
2281 		init_scm = false;
2282 	}
2283 
2284 	skb = skb_peek_tail(&other->sk_receive_queue);
2285 	if (tail && tail == skb) {
2286 		skb = newskb;
2287 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2288 		if (newskb) {
2289 			skb = newskb;
2290 		} else {
2291 			tail = skb;
2292 			goto alloc_skb;
2293 		}
2294 	} else if (newskb) {
2295 		/* this is the fast path; we don't necessarily need to
2296 		 * call consume_skb here - even with newskb == NULL
2297 		 * it would do no harm
2298 		 */
2299 		consume_skb(newskb);
2300 		newskb = NULL;
2301 	}
2302 
2303 	if (skb_append_pagefrags(skb, page, offset, size)) {
2304 		tail = skb;
2305 		goto alloc_skb;
2306 	}
2307 
2308 	skb->len += size;
2309 	skb->data_len += size;
2310 	skb->truesize += size;
2311 	refcount_add(size, &sk->sk_wmem_alloc);
2312 
2313 	if (newskb) {
2314 		err = unix_scm_to_skb(&scm, skb, false);
2315 		if (err)
2316 			goto err_state_unlock;
2317 		spin_lock(&other->sk_receive_queue.lock);
2318 		__skb_queue_tail(&other->sk_receive_queue, newskb);
2319 		spin_unlock(&other->sk_receive_queue.lock);
2320 	}
2321 
2322 	unix_state_unlock(other);
2323 	mutex_unlock(&unix_sk(other)->iolock);
2324 
2325 	other->sk_data_ready(other);
2326 	scm_destroy(&scm);
2327 	return size;
2328 
2329 err_state_unlock:
2330 	unix_state_unlock(other);
2331 err_unlock:
2332 	mutex_unlock(&unix_sk(other)->iolock);
2333 err:
2334 	kfree_skb(newskb);
2335 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2336 		send_sig(SIGPIPE, current, 0);
2337 	if (!init_scm)
2338 		scm_destroy(&scm);
2339 	return err;
2340 }
2341 
2342 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2343 				  size_t len)
2344 {
2345 	int err;
2346 	struct sock *sk = sock->sk;
2347 
2348 	err = sock_error(sk);
2349 	if (err)
2350 		return err;
2351 
2352 	if (sk->sk_state != TCP_ESTABLISHED)
2353 		return -ENOTCONN;
2354 
2355 	if (msg->msg_namelen)
2356 		msg->msg_namelen = 0;
2357 
2358 	return unix_dgram_sendmsg(sock, msg, len);
2359 }
2360 
2361 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2362 				  size_t size, int flags)
2363 {
2364 	struct sock *sk = sock->sk;
2365 
2366 	if (sk->sk_state != TCP_ESTABLISHED)
2367 		return -ENOTCONN;
2368 
2369 	return unix_dgram_recvmsg(sock, msg, size, flags);
2370 }
2371 
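/* Report the sender's bound address, if any, in msg->msg_name and
 * msg->msg_namelen.
 */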
2372 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2373 {
2374 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2375 
2376 	if (addr) {
2377 		msg->msg_namelen = addr->len;
2378 		memcpy(msg->msg_name, addr->name, addr->len);
2379 	}
2380 }
2381 
2382 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2383 			 int flags)
2384 {
2385 	struct scm_cookie scm;
2386 	struct socket *sock = sk->sk_socket;
2387 	struct unix_sock *u = unix_sk(sk);
2388 	struct sk_buff *skb, *last;
2389 	long timeo;
2390 	int skip;
2391 	int err;
2392 
2393 	err = -EOPNOTSUPP;
2394 	if (flags&MSG_OOB)
2395 		goto out;
2396 
2397 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2398 
2399 	do {
2400 		mutex_lock(&u->iolock);
2401 
2402 		skip = sk_peek_offset(sk, flags);
2403 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2404 					      &skip, &err, &last);
2405 		if (skb) {
2406 			if (!(flags & MSG_PEEK))
2407 				scm_stat_del(sk, skb);
2408 			break;
2409 		}
2410 
2411 		mutex_unlock(&u->iolock);
2412 
2413 		if (err != -EAGAIN)
2414 			break;
2415 	} while (timeo &&
2416 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2417 					      &err, &timeo, last));
2418 
2419 	if (!skb) { /* implies iolock unlocked */
2420 		unix_state_lock(sk);
2421 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2422 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2423 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2424 			err = 0;
2425 		unix_state_unlock(sk);
2426 		goto out;
2427 	}
2428 
2429 	if (wq_has_sleeper(&u->peer_wait))
2430 		wake_up_interruptible_sync_poll(&u->peer_wait,
2431 						EPOLLOUT | EPOLLWRNORM |
2432 						EPOLLWRBAND);
2433 
2434 	if (msg->msg_name)
2435 		unix_copy_addr(msg, skb->sk);
2436 
2437 	if (size > skb->len - skip)
2438 		size = skb->len - skip;
2439 	else if (size < skb->len - skip)
2440 		msg->msg_flags |= MSG_TRUNC;
2441 
2442 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2443 	if (err)
2444 		goto out_free;
2445 
2446 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2447 		__sock_recv_timestamp(msg, sk, skb);
2448 
2449 	memset(&scm, 0, sizeof(scm));
2450 
2451 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2452 	unix_set_secdata(&scm, skb);
2453 
2454 	if (!(flags & MSG_PEEK)) {
2455 		if (UNIXCB(skb).fp)
2456 			unix_detach_fds(&scm, skb);
2457 
2458 		sk_peek_offset_bwd(sk, skb->len);
2459 	} else {
2460 		/* It is questionable: on PEEK we could:
2461 		   - not return fds - good, but too simple 8)
2462 		   - return fds, but not return them on read (old strategy,
2463 		     apparently wrong)
2464 		   - clone fds (I chose this for now; it is the most universal
2465 		     solution)
2466 
2467 		   POSIX 1003.1g does not actually define this clearly
2468 		   at all. POSIX 1003.1g doesn't define a lot of things
2469 		   clearly, however!
2470 
2471 		*/
2472 
2473 		sk_peek_offset_fwd(sk, size);
2474 
2475 		if (UNIXCB(skb).fp)
2476 			unix_peek_fds(&scm, skb);
2477 	}
2478 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2479 
2480 	scm_recv(sock, msg, &scm, flags);
2481 
2482 out_free:
2483 	skb_free_datagram(sk, skb);
2484 	mutex_unlock(&u->iolock);
2485 out:
2486 	return err;
2487 }
2488 
2489 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2490 			      int flags)
2491 {
2492 	struct sock *sk = sock->sk;
2493 
2494 #ifdef CONFIG_BPF_SYSCALL
2495 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2496 
2497 	if (prot != &unix_dgram_proto)
2498 		return prot->recvmsg(sk, msg, size, flags, NULL);
2499 #endif
2500 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2501 }
2502 
2503 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
2504 			  sk_read_actor_t recv_actor)
2505 {
2506 	int copied = 0;
2507 
2508 	while (1) {
2509 		struct unix_sock *u = unix_sk(sk);
2510 		struct sk_buff *skb;
2511 		int used, err;
2512 
2513 		mutex_lock(&u->iolock);
2514 		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2515 		mutex_unlock(&u->iolock);
2516 		if (!skb)
2517 			return err;
2518 
2519 		used = recv_actor(desc, skb, 0, skb->len);
2520 		if (used <= 0) {
2521 			if (!copied)
2522 				copied = used;
2523 			kfree_skb(skb);
2524 			break;
2525 		} else if (used <= skb->len) {
2526 			copied += used;
2527 		}
2528 
2529 		kfree_skb(skb);
2530 		if (!desc->count)
2531 			break;
2532 	}
2533 
2534 	return copied;
2535 }
2536 
2537 /*
2538  *	Sleep until more data has arrived. But check for races..
2539  */
2540 static long unix_stream_data_wait(struct sock *sk, long timeo,
2541 				  struct sk_buff *last, unsigned int last_len,
2542 				  bool freezable)
2543 {
2544 	struct sk_buff *tail;
2545 	DEFINE_WAIT(wait);
2546 
2547 	unix_state_lock(sk);
2548 
2549 	for (;;) {
2550 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2551 
2552 		tail = skb_peek_tail(&sk->sk_receive_queue);
2553 		if (tail != last ||
2554 		    (tail && tail->len != last_len) ||
2555 		    sk->sk_err ||
2556 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2557 		    signal_pending(current) ||
2558 		    !timeo)
2559 			break;
2560 
2561 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2562 		unix_state_unlock(sk);
2563 		if (freezable)
2564 			timeo = freezable_schedule_timeout(timeo);
2565 		else
2566 			timeo = schedule_timeout(timeo);
2567 		unix_state_lock(sk);
2568 
2569 		if (sock_flag(sk, SOCK_DEAD))
2570 			break;
2571 
2572 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2573 	}
2574 
2575 	finish_wait(sk_sleep(sk), &wait);
2576 	unix_state_unlock(sk);
2577 	return timeo;
2578 }
2579 
2580 static unsigned int unix_skb_len(const struct sk_buff *skb)
2581 {
2582 	return skb->len - UNIXCB(skb).consumed;
2583 }
2584 
2585 struct unix_stream_read_state {
2586 	int (*recv_actor)(struct sk_buff *, int, int,
2587 			  struct unix_stream_read_state *);
2588 	struct socket *socket;
2589 	struct msghdr *msg;
2590 	struct pipe_inode_info *pipe;
2591 	size_t size;
2592 	int flags;
2593 	unsigned int splice_flags;
2594 };
2595 
2596 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
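/* Handle MSG_OOB reads: return -EINVAL if there is no pending OOB byte
 * or SOCK_URGINLINE is set, otherwise hand the OOB skb to the
 * recv_actor and, unless peeking, detach it from the socket.
 */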
2597 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2598 {
2599 	struct socket *sock = state->socket;
2600 	struct sock *sk = sock->sk;
2601 	struct unix_sock *u = unix_sk(sk);
2602 	int chunk = 1;
2603 	struct sk_buff *oob_skb;
2604 
2605 	mutex_lock(&u->iolock);
2606 	unix_state_lock(sk);
2607 
2608 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2609 		unix_state_unlock(sk);
2610 		mutex_unlock(&u->iolock);
2611 		return -EINVAL;
2612 	}
2613 
2614 	oob_skb = u->oob_skb;
2615 
2616 	if (!(state->flags & MSG_PEEK))
2617 		WRITE_ONCE(u->oob_skb, NULL);
2618 
2619 	unix_state_unlock(sk);
2620 
2621 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2622 
2623 	if (!(state->flags & MSG_PEEK)) {
2624 		UNIXCB(oob_skb).consumed += 1;
2625 		kfree_skb(oob_skb);
2626 	}
2627 
2628 	mutex_unlock(&u->iolock);
2629 
2630 	if (chunk < 0)
2631 		return -EFAULT;
2632 
2633 	state->msg->msg_flags |= MSG_OOB;
2634 	return 1;
2635 }
2636 
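/* Called on the skb at the head of the receive queue: drop fully
 * consumed skbs and, when the skb holds the OOB byte, decide whether to
 * stop the read, deliver the byte inline (SOCK_URGINLINE) or skip past
 * it so normal data is read first.
 */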
2637 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2638 				  int flags, int copied)
2639 {
2640 	struct unix_sock *u = unix_sk(sk);
2641 
2642 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2643 		skb_unlink(skb, &sk->sk_receive_queue);
2644 		consume_skb(skb);
2645 		skb = NULL;
2646 	} else {
2647 		if (skb == u->oob_skb) {
2648 			if (copied) {
2649 				skb = NULL;
2650 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2651 				if (!(flags & MSG_PEEK)) {
2652 					WRITE_ONCE(u->oob_skb, NULL);
2653 					consume_skb(skb);
2654 				}
2655 			} else if (!(flags & MSG_PEEK)) {
2656 				skb_unlink(skb, &sk->sk_receive_queue);
2657 				consume_skb(skb);
2658 				skb = skb_peek(&sk->sk_receive_queue);
2659 			}
2660 		}
2661 	}
2662 	return skb;
2663 }
2664 #endif
2665 
2666 static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
2667 				 sk_read_actor_t recv_actor)
2668 {
2669 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2670 		return -ENOTCONN;
2671 
2672 	return unix_read_sock(sk, desc, recv_actor);
2673 }
2674 
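/* Common receive path for stream recvmsg() and splice_read(): walk the
 * receive queue, copy data through state->recv_actor and handle
 * credentials, fd passing and MSG_PEEK.
 */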
2675 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2676 				    bool freezable)
2677 {
2678 	struct scm_cookie scm;
2679 	struct socket *sock = state->socket;
2680 	struct sock *sk = sock->sk;
2681 	struct unix_sock *u = unix_sk(sk);
2682 	int copied = 0;
2683 	int flags = state->flags;
2684 	int noblock = flags & MSG_DONTWAIT;
2685 	bool check_creds = false;
2686 	int target;
2687 	int err = 0;
2688 	long timeo;
2689 	int skip;
2690 	size_t size = state->size;
2691 	unsigned int last_len;
2692 
2693 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2694 		err = -EINVAL;
2695 		goto out;
2696 	}
2697 
2698 	if (unlikely(flags & MSG_OOB)) {
2699 		err = -EOPNOTSUPP;
2700 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2701 		err = unix_stream_recv_urg(state);
2702 #endif
2703 		goto out;
2704 	}
2705 
2706 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2707 	timeo = sock_rcvtimeo(sk, noblock);
2708 
2709 	memset(&scm, 0, sizeof(scm));
2710 
2711 	/* Lock the socket to prevent queue disordering
2712 	 * while we sleep copying data out to the message
2713 	 */
2714 	mutex_lock(&u->iolock);
2715 
2716 	skip = max(sk_peek_offset(sk, flags), 0);
2717 
2718 	do {
2719 		int chunk;
2720 		bool drop_skb;
2721 		struct sk_buff *skb, *last;
2722 
2723 redo:
2724 		unix_state_lock(sk);
2725 		if (sock_flag(sk, SOCK_DEAD)) {
2726 			err = -ECONNRESET;
2727 			goto unlock;
2728 		}
2729 		last = skb = skb_peek(&sk->sk_receive_queue);
2730 		last_len = last ? last->len : 0;
2731 
2732 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2733 		if (skb) {
2734 			skb = manage_oob(skb, sk, flags, copied);
2735 			if (!skb) {
2736 				unix_state_unlock(sk);
2737 				if (copied)
2738 					break;
2739 				goto redo;
2740 			}
2741 		}
2742 #endif
2743 again:
2744 		if (skb == NULL) {
2745 			if (copied >= target)
2746 				goto unlock;
2747 
2748 			/*
2749 			 *	POSIX 1003.1g mandates this order.
2750 			 */
2751 
2752 			err = sock_error(sk);
2753 			if (err)
2754 				goto unlock;
2755 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2756 				goto unlock;
2757 
2758 			unix_state_unlock(sk);
2759 			if (!timeo) {
2760 				err = -EAGAIN;
2761 				break;
2762 			}
2763 
2764 			mutex_unlock(&u->iolock);
2765 
2766 			timeo = unix_stream_data_wait(sk, timeo, last,
2767 						      last_len, freezable);
2768 
2769 			if (signal_pending(current)) {
2770 				err = sock_intr_errno(timeo);
2771 				scm_destroy(&scm);
2772 				goto out;
2773 			}
2774 
2775 			mutex_lock(&u->iolock);
2776 			goto redo;
2777 unlock:
2778 			unix_state_unlock(sk);
2779 			break;
2780 		}
2781 
2782 		while (skip >= unix_skb_len(skb)) {
2783 			skip -= unix_skb_len(skb);
2784 			last = skb;
2785 			last_len = skb->len;
2786 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2787 			if (!skb)
2788 				goto again;
2789 		}
2790 
2791 		unix_state_unlock(sk);
2792 
2793 		if (check_creds) {
2794 			/* Never glue messages from different writers */
2795 			if (!unix_skb_scm_eq(skb, &scm))
2796 				break;
2797 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2798 			/* Copy credentials */
2799 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2800 			unix_set_secdata(&scm, skb);
2801 			check_creds = true;
2802 		}
2803 
2804 		/* Copy address just once */
2805 		if (state->msg && state->msg->msg_name) {
2806 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2807 					 state->msg->msg_name);
2808 			unix_copy_addr(state->msg, skb->sk);
2809 			sunaddr = NULL;
2810 		}
2811 
2812 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2813 		skb_get(skb);
2814 		chunk = state->recv_actor(skb, skip, chunk, state);
2815 		drop_skb = !unix_skb_len(skb);
2816 		/* skb is only safe to use if !drop_skb */
2817 		consume_skb(skb);
2818 		if (chunk < 0) {
2819 			if (copied == 0)
2820 				copied = -EFAULT;
2821 			break;
2822 		}
2823 		copied += chunk;
2824 		size -= chunk;
2825 
2826 		if (drop_skb) {
2827 			/* the skb was touched by a concurrent reader;
2828 			 * we should not expect anything from this skb
2829 			 * anymore and must assume it is invalid - we can
2830 			 * be sure it was dropped from the socket queue
2831 			 *
2832 			 * let's report a short read
2833 			 */
2834 			err = 0;
2835 			break;
2836 		}
2837 
2838 		/* Mark read part of skb as used */
2839 		if (!(flags & MSG_PEEK)) {
2840 			UNIXCB(skb).consumed += chunk;
2841 
2842 			sk_peek_offset_bwd(sk, chunk);
2843 
2844 			if (UNIXCB(skb).fp) {
2845 				scm_stat_del(sk, skb);
2846 				unix_detach_fds(&scm, skb);
2847 			}
2848 
2849 			if (unix_skb_len(skb))
2850 				break;
2851 
2852 			skb_unlink(skb, &sk->sk_receive_queue);
2853 			consume_skb(skb);
2854 
2855 			if (scm.fp)
2856 				break;
2857 		} else {
2858 			/* It is questionable, see note in unix_dgram_recvmsg.
2859 			 */
2860 			if (UNIXCB(skb).fp)
2861 				unix_peek_fds(&scm, skb);
2862 
2863 			sk_peek_offset_fwd(sk, chunk);
2864 
2865 			if (UNIXCB(skb).fp)
2866 				break;
2867 
2868 			skip = 0;
2869 			last = skb;
2870 			last_len = skb->len;
2871 			unix_state_lock(sk);
2872 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2873 			if (skb)
2874 				goto again;
2875 			unix_state_unlock(sk);
2876 			break;
2877 		}
2878 	} while (size);
2879 
2880 	mutex_unlock(&u->iolock);
2881 	if (state->msg)
2882 		scm_recv(sock, state->msg, &scm, flags);
2883 	else
2884 		scm_destroy(&scm);
2885 out:
2886 	return copied ? : err;
2887 }
2888 
2889 static int unix_stream_read_actor(struct sk_buff *skb,
2890 				  int skip, int chunk,
2891 				  struct unix_stream_read_state *state)
2892 {
2893 	int ret;
2894 
2895 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2896 				    state->msg, chunk);
2897 	return ret ?: chunk;
2898 }
2899 
2900 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2901 			  size_t size, int flags)
2902 {
2903 	struct unix_stream_read_state state = {
2904 		.recv_actor = unix_stream_read_actor,
2905 		.socket = sk->sk_socket,
2906 		.msg = msg,
2907 		.size = size,
2908 		.flags = flags
2909 	};
2910 
2911 	return unix_stream_read_generic(&state, true);
2912 }
2913 
2914 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2915 			       size_t size, int flags)
2916 {
2917 	struct unix_stream_read_state state = {
2918 		.recv_actor = unix_stream_read_actor,
2919 		.socket = sock,
2920 		.msg = msg,
2921 		.size = size,
2922 		.flags = flags
2923 	};
2924 
2925 #ifdef CONFIG_BPF_SYSCALL
2926 	struct sock *sk = sock->sk;
2927 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2928 
2929 	if (prot != &unix_stream_proto)
2930 		return prot->recvmsg(sk, msg, size, flags, NULL);
2931 #endif
2932 	return unix_stream_read_generic(&state, true);
2933 }
2934 
2935 static int unix_stream_splice_actor(struct sk_buff *skb,
2936 				    int skip, int chunk,
2937 				    struct unix_stream_read_state *state)
2938 {
2939 	return skb_splice_bits(skb, state->socket->sk,
2940 			       UNIXCB(skb).consumed + skip,
2941 			       state->pipe, chunk, state->splice_flags);
2942 }
2943 
2944 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2945 				       struct pipe_inode_info *pipe,
2946 				       size_t size, unsigned int flags)
2947 {
2948 	struct unix_stream_read_state state = {
2949 		.recv_actor = unix_stream_splice_actor,
2950 		.socket = sock,
2951 		.pipe = pipe,
2952 		.size = size,
2953 		.splice_flags = flags,
2954 	};
2955 
2956 	if (unlikely(*ppos))
2957 		return -ESPIPE;
2958 
2959 	if (sock->file->f_flags & O_NONBLOCK ||
2960 	    flags & SPLICE_F_NONBLOCK)
2961 		state.flags = MSG_DONTWAIT;
2962 
2963 	return unix_stream_read_generic(&state, false);
2964 }
2965 
2966 static int unix_shutdown(struct socket *sock, int mode)
2967 {
2968 	struct sock *sk = sock->sk;
2969 	struct sock *other;
2970 
2971 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2972 		return -EINVAL;
2973 	/* This maps:
2974 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2975 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2976 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2977 	 */
2978 	++mode;
2979 
2980 	unix_state_lock(sk);
2981 	sk->sk_shutdown |= mode;
2982 	other = unix_peer(sk);
2983 	if (other)
2984 		sock_hold(other);
2985 	unix_state_unlock(sk);
2986 	sk->sk_state_change(sk);
2987 
2988 	if (other &&
2989 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2990 
2991 		int peer_mode = 0;
2992 		const struct proto *prot = READ_ONCE(other->sk_prot);
2993 
2994 		if (prot->unhash)
2995 			prot->unhash(other);
2996 		if (mode&RCV_SHUTDOWN)
2997 			peer_mode |= SEND_SHUTDOWN;
2998 		if (mode&SEND_SHUTDOWN)
2999 			peer_mode |= RCV_SHUTDOWN;
3000 		unix_state_lock(other);
3001 		other->sk_shutdown |= peer_mode;
3002 		unix_state_unlock(other);
3003 		other->sk_state_change(other);
3004 		if (peer_mode == SHUTDOWN_MASK)
3005 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3006 		else if (peer_mode & RCV_SHUTDOWN)
3007 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3008 	}
3009 	if (other)
3010 		sock_put(other);
3011 
3012 	return 0;
3013 }
3014 
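/* Bytes queued for reading (SIOCINQ): all pending data for stream and
 * seqpacket sockets, only the first datagram for others; -EINVAL for a
 * listening socket.
 */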
3015 long unix_inq_len(struct sock *sk)
3016 {
3017 	struct sk_buff *skb;
3018 	long amount = 0;
3019 
3020 	if (sk->sk_state == TCP_LISTEN)
3021 		return -EINVAL;
3022 
3023 	spin_lock(&sk->sk_receive_queue.lock);
3024 	if (sk->sk_type == SOCK_STREAM ||
3025 	    sk->sk_type == SOCK_SEQPACKET) {
3026 		skb_queue_walk(&sk->sk_receive_queue, skb)
3027 			amount += unix_skb_len(skb);
3028 	} else {
3029 		skb = skb_peek(&sk->sk_receive_queue);
3030 		if (skb)
3031 			amount = skb->len;
3032 	}
3033 	spin_unlock(&sk->sk_receive_queue.lock);
3034 
3035 	return amount;
3036 }
3037 EXPORT_SYMBOL_GPL(unix_inq_len);
3038 
3039 long unix_outq_len(struct sock *sk)
3040 {
3041 	return sk_wmem_alloc_get(sk);
3042 }
3043 EXPORT_SYMBOL_GPL(unix_outq_len);
3044 
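/* SIOCUNIXFILE: open an O_PATH file descriptor for the filesystem
 * object the socket is bound to; requires CAP_NET_ADMIN in the socket's
 * network namespace.
 */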
3045 static int unix_open_file(struct sock *sk)
3046 {
3047 	struct path path;
3048 	struct file *f;
3049 	int fd;
3050 
3051 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3052 		return -EPERM;
3053 
3054 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3055 		return -ENOENT;
3056 
3057 	path = unix_sk(sk)->path;
3058 	if (!path.dentry)
3059 		return -ENOENT;
3060 
3061 	path_get(&path);
3062 
3063 	fd = get_unused_fd_flags(O_CLOEXEC);
3064 	if (fd < 0)
3065 		goto out;
3066 
3067 	f = dentry_open(&path, O_PATH, current_cred());
3068 	if (IS_ERR(f)) {
3069 		put_unused_fd(fd);
3070 		fd = PTR_ERR(f);
3071 		goto out;
3072 	}
3073 
3074 	fd_install(fd, f);
3075 out:
3076 	path_put(&path);
3077 
3078 	return fd;
3079 }
3080 
3081 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3082 {
3083 	struct sock *sk = sock->sk;
3084 	long amount = 0;
3085 	int err;
3086 
3087 	switch (cmd) {
3088 	case SIOCOUTQ:
3089 		amount = unix_outq_len(sk);
3090 		err = put_user(amount, (int __user *)arg);
3091 		break;
3092 	case SIOCINQ:
3093 		amount = unix_inq_len(sk);
3094 		if (amount < 0)
3095 			err = amount;
3096 		else
3097 			err = put_user(amount, (int __user *)arg);
3098 		break;
3099 	case SIOCUNIXFILE:
3100 		err = unix_open_file(sk);
3101 		break;
3102 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3103 	case SIOCATMARK:
3104 		{
3105 			struct sk_buff *skb;
3106 			int answ = 0;
3107 
3108 			skb = skb_peek(&sk->sk_receive_queue);
3109 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3110 				answ = 1;
3111 			err = put_user(answ, (int __user *)arg);
3112 		}
3113 		break;
3114 #endif
3115 	default:
3116 		err = -ENOIOCTLCMD;
3117 		break;
3118 	}
3119 	return err;
3120 }
3121 
3122 #ifdef CONFIG_COMPAT
3123 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3124 {
3125 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3126 }
3127 #endif
3128 
3129 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3130 {
3131 	struct sock *sk = sock->sk;
3132 	__poll_t mask;
3133 
3134 	sock_poll_wait(file, sock, wait);
3135 	mask = 0;
3136 
3137 	/* exceptional events? */
3138 	if (sk->sk_err)
3139 		mask |= EPOLLERR;
3140 	if (sk->sk_shutdown == SHUTDOWN_MASK)
3141 		mask |= EPOLLHUP;
3142 	if (sk->sk_shutdown & RCV_SHUTDOWN)
3143 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3144 
3145 	/* readable? */
3146 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3147 		mask |= EPOLLIN | EPOLLRDNORM;
3148 	if (sk_is_readable(sk))
3149 		mask |= EPOLLIN | EPOLLRDNORM;
3150 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3151 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3152 		mask |= EPOLLPRI;
3153 #endif
3154 
3155 	/* Connection-based sockets need to check for termination and startup */
3156 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3157 	    sk->sk_state == TCP_CLOSE)
3158 		mask |= EPOLLHUP;
3159 
3160 	/*
3161 	 * We also set writable when the other side has shut down the
3162 	 * connection. This prevents stuck sockets.
3163 	 */
3164 	if (unix_writable(sk))
3165 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3166 
3167 	return mask;
3168 }
3169 
3170 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3171 				    poll_table *wait)
3172 {
3173 	struct sock *sk = sock->sk, *other;
3174 	unsigned int writable;
3175 	__poll_t mask;
3176 
3177 	sock_poll_wait(file, sock, wait);
3178 	mask = 0;
3179 
3180 	/* exceptional events? */
3181 	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3182 		mask |= EPOLLERR |
3183 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3184 
3185 	if (sk->sk_shutdown & RCV_SHUTDOWN)
3186 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3187 	if (sk->sk_shutdown == SHUTDOWN_MASK)
3188 		mask |= EPOLLHUP;
3189 
3190 	/* readable? */
3191 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3192 		mask |= EPOLLIN | EPOLLRDNORM;
3193 	if (sk_is_readable(sk))
3194 		mask |= EPOLLIN | EPOLLRDNORM;
3195 
3196 	/* Connection-based sockets need to check for termination and startup */
3197 	if (sk->sk_type == SOCK_SEQPACKET) {
3198 		if (sk->sk_state == TCP_CLOSE)
3199 			mask |= EPOLLHUP;
3200 		/* connection hasn't started yet? */
3201 		if (sk->sk_state == TCP_SYN_SENT)
3202 			return mask;
3203 	}
3204 
3205 	/* No write status requested, avoid expensive OUT tests. */
3206 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3207 		return mask;
3208 
3209 	writable = unix_writable(sk);
3210 	if (writable) {
3211 		unix_state_lock(sk);
3212 
3213 		other = unix_peer(sk);
3214 		if (other && unix_peer(other) != sk &&
3215 		    unix_recvq_full_lockless(other) &&
3216 		    unix_dgram_peer_wake_me(sk, other))
3217 			writable = 0;
3218 
3219 		unix_state_unlock(sk);
3220 	}
3221 
3222 	if (writable)
3223 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3224 	else
3225 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3226 
3227 	return mask;
3228 }
3229 
3230 #ifdef CONFIG_PROC_FS
3231 
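/* The seq_file position encodes a hash bucket in the upper bits and a
 * 1-based offset within that bucket in the lower BUCKET_SPACE bits.
 */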
3232 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3233 
3234 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3235 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3236 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3237 
3238 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3239 {
3240 	unsigned long offset = get_offset(*pos);
3241 	unsigned long bucket = get_bucket(*pos);
3242 	unsigned long count = 0;
3243 	struct sock *sk;
3244 
3245 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3246 	     sk; sk = sk_next(sk)) {
3247 		if (++count == offset)
3248 			break;
3249 	}
3250 
3251 	return sk;
3252 }
3253 
3254 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3255 {
3256 	unsigned long bucket = get_bucket(*pos);
3257 	struct net *net = seq_file_net(seq);
3258 	struct sock *sk;
3259 
3260 	while (bucket < UNIX_HASH_SIZE) {
3261 		spin_lock(&net->unx.table.locks[bucket]);
3262 
3263 		sk = unix_from_bucket(seq, pos);
3264 		if (sk)
3265 			return sk;
3266 
3267 		spin_unlock(&net->unx.table.locks[bucket]);
3268 
3269 		*pos = set_bucket_offset(++bucket, 1);
3270 	}
3271 
3272 	return NULL;
3273 }
3274 
3275 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3276 				  loff_t *pos)
3277 {
3278 	unsigned long bucket = get_bucket(*pos);
3279 
3280 	sk = sk_next(sk);
3281 	if (sk)
3282 		return sk;
3283 
3285 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3286 
3287 	*pos = set_bucket_offset(++bucket, 1);
3288 
3289 	return unix_get_first(seq, pos);
3290 }
3291 
3292 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3293 {
3294 	if (!*pos)
3295 		return SEQ_START_TOKEN;
3296 
3297 	return unix_get_first(seq, pos);
3298 }
3299 
3300 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3301 {
3302 	++*pos;
3303 
3304 	if (v == SEQ_START_TOKEN)
3305 		return unix_get_first(seq, pos);
3306 
3307 	return unix_get_next(seq, v, pos);
3308 }
3309 
3310 static void unix_seq_stop(struct seq_file *seq, void *v)
3311 {
3312 	struct sock *sk = v;
3313 
3314 	if (sk)
3315 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3316 }
3317 
3318 static int unix_seq_show(struct seq_file *seq, void *v)
3319 {
3320 
3321 	if (v == SEQ_START_TOKEN)
3322 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3323 			 "Inode Path\n");
3324 	else {
3325 		struct sock *s = v;
3326 		struct unix_sock *u = unix_sk(s);
3327 		unix_state_lock(s);
3328 
3329 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3330 			s,
3331 			refcount_read(&s->sk_refcnt),
3332 			0,
3333 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3334 			s->sk_type,
3335 			s->sk_socket ?
3336 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3337 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3338 			sock_i_ino(s));
3339 
3340 		if (u->addr) {	/* under a hash table lock here */
3341 			int i, len;
3342 			seq_putc(seq, ' ');
3343 
3344 			i = 0;
3345 			len = u->addr->len -
3346 				offsetof(struct sockaddr_un, sun_path);
3347 			if (u->addr->name->sun_path[0]) {
3348 				len--;
3349 			} else {
3350 				seq_putc(seq, '@');
3351 				i++;
3352 			}
3353 			for ( ; i < len; i++)
3354 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3355 					 '@');
3356 		}
3357 		unix_state_unlock(s);
3358 		seq_putc(seq, '\n');
3359 	}
3360 
3361 	return 0;
3362 }
3363 
3364 static const struct seq_operations unix_seq_ops = {
3365 	.start  = unix_seq_start,
3366 	.next   = unix_seq_next,
3367 	.stop   = unix_seq_stop,
3368 	.show   = unix_seq_show,
3369 };
3370 
3371 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3372 struct bpf_unix_iter_state {
3373 	struct seq_net_private p;
3374 	unsigned int cur_sk;
3375 	unsigned int end_sk;
3376 	unsigned int max_sk;
3377 	struct sock **batch;
3378 	bool st_bucket_done;
3379 };
3380 
3381 struct bpf_iter__unix {
3382 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3383 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3384 	uid_t uid __aligned(8);
3385 };
3386 
3387 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3388 			      struct unix_sock *unix_sk, uid_t uid)
3389 {
3390 	struct bpf_iter__unix ctx;
3391 
3392 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3393 	ctx.meta = meta;
3394 	ctx.unix_sk = unix_sk;
3395 	ctx.uid = uid;
3396 	return bpf_iter_run_prog(prog, &ctx);
3397 }
3398 
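/* Take references on the sockets of the current hash bucket so the BPF
 * iterator can show them after dropping the bucket lock; returns how
 * many sockets the bucket actually contained, which may exceed what fit
 * in the batch.
 */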
3399 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3401 {
3402 	struct bpf_unix_iter_state *iter = seq->private;
3403 	unsigned int expected = 1;
3404 	struct sock *sk;
3405 
3406 	sock_hold(start_sk);
3407 	iter->batch[iter->end_sk++] = start_sk;
3408 
3409 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3410 		if (iter->end_sk < iter->max_sk) {
3411 			sock_hold(sk);
3412 			iter->batch[iter->end_sk++] = sk;
3413 		}
3414 
3415 		expected++;
3416 	}
3417 
3418 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3419 
3420 	return expected;
3421 }
3422 
3423 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3424 {
3425 	while (iter->cur_sk < iter->end_sk)
3426 		sock_put(iter->batch[iter->cur_sk++]);
3427 }
3428 
3429 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3430 				       unsigned int new_batch_sz)
3431 {
3432 	struct sock **new_batch;
3433 
3434 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3435 			     GFP_USER | __GFP_NOWARN);
3436 	if (!new_batch)
3437 		return -ENOMEM;
3438 
3439 	bpf_iter_unix_put_batch(iter);
3440 	kvfree(iter->batch);
3441 	iter->batch = new_batch;
3442 	iter->max_sk = new_batch_sz;
3443 
3444 	return 0;
3445 }
3446 
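/* Fill the iterator's batch from the next non-empty bucket, growing the
 * batch array and retrying once if the bucket did not fit.
 */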
3447 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3448 					loff_t *pos)
3449 {
3450 	struct bpf_unix_iter_state *iter = seq->private;
3451 	unsigned int expected;
3452 	bool resized = false;
3453 	struct sock *sk;
3454 
3455 	if (iter->st_bucket_done)
3456 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3457 
3458 again:
3459 	/* Get a new batch */
3460 	iter->cur_sk = 0;
3461 	iter->end_sk = 0;
3462 
3463 	sk = unix_get_first(seq, pos);
3464 	if (!sk)
3465 		return NULL; /* Done */
3466 
3467 	expected = bpf_iter_unix_hold_batch(seq, sk);
3468 
3469 	if (iter->end_sk == expected) {
3470 		iter->st_bucket_done = true;
3471 		return sk;
3472 	}
3473 
3474 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3475 		resized = true;
3476 		goto again;
3477 	}
3478 
3479 	return sk;
3480 }
3481 
3482 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3483 {
3484 	if (!*pos)
3485 		return SEQ_START_TOKEN;
3486 
3487 	/* bpf iter does not support lseek, so it always
3488 	 * continues from where it was stop()-ped.
3489 	 */
3490 	return bpf_iter_unix_batch(seq, pos);
3491 }
3492 
3493 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3494 {
3495 	struct bpf_unix_iter_state *iter = seq->private;
3496 	struct sock *sk;
3497 
3498 	/* Whenever seq_next() is called, the socket at iter->cur_sk is
3499 	 * done with seq_show(), so advance to the next sk in
3500 	 * the batch.
3501 	 */
3502 	if (iter->cur_sk < iter->end_sk)
3503 		sock_put(iter->batch[iter->cur_sk++]);
3504 
3505 	++*pos;
3506 
3507 	if (iter->cur_sk < iter->end_sk)
3508 		sk = iter->batch[iter->cur_sk];
3509 	else
3510 		sk = bpf_iter_unix_batch(seq, pos);
3511 
3512 	return sk;
3513 }
3514 
3515 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3516 {
3517 	struct bpf_iter_meta meta;
3518 	struct bpf_prog *prog;
3519 	struct sock *sk = v;
3520 	uid_t uid;
3521 	bool slow;
3522 	int ret;
3523 
3524 	if (v == SEQ_START_TOKEN)
3525 		return 0;
3526 
3527 	slow = lock_sock_fast(sk);
3528 
3529 	if (unlikely(sk_unhashed(sk))) {
3530 		ret = SEQ_SKIP;
3531 		goto unlock;
3532 	}
3533 
3534 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3535 	meta.seq = seq;
3536 	prog = bpf_iter_get_info(&meta, false);
3537 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3538 unlock:
3539 	unlock_sock_fast(sk, slow);
3540 	return ret;
3541 }
3542 
3543 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3544 {
3545 	struct bpf_unix_iter_state *iter = seq->private;
3546 	struct bpf_iter_meta meta;
3547 	struct bpf_prog *prog;
3548 
3549 	if (!v) {
3550 		meta.seq = seq;
3551 		prog = bpf_iter_get_info(&meta, true);
3552 		if (prog)
3553 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3554 	}
3555 
3556 	if (iter->cur_sk < iter->end_sk)
3557 		bpf_iter_unix_put_batch(iter);
3558 }
3559 
3560 static const struct seq_operations bpf_iter_unix_seq_ops = {
3561 	.start	= bpf_iter_unix_seq_start,
3562 	.next	= bpf_iter_unix_seq_next,
3563 	.stop	= bpf_iter_unix_seq_stop,
3564 	.show	= bpf_iter_unix_seq_show,
3565 };
3566 #endif
3567 #endif
3568 
3569 static const struct net_proto_family unix_family_ops = {
3570 	.family = PF_UNIX,
3571 	.create = unix_create,
3572 	.owner	= THIS_MODULE,
3573 };
3574 
3575 
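/* Per-network-namespace initialisation: register the sysctl, create
 * /proc/net/unix and allocate the per-netns hash table locks and
 * buckets.
 */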
3576 static int __net_init unix_net_init(struct net *net)
3577 {
3578 	int i;
3579 
3580 	net->unx.sysctl_max_dgram_qlen = 10;
3581 	if (unix_sysctl_register(net))
3582 		goto out;
3583 
3584 #ifdef CONFIG_PROC_FS
3585 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3586 			     sizeof(struct seq_net_private)))
3587 		goto err_sysctl;
3588 #endif
3589 
3590 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3591 					      sizeof(spinlock_t), GFP_KERNEL);
3592 	if (!net->unx.table.locks)
3593 		goto err_proc;
3594 
3595 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3596 						sizeof(struct hlist_head),
3597 						GFP_KERNEL);
3598 	if (!net->unx.table.buckets)
3599 		goto free_locks;
3600 
3601 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3602 		spin_lock_init(&net->unx.table.locks[i]);
3603 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3604 	}
3605 
3606 	return 0;
3607 
3608 free_locks:
3609 	kvfree(net->unx.table.locks);
3610 err_proc:
3611 #ifdef CONFIG_PROC_FS
3612 	remove_proc_entry("unix", net->proc_net);
3613 err_sysctl:
3614 #endif
3615 	unix_sysctl_unregister(net);
3616 out:
3617 	return -ENOMEM;
3618 }
3619 
3620 static void __net_exit unix_net_exit(struct net *net)
3621 {
3622 	kvfree(net->unx.table.buckets);
3623 	kvfree(net->unx.table.locks);
3624 	unix_sysctl_unregister(net);
3625 	remove_proc_entry("unix", net->proc_net);
3626 }
3627 
3628 static struct pernet_operations unix_net_ops = {
3629 	.init = unix_net_init,
3630 	.exit = unix_net_exit,
3631 };
3632 
3633 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3634 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3635 		     struct unix_sock *unix_sk, uid_t uid)
3636 
3637 #define INIT_BATCH_SZ 16
3638 
3639 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3640 {
3641 	struct bpf_unix_iter_state *iter = priv_data;
3642 	int err;
3643 
3644 	err = bpf_iter_init_seq_net(priv_data, aux);
3645 	if (err)
3646 		return err;
3647 
3648 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3649 	if (err) {
3650 		bpf_iter_fini_seq_net(priv_data);
3651 		return err;
3652 	}
3653 
3654 	return 0;
3655 }
3656 
3657 static void bpf_iter_fini_unix(void *priv_data)
3658 {
3659 	struct bpf_unix_iter_state *iter = priv_data;
3660 
3661 	bpf_iter_fini_seq_net(priv_data);
3662 	kvfree(iter->batch);
3663 }
3664 
3665 static const struct bpf_iter_seq_info unix_seq_info = {
3666 	.seq_ops		= &bpf_iter_unix_seq_ops,
3667 	.init_seq_private	= bpf_iter_init_unix,
3668 	.fini_seq_private	= bpf_iter_fini_unix,
3669 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3670 };
3671 
3672 static const struct bpf_func_proto *
3673 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3674 			     const struct bpf_prog *prog)
3675 {
3676 	switch (func_id) {
3677 	case BPF_FUNC_setsockopt:
3678 		return &bpf_sk_setsockopt_proto;
3679 	case BPF_FUNC_getsockopt:
3680 		return &bpf_sk_getsockopt_proto;
3681 	default:
3682 		return NULL;
3683 	}
3684 }
3685 
3686 static struct bpf_iter_reg unix_reg_info = {
3687 	.target			= "unix",
3688 	.ctx_arg_info_size	= 1,
3689 	.ctx_arg_info		= {
3690 		{ offsetof(struct bpf_iter__unix, unix_sk),
3691 		  PTR_TO_BTF_ID_OR_NULL },
3692 	},
3693 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3694 	.seq_info		= &unix_seq_info,
3695 };
3696 
3697 static void __init bpf_iter_register(void)
3698 {
3699 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3700 	if (bpf_iter_reg_target(&unix_reg_info))
3701 		pr_warn("Warning: could not register bpf iterator unix\n");
3702 }
3703 #endif
3704 
3705 static int __init af_unix_init(void)
3706 {
3707 	int i, rc = -1;
3708 
3709 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3710 
3711 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3712 		spin_lock_init(&bsd_socket_locks[i]);
3713 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3714 	}
3715 
3716 	rc = proto_register(&unix_dgram_proto, 1);
3717 	if (rc != 0) {
3718 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3719 		goto out;
3720 	}
3721 
3722 	rc = proto_register(&unix_stream_proto, 1);
3723 	if (rc != 0) {
3724 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3725 		goto out;
3726 	}
3727 
3728 	sock_register(&unix_family_ops);
3729 	register_pernet_subsys(&unix_net_ops);
3730 	unix_bpf_build_proto();
3731 
3732 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3733 	bpf_iter_register();
3734 #endif
3735 
3736 out:
3737 	return rc;
3738 }
3739 
3740 static void __exit af_unix_exit(void)
3741 {
3742 	sock_unregister(PF_UNIX);
3743 	proto_unregister(&unix_dgram_proto);
3744 	proto_unregister(&unix_stream_proto);
3745 	unregister_pernet_subsys(&unix_net_ops);
3746 }
3747 
3748 /* Earlier than device_initcall() so that other drivers invoking
3749    request_module() don't end up in a loop when modprobe tries
3750    to use a UNIX socket. But later than subsys_initcall() because
3751    we depend on stuff initialised there */
3752 fs_initcall(af_unix_init);
3753 module_exit(af_unix_exit);
3754 
3755 MODULE_LICENSE("GPL");
3756 MODULE_ALIAS_NETPROTO(PF_UNIX);
3757