xref: /linux/net/unix/af_unix.c (revision 24aeeb107f0724fa15e16d5f28b39f3c3ecfc746)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					is been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/freezer.h>
116 #include <linux/file.h>
117 #include <linux/btf_ids.h>
118 
119 #include "scm.h"
120 
/* Running count of all AF_UNIX sockets; bounded against
 * 2 * get_max_files() in unix_create1().
 */
static atomic_long_t unix_nr_socks;
/* Global (not per-netns) table of filesystem-bound sockets, keyed by
 * backing inode; used by unix_find_socket_byinode().
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
/* Hash for an abstract (leading-NUL) address: fold an Internet-style
 * checksum of the whole address, mix in the socket type, and map the
 * result into the upper half of the hash space so it cannot collide
 * with unix_unbound_hash() values.
 */
static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
158 
/* Lock two hash buckets of the per-netns socket table.  When the
 * buckets differ, always take the lower-indexed lock first so that
 * concurrent double-locks cannot deadlock (ABBA avoidance).
 */
static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}

/* Release the locks taken by unix_table_double_lock(). */
static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}
185 
#ifdef CONFIG_SECURITY_NETWORK
/* Stash the sender's LSM security id in the skb control block. */
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

/* Copy the security id from a received skb into the scm cookie. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

/* True if skb carries the same security id as the current scm cookie. */
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
/* No-op stubs when LSM network hooks are compiled out. */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */
213 
#define unix_peer(sk) (unix_sk(sk)->peer)

/* Is sk the peer that osk is connected to? */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

/* sk may send to osk if osk is unconnected or connected back to sk. */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

/* Receive queue above the configured backlog?  Plain reads; for the
 * annotated lockless variant see unix_recvq_full_lockless().
 */
static inline int unix_recvq_full(const struct sock *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

/* Lockless variant: annotated reads only, result is a racy snapshot. */
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) >
		READ_ONCE(sk->sk_max_ack_backlog);
}
236 
/* Return s's connected peer with a reference held, or NULL if s has
 * no peer.  The peer pointer is sampled under the socket state lock.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *p;

	unix_state_lock(s);
	p = unix_peer(s);
	if (p)
		sock_hold(p);
	unix_state_unlock(s);

	return p;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
249 
250 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
251 					     int addr_len)
252 {
253 	struct unix_address *addr;
254 
255 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
256 	if (!addr)
257 		return NULL;
258 
259 	refcount_set(&addr->refcnt, 1);
260 	addr->len = addr_len;
261 	memcpy(addr->name, sunaddr, addr_len);
262 
263 	return addr;
264 }
265 
266 static inline void unix_release_addr(struct unix_address *addr)
267 {
268 	if (refcount_dec_and_test(&addr->refcnt))
269 		kfree(addr);
270 }
271 
272 /*
273  *	Check unix socket name:
274  *		- should be not zero length.
275  *	        - if started by not zero, should be NULL terminated (FS object)
276  *		- if started by zero, it is abstract name.
277  */
278 
279 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
280 {
281 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
282 	    addr_len > sizeof(*sunaddr))
283 		return -EINVAL;
284 
285 	if (sunaddr->sun_family != AF_UNIX)
286 		return -EINVAL;
287 
288 	return 0;
289 }
290 
/* NUL-terminate a pathname address in place so sun_path can be used
 * as a C string.  This may look like an off by one error but it is a
 * bit more subtle.  108 is the longest valid AF_UNIX path for a
 * binding.  sun_path[108] doesn't as such exist.  However in kernel
 * space we are guaranteed that it is a valid memory location in our
 * kernel address buffer because syscall functions always pass
 * a pointer of struct sockaddr_storage which has a bigger buffer
 * than 108.
 */
static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	((char *)sunaddr)[addr_len] = 0;
}
303 
/* Unhash sk from its current bucket.  Caller holds the bucket lock. */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

/* Hash sk into the per-netns bucket selected by sk->sk_hash.  Caller
 * holds the bucket lock; sk must not already be hashed.
 */
static void __unix_insert_socket(struct net *net, struct sock *sk)
{
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}

/* Publish a new address for sk and rehash it accordingly.  Caller must
 * hold the relevant bucket lock(s).  The store-release pairs with
 * lockless readers of unix_sk(sk)->addr.
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}
324 
/* Locked wrapper: unhash sk from its per-netns bucket. */
static void unix_remove_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_remove_socket(sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

/* Locked wrapper: hash a freshly created (unbound) socket. */
static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
	spin_lock(&net->unx.table.locks[sk->sk_hash]);
	__unix_insert_socket(net, sk);
	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}
338 
/* Add a filesystem-bound socket to the global inode-keyed table used
 * by unix_find_socket_byinode().  sk->sk_hash selects the bucket.
 */
static void unix_insert_bsd_socket(struct sock *sk)
{
	spin_lock(&bsd_socket_locks[sk->sk_hash]);
	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

/* Remove sk from the inode-keyed table, if it was ever added. */
static void unix_remove_bsd_socket(struct sock *sk)
{
	if (!hlist_unhashed(&sk->sk_bind_node)) {
		spin_lock(&bsd_socket_locks[sk->sk_hash]);
		__sk_del_bind_node(sk);
		spin_unlock(&bsd_socket_locks[sk->sk_hash]);

		sk_node_init(&sk->sk_bind_node);
	}
}
356 
357 static struct sock *__unix_find_socket_byname(struct net *net,
358 					      struct sockaddr_un *sunname,
359 					      int len, unsigned int hash)
360 {
361 	struct sock *s;
362 
363 	sk_for_each(s, &net->unx.table.buckets[hash]) {
364 		struct unix_sock *u = unix_sk(s);
365 
366 		if (u->addr->len == len &&
367 		    !memcmp(u->addr->name, sunname, len))
368 			return s;
369 	}
370 	return NULL;
371 }
372 
373 static inline struct sock *unix_find_socket_byname(struct net *net,
374 						   struct sockaddr_un *sunname,
375 						   int len, unsigned int hash)
376 {
377 	struct sock *s;
378 
379 	spin_lock(&net->unx.table.locks[hash]);
380 	s = __unix_find_socket_byname(net, sunname, len, hash);
381 	if (s)
382 		sock_hold(s);
383 	spin_unlock(&net->unx.table.locks[hash]);
384 	return s;
385 }
386 
/* Look up a filesystem-bound socket by the inode its path resolves to.
 * Returns the socket with a reference held, or NULL.  Note the early
 * unlock-and-return inside the loop on a match.
 */
static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}
405 
406 /* Support code for asymmetrically connected dgram sockets
407  *
408  * If a datagram socket is connected to a socket not itself connected
409  * to the first socket (eg, /dev/log), clients may only enqueue more
410  * messages if the present receive queue of the server socket is not
411  * "too large". This means there's a second writeability condition
412  * poll and sendmsg need to test. The dgram recv code will do a wake
413  * up on the peer_wait wait queue of a socket upon reception of a
414  * datagram which needs to be propagated to sleeping would-be writers
415  * since these might not have sent anything so far. This can't be
416  * accomplished via poll_wait because the lifetime of the server
417  * socket might be less than that of its clients if these break their
418  * association with it or if the server socket is closed while clients
419  * are still connected to it and there's no way to inform "a polling
420  * implementation" that it should let go of a certain wait queue
421  *
422  * In order to propagate a wake up, a wait_queue_entry_t of the client
423  * socket is enqueued on the peer_wait queue of the server socket
424  * whose wake function does a wake_up on the ordinary client socket
425  * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when the
427  * association to the server socket is dissolved or after a wake up
428  * was relayed.
429  */
430 
/* Wake function installed on a peer's peer_wait queue (see the comment
 * block above).  One-shot: it detaches itself from the peer's queue and
 * relays the wake up to the client socket's own wait queue, if that
 * still exists.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}
450 
451 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
452 {
453 	struct unix_sock *u, *u_other;
454 	int rc;
455 
456 	u = unix_sk(sk);
457 	u_other = unix_sk(other);
458 	rc = 0;
459 	spin_lock(&u_other->peer_wait.lock);
460 
461 	if (!u->peer_wake.private) {
462 		u->peer_wake.private = other;
463 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
464 
465 		rc = 1;
466 	}
467 
468 	spin_unlock(&u_other->peer_wait.lock);
469 	return rc;
470 }
471 
472 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
473 					    struct sock *other)
474 {
475 	struct unix_sock *u, *u_other;
476 
477 	u = unix_sk(sk);
478 	u_other = unix_sk(other);
479 	spin_lock(&u_other->peer_wait.lock);
480 
481 	if (u->peer_wake.private == other) {
482 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
483 		u->peer_wake.private = NULL;
484 	}
485 
486 	spin_unlock(&u_other->peer_wait.lock);
487 }
488 
/* Drop the relay association to "other" and wake any POLLOUT waiters
 * sleeping on sk, since their wait condition may have changed.
 */
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   EPOLLOUT |
				   EPOLLWRNORM |
				   EPOLLWRBAND);
}
498 
/* Decide whether the caller must sleep waiting for the peer to drain.
 * Returns 1 if the peer's queue is full (relay stays armed so a later
 * reception wakes us); returns 0 otherwise, disconnecting any relay we
 * just established.
 *
 * preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and its full, we will hang waiting for POLLOUT.
	 */
	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}
522 
523 static int unix_writable(const struct sock *sk)
524 {
525 	return sk->sk_state != TCP_LISTEN &&
526 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
527 }
528 
/* sk->sk_write_space callback: when the socket becomes writable, wake
 * POLLOUT waiters and deliver async (SIGIO) notification.  The wait
 * queue is accessed under RCU.
 */
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
543 
/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		/* Writers may have been blocked on our full queue. */
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			sk_error_report(other);
		}
	}
	other->sk_state = TCP_CLOSE;
}
565 
/* sk->sk_destruct callback, run when the last reference to sk is
 * dropped: purge queued skbs (including any pending OOB skb), free the
 * bound address, and update socket accounting.
 */
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif
	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Should never happen: bail out rather than free live state. */
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
596 
/* Tear down sk on close (embrion != 0 marks an unaccepted embryo being
 * destroyed with its listener): unhash, orphan and shut down the
 * socket, notify the peer, flush the receive queue, drop the
 * filesystem reference, and kick the fd garbage collector if any
 * SCM-passed fds are still in flight.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* A listener's queue holds embryo sockets; release those too. */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
672 
673 static void init_peercred(struct sock *sk)
674 {
675 	const struct cred *old_cred;
676 	struct pid *old_pid;
677 
678 	spin_lock(&sk->sk_peer_lock);
679 	old_pid = sk->sk_peer_pid;
680 	old_cred = sk->sk_peer_cred;
681 	sk->sk_peer_pid  = get_pid(task_tgid(current));
682 	sk->sk_peer_cred = get_current_cred();
683 	spin_unlock(&sk->sk_peer_lock);
684 
685 	put_pid(old_pid);
686 	put_cred(old_cred);
687 }
688 
/* Copy peersk's peer credentials into sk.  Both sk_peer_locks are
 * taken in pointer order to avoid ABBA deadlock; old values are
 * released after the locks are dropped.
 */
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	const struct cred *old_cred;
	struct pid *old_pid;

	if (sk < peersk) {
		spin_lock(&sk->sk_peer_lock);
		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&peersk->sk_peer_lock);
		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
	}
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);

	spin_unlock(&sk->sk_peer_lock);
	spin_unlock(&peersk->sk_peer_lock);

	put_pid(old_pid);
	put_cred(old_cred);
}
712 
/* listen(2) handler for stream/seqpacket sockets.  Requires a bound
 * socket in CLOSE or LISTEN state; sets the backlog and transitions
 * to TCP_LISTEN.  Returns 0 or a negative errno.
 */
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	/* Raising the backlog may unblock queued connectors. */
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
out:
	return err;
}
741 
/* Forward declarations for the proto_ops tables defined below. */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
773 
774 static int unix_set_peek_off(struct sock *sk, int val)
775 {
776 	struct unix_sock *u = unix_sk(sk);
777 
778 	if (mutex_lock_interruptible(&u->iolock))
779 		return -EINTR;
780 
781 	sk->sk_peek_off = val;
782 	mutex_unlock(&u->iolock);
783 
784 	return 0;
785 }
786 
#ifdef CONFIG_PROC_FS
/* Sum the SCM-passed fds held in sockets queued on a listener's
 * receive queue (embryo connections awaiting accept()).
 */
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

/* Emit a "scm_fds:" line (fds currently queued on this socket) into
 * /proc/<pid>/fdinfo/<fd>.  Listeners aggregate over their embryos.
 */
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	int nr_fds;

	if (sk) {
		u = unix_sk(sk);
		if (sock->type == SOCK_DGRAM) {
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
			goto out_print;
		}

		unix_state_lock(sk);
		if (sk->sk_state != TCP_LISTEN)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else
			nr_fds = unix_count_nr_fds(sk);
		unix_state_unlock(sk);
out_print:
		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif
832 
/* proto_ops for SOCK_STREAM sockets. */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.read_skb =	unix_stream_read_skb,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

/* proto_ops for SOCK_DGRAM sockets (no accept/listen). */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_dgram_sendmsg,
	.read_skb =	unix_read_skb,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};

/* proto_ops for SOCK_SEQPACKET: stream-style connect/accept with
 * datagram-style polling and message boundaries.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	unix_compat_ioctl,
#endif
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
	.show_fdinfo =	unix_show_fdinfo,
};
907 
/* Intentionally empty proto callbacks, required by sockmap/BPF. */
static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}

static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}
921 
/* struct proto used for SOCK_DGRAM and SOCK_SEQPACKET sockets (see
 * unix_create1()).  Non-static: referenced by the BPF sockmap code.
 */
struct proto unix_dgram_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
#endif
};

/* struct proto used for SOCK_STREAM sockets. */
struct proto unix_stream_proto = {
	.name			= "UNIX-STREAM",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
	.close			= unix_close,
	.unhash			= unix_unhash,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
#endif
};
942 
/* Allocate and initialize one AF_UNIX sock of the given type, hash it
 * as unbound, and update accounting.  Returns the sock or ERR_PTR
 * (-ENFILE when over the global limit, -ENOMEM on allocation failure).
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	int err;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
		err = -ENFILE;
		goto err;
	}

	if (type == SOCK_STREAM)
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
	else /*dgram and  seqpacket */
		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

	if (!sk) {
		err = -ENOMEM;
		goto err;
	}

	sock_init_data(sock, sk);

	sk->sk_hash		= unix_unbound_hash(sk);
	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
	unix_insert_unbound_socket(net, sk);

	sock_prot_inuse_add(net, sk->sk_prot, 1);

	return sk;

err:
	atomic_long_dec(&unix_nr_socks);
	return ERR_PTR(err);
}
993 
994 static int unix_create(struct net *net, struct socket *sock, int protocol,
995 		       int kern)
996 {
997 	struct sock *sk;
998 
999 	if (protocol && protocol != PF_UNIX)
1000 		return -EPROTONOSUPPORT;
1001 
1002 	sock->state = SS_UNCONNECTED;
1003 
1004 	switch (sock->type) {
1005 	case SOCK_STREAM:
1006 		sock->ops = &unix_stream_ops;
1007 		break;
1008 		/*
1009 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1010 		 *	nothing uses it.
1011 		 */
1012 	case SOCK_RAW:
1013 		sock->type = SOCK_DGRAM;
1014 		fallthrough;
1015 	case SOCK_DGRAM:
1016 		sock->ops = &unix_dgram_ops;
1017 		break;
1018 	case SOCK_SEQPACKET:
1019 		sock->ops = &unix_seqpacket_ops;
1020 		break;
1021 	default:
1022 		return -ESOCKTNOSUPPORT;
1023 	}
1024 
1025 	sk = unix_create1(net, sock, kern, sock->type);
1026 	if (IS_ERR(sk))
1027 		return PTR_ERR(sk);
1028 
1029 	return 0;
1030 }
1031 
1032 static int unix_release(struct socket *sock)
1033 {
1034 	struct sock *sk = sock->sk;
1035 
1036 	if (!sk)
1037 		return 0;
1038 
1039 	sk->sk_prot->close(sk, 0);
1040 	unix_release_sock(sk, 0);
1041 	sock->sk = NULL;
1042 
1043 	return 0;
1044 }
1045 
/* Look up the listening/target socket bound to a pathname (filesystem)
 * AF_UNIX address.  The path is resolved through the VFS, the caller
 * must have write permission on it, the inode must be a socket, and the
 * bound socket's type must match @type.  On success the returned sock
 * carries the reference taken by unix_find_socket_byinode() and the
 * path's atime has been updated; on failure an ERR_PTR() is returned.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	/* presumably canonicalizes/terminates sun_path in place — see
	 * unix_mkname_bsd(); TODO confirm against its definition.
	 */
	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	/* Takes a reference on the returned sock. */
	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}
1089 
/* Look up a socket bound in the abstract namespace by name and type.
 * Returns the sock with a reference held, or ERR_PTR(-ECONNREFUSED)
 * when no matching socket exists.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	if (unix_sk(sk)->path.dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1108 
1109 static struct sock *unix_find_other(struct net *net,
1110 				    struct sockaddr_un *sunaddr,
1111 				    int addr_len, int type)
1112 {
1113 	struct sock *sk;
1114 
1115 	if (sunaddr->sun_path[0])
1116 		sk = unix_find_bsd(sunaddr, addr_len, type);
1117 	else
1118 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1119 
1120 	return sk;
1121 }
1122 
/* Bind @sk to an autogenerated abstract-namespace name of the form
 * "\0XXXXX" (five hex digits, so a 2^20 name space).  Returns 0 on
 * success or if the socket is already bound, -ENOSPC once every
 * candidate ordinal has been probed, or a -EINTR-style error from the
 * interruptible mutex acquisition.
 */
static int unix_autobind(struct sock *sk)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	u32 lastnum, ordernum;
	int err;

	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		return err;

	/* Already bound (possibly by a concurrent bind); err is 0 here. */
	if (u->addr)
		goto out;

	err = -ENOMEM;
	/* Room for "\0" + 5 hex digits + NUL; kzalloc leaves the leading
	 * NUL that marks the abstract namespace.
	 */
	addr = kzalloc(sizeof(*addr) +
		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
	addr->name->sun_family = AF_UNIX;
	refcount_set(&addr->refcnt, 1);

	/* Start probing at a random 20-bit ordinal to spread names out. */
	ordernum = prandom_u32();
	lastnum = ordernum & 0xFFFFF;
retry:
	ordernum = (ordernum + 1) & 0xFFFFF;
	sprintf(addr->name->sun_path + 1, "%05x", ordernum);

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
		unix_table_double_unlock(net, old_hash, new_hash);

		/* __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();

		if (ordernum == lastnum) {
			/* Give up if all names seems to be in use. */
			err = -ENOSPC;
			unix_release_addr(addr);
			goto out;
		}

		goto retry;
	}

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	err = 0;

out:	mutex_unlock(&u->bindlock);
	return err;
}
1183 
/* Bind @sk to a pathname address: create the socket inode in the
 * filesystem via mknod, then rehash the socket under the new inode's
 * hash.  Returns 0 on success, -EADDRINUSE when the path already
 * exists, -EINVAL when the socket is already bound, or a VFS/security
 * error.  On any failure after the mknod succeeded, the freshly
 * created node is unlinked again.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
			 int addr_len)
{
	umode_t mode = S_IFSOCK |
	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct user_namespace *ns; /* userns of parent mount, for vfs_mknod()/vfs_unlink() */
	struct unix_address *addr;
	struct dentry *dentry;
	struct path parent;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	/* Recompute the length from the (now terminated) path string. */
	addr_len = strlen(sunaddr->sun_path) +
		offsetof(struct sockaddr_un, sun_path) + 1;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
	if (IS_ERR(dentry)) {
		err = PTR_ERR(dentry);
		goto out;
	}

	/*
	 * All right, let's create it.
	 */
	ns = mnt_user_ns(parent.mnt);
	err = security_path_mknod(&parent, dentry, mode, 0);
	if (!err)
		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
	if (err)
		goto out_path;
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out_unlink;
	if (u->addr)
		goto out_unlock;

	new_hash = unix_bsd_hash(d_backing_inode(dentry));
	unix_table_double_lock(net, old_hash, new_hash);
	u->path.mnt = mntget(parent.mnt);
	u->path.dentry = dget(dentry);
	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	unix_insert_bsd_socket(sk);
	mutex_unlock(&u->bindlock);
	done_path_create(&parent, dentry);
	return 0;

out_unlock:
	mutex_unlock(&u->bindlock);
	err = -EINVAL;
out_unlink:
	/* failed after successful mknod?  unlink what we'd created... */
	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
out_path:
	done_path_create(&parent, dentry);
out:
	unix_release_addr(addr);
	/* a pre-existing path shows up as -EEXIST from mknod */
	return err == -EEXIST ? -EADDRINUSE : err;
}
1254 
/* Bind @sk to an abstract-namespace address.  Returns 0 on success,
 * -EINVAL if the socket is already bound, -EADDRINUSE if the name is
 * taken, -ENOMEM, or an error from the interruptible mutex acquisition.
 */
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
			      int addr_len)
{
	unsigned int new_hash, old_hash = sk->sk_hash;
	struct unix_sock *u = unix_sk(sk);
	struct net *net = sock_net(sk);
	struct unix_address *addr;
	int err;

	addr = unix_create_addr(sunaddr, addr_len);
	if (!addr)
		return -ENOMEM;

	/* bindlock serializes against concurrent bind()/autobind. */
	err = mutex_lock_interruptible(&u->bindlock);
	if (err)
		goto out;

	if (u->addr) {
		err = -EINVAL;
		goto out_mutex;
	}

	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
	unix_table_double_lock(net, old_hash, new_hash);

	/* Name uniqueness check and rehash must be atomic under the
	 * double table lock.
	 */
	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
		goto out_spin;

	__unix_set_addr_hash(net, sk, addr, new_hash);
	unix_table_double_unlock(net, old_hash, new_hash);
	mutex_unlock(&u->bindlock);
	return 0;

out_spin:
	unix_table_double_unlock(net, old_hash, new_hash);
	err = -EADDRINUSE;
out_mutex:
	mutex_unlock(&u->bindlock);
out:
	unix_release_addr(addr);
	return err;
}
1297 
1298 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1299 {
1300 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1301 	struct sock *sk = sock->sk;
1302 	int err;
1303 
1304 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1305 	    sunaddr->sun_family == AF_UNIX)
1306 		return unix_autobind(sk);
1307 
1308 	err = unix_validate_addr(sunaddr, addr_len);
1309 	if (err)
1310 		return err;
1311 
1312 	if (sunaddr->sun_path[0])
1313 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1314 	else
1315 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1316 
1317 	return err;
1318 }
1319 
/* Take the unix state locks of two sockets.  For two distinct sockets
 * the locks are always acquired in ascending pointer order so that
 * concurrent double-locks cannot deadlock; the second acquisition uses
 * the _nested annotation for lockdep.  @sk2 may be NULL or equal to
 * @sk1, in which case only @sk1 is locked.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}
1334 
/* Release the locks taken by unix_state_double_lock(); unlock order
 * does not matter, only @sk1 is unlocked when the pair was degenerate.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
1344 
/* connect(2) for datagram/seqpacket sockets: set (or, for AF_UNSPEC,
 * clear) the default peer of @sk.  Looks up the target, locks both
 * sockets, checks send permission, and swaps the peer pointer,
 * disconnecting any previous peer.  Returns 0 on success or a
 * negative errno.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *sk = sock->sk;
	struct sock *other;
	int err;

	err = -EINVAL;
	if (alen < offsetofend(struct sockaddr, sa_family))
		goto out;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_validate_addr(sunaddr, alen);
		if (err)
			goto out;

		/* SOCK_PASSCRED requires a bound source address. */
		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr) {
			err = unix_autobind(sk);
			if (err)
				goto out;
		}

restart:
		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			goto out;
		}

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);

		unix_peer(sk) = other;
		if (!other)
			sk->sk_state = TCP_CLOSE;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		/* drop the peer reference previously held by unix_peer(sk) */
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}

	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1431 
/* Sleep (up to @timeo jiffies) on @other's peer_wait queue until its
 * receive queue may have drained.  Called with other's state lock held;
 * the lock is released before sleeping (hence the __releases
 * annotation) and is NOT re-taken.  Returns the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
	__releases(&unix_sk(other)->lock)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* Only worth sleeping if the peer is alive, not shut down for
	 * reading, and its queue is actually full.
	 */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1453 
/* connect(2) for stream/seqpacket sockets.  Allocates the server-side
 * sock and a notification skb up front, finds the listener, waits for
 * backlog space if needed, then links the two ends and queues the skb
 * (carrying the new sock) on the listener for accept(2) to pick up.
 * Returns 0 on success or a negative errno.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	/* SOCK_PASSCRED requires a bound source address. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		/* Listener backlog is full: fail for non-blocking
		 * connects, otherwise wait and retry the lookup.
		 */
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1646 
1647 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1648 {
1649 	struct sock *ska = socka->sk, *skb = sockb->sk;
1650 
1651 	/* Join our sockets back to back */
1652 	sock_hold(ska);
1653 	sock_hold(skb);
1654 	unix_peer(ska) = skb;
1655 	unix_peer(skb) = ska;
1656 	init_peercred(ska);
1657 	init_peercred(skb);
1658 
1659 	ska->sk_state = TCP_ESTABLISHED;
1660 	skb->sk_state = TCP_ESTABLISHED;
1661 	socka->state  = SS_CONNECTED;
1662 	sockb->state  = SS_CONNECTED;
1663 	return 0;
1664 }
1665 
1666 static void unix_sock_inherit_flags(const struct socket *old,
1667 				    struct socket *new)
1668 {
1669 	if (test_bit(SOCK_PASSCRED, &old->flags))
1670 		set_bit(SOCK_PASSCRED, &new->flags);
1671 	if (test_bit(SOCK_PASSSEC, &old->flags))
1672 		set_bit(SOCK_PASSSEC, &new->flags);
1673 }
1674 
/* accept(2): dequeue one connection-announce skb from the listening
 * socket's receive queue (the embryo sock travels as skb->sk, queued
 * by unix_stream_connect()) and graft it onto @newsock.  Returns 0 on
 * success or a negative errno.
 */
static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
				&err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* A backlog slot freed up: wake a waiter in unix_wait_for_peer(). */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
1719 
1720 
1721 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1722 {
1723 	struct sock *sk = sock->sk;
1724 	struct unix_address *addr;
1725 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1726 	int err = 0;
1727 
1728 	if (peer) {
1729 		sk = unix_peer_get(sk);
1730 
1731 		err = -ENOTCONN;
1732 		if (!sk)
1733 			goto out;
1734 		err = 0;
1735 	} else {
1736 		sock_hold(sk);
1737 	}
1738 
1739 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1740 	if (!addr) {
1741 		sunaddr->sun_family = AF_UNIX;
1742 		sunaddr->sun_path[0] = 0;
1743 		err = offsetof(struct sockaddr_un, sun_path);
1744 	} else {
1745 		err = addr->len;
1746 		memcpy(sunaddr, addr->name, addr->len);
1747 	}
1748 	sock_put(sk);
1749 out:
1750 	return err;
1751 }
1752 
/* MSG_PEEK fd passing: duplicate the skb's attached fd list into @scm
 * without dequeuing the skb, then serialize with the unix garbage
 * collector (see the detailed comment below for why the empty
 * lock/unlock pair is required).
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->fp = scm_fp_dup(UNIXCB(skb).fp);

	/*
	 * Garbage collection of unix sockets starts by selecting a set of
	 * candidate sockets which have reference only from being in flight
	 * (total_refs == inflight_refs).  This condition is checked once during
	 * the candidate collection phase, and candidates are marked as such, so
	 * that non-candidates can later be ignored.  While inflight_refs is
	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
	 * is an instantaneous decision.
	 *
	 * Once a candidate, however, the socket must not be reinstalled into a
	 * file descriptor while the garbage collection is in progress.
	 *
	 * If the above conditions are met, then the directed graph of
	 * candidates (*) does not change while unix_gc_lock is held.
	 *
	 * Any operations that changes the file count through file descriptors
	 * (dup, close, sendmsg) does not change the graph since candidates are
	 * not installed in fds.
	 *
	 * Dequeing a candidate via recvmsg would install it into an fd, but
	 * that takes unix_gc_lock to decrement the inflight count, so it's
	 * serialized with garbage collection.
	 *
	 * MSG_PEEK is special in that it does not change the inflight count,
	 * yet does install the socket into an fd.  The following lock/unlock
	 * pair is to ensure serialization with garbage collection.  It must be
	 * done between incrementing the file count and installing the file into
	 * an fd.
	 *
	 * If garbage collection starts after the barrier provided by the
	 * lock/unlock, then it will see the elevated refcount and not mark this
	 * as a candidate.  If a garbage collection is already in progress
	 * before the file count was incremented, then the lock/unlock pair will
	 * ensure that garbage collection is finished before progressing to
	 * installing the fd.
	 *
	 * (*) A -> B where B is on the queue of A or B is on the queue of C
	 * which is on the queue of listening socket A.
	 */
	spin_lock(&unix_gc_lock);
	spin_unlock(&unix_gc_lock);
}
1799 
1800 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1801 {
1802 	int err = 0;
1803 
1804 	UNIXCB(skb).pid  = get_pid(scm->pid);
1805 	UNIXCB(skb).uid = scm->creds.uid;
1806 	UNIXCB(skb).gid = scm->creds.gid;
1807 	UNIXCB(skb).fp = NULL;
1808 	unix_get_secdata(scm, skb);
1809 	if (scm->fp && send_fds)
1810 		err = unix_attach_fds(scm, skb);
1811 
1812 	skb->destructor = unix_destruct_scm;
1813 	return err;
1814 }
1815 
1816 static bool unix_passcred_enabled(const struct socket *sock,
1817 				  const struct sock *other)
1818 {
1819 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1820 	       !other->sk_socket ||
1821 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1822 }
1823 
1824 /*
1825  * Some apps rely on write() giving SCM_CREDENTIALS
1826  * We include credentials if source or destination socket
1827  * asserted SOCK_PASSCRED.
1828  */
1829 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1830 			    const struct sock *other)
1831 {
1832 	if (UNIXCB(skb).pid)
1833 		return;
1834 	if (unix_passcred_enabled(sock, other)) {
1835 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1836 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1837 	}
1838 }
1839 
1840 static int maybe_init_creds(struct scm_cookie *scm,
1841 			    struct socket *socket,
1842 			    const struct sock *other)
1843 {
1844 	int err;
1845 	struct msghdr msg = { .msg_controllen = 0 };
1846 
1847 	err = scm_send(socket, &msg, scm, false);
1848 	if (err)
1849 		return err;
1850 
1851 	if (unix_passcred_enabled(socket, other)) {
1852 		scm->pid = get_pid(task_tgid(current));
1853 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1854 	}
1855 	return err;
1856 }
1857 
1858 static bool unix_skb_scm_eq(struct sk_buff *skb,
1859 			    struct scm_cookie *scm)
1860 {
1861 	return UNIXCB(skb).pid == scm->pid &&
1862 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1863 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1864 	       unix_secdata_eq(scm, skb);
1865 }
1866 
1867 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1868 {
1869 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1870 	struct unix_sock *u = unix_sk(sk);
1871 
1872 	if (unlikely(fp && fp->count))
1873 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1874 }
1875 
1876 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1877 {
1878 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1879 	struct unix_sock *u = unix_sk(sk);
1880 
1881 	if (unlikely(fp && fp->count))
1882 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1883 }
1884 
1885 /*
1886  *	Send AF_UNIX data.
1887  */
1888 
/* sendmsg(2) for datagram sockets.  Resolves the destination (explicit
 * address or connected peer), builds one skb for the whole datagram,
 * and queues it on the receiver, blocking when the receiver's queue is
 * full unless MSG_DONTWAIT.  Returns the number of bytes sent or a
 * negative errno.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			      size_t len)
{
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *sk = sock->sk, *other = NULL;
	struct unix_sock *u = unix_sk(sk);
	struct scm_cookie scm;
	struct sk_buff *skb;
	int data_len = 0;
	int sk_locked;
	long timeo;
	int err;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_validate_addr(sunaddr, msg->msg_namelen);
		if (err)
			goto out;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* SOCK_PASSCRED requires a bound source address. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		/* Put the tail of a large datagram into page frags. */
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(&scm, skb, true);
	if (err < 0)
		goto out_free;

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
					sk->sk_type);
		if (IS_ERR(other)) {
			err = PTR_ERR(other);
			other = NULL;
			goto out_free;
		}
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			/* Our connected peer died: disconnect and report
			 * ECONNREFUSED.
			 */
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			sk->sk_state = TCP_CLOSE;
			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk &&
	    unix_recvq_full_lockless(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		/* Non-blocking send to a full queue: take both state
		 * locks and arm the peer-wake mechanism so that poll()
		 * can report writability later.
		 */
		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(&scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(&scm);
	return err;
}
2092 
2093 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2094  * bytes, and a minimum of a full page.
2095  */
2096 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2097 
2098 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Queue a single out-of-band byte on @other.  The skb is queued on the
 * regular receive queue AND referenced from ousk->oob_skb (hence the
 * extra skb_get()); any previously pending OOB byte is dropped.
 * Returns 0 on success, -EPIPE if the peer is dead or shut down for
 * reading, or an allocation/copy error.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
{
	struct unix_sock *ousk = unix_sk(other);
	struct sk_buff *skb;
	int err = 0;

	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

	if (!skb)
		return err;

	skb_put(skb, 1);
	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

	if (err) {
		kfree_skb(skb);
		return err;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    (other->sk_shutdown & RCV_SHUTDOWN)) {
		unix_state_unlock(other);
		kfree_skb(skb);
		return -EPIPE;
	}

	maybe_add_creds(skb, sock, other);
	/* Second reference, held via ousk->oob_skb. */
	skb_get(skb);

	/* A not-yet-consumed OOB byte is replaced by the new one. */
	if (ousk->oob_skb)
		consume_skb(ousk->oob_skb);

	WRITE_ONCE(ousk->oob_skb, skb);

	scm_stat_add(other, skb);
	skb_queue_tail(&other->sk_receive_queue, skb);
	sk_send_sigurg(other);
	unix_state_unlock(other);
	other->sk_data_ready(other);

	return err;
}
2143 #endif
2144 
/* Send data on a connected SOCK_STREAM socket.
 *
 * The payload is chopped into skbs of at most half the send buffer and
 * each skb is queued directly onto the peer's receive queue.  SCM
 * rights/credentials are attached to the first skb only.  With MSG_OOB
 * and CONFIG_AF_UNIX_OOB, the final byte is delivered out-of-band via
 * queue_oob() after the in-band data.
 *
 * Returns the number of bytes sent (possibly fewer than @len), or a
 * negative errno when nothing was sent.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie scm;
	bool fds_sent = false;
	int data_len;

	wait_for_unix_gc();
	err = scm_send(sock, msg, &scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		/* Hold back the last byte; it is sent via queue_oob() below. */
		if (len)
			len--;
		else
#endif
			goto out_err;
	}

	if (msg->msg_namelen) {
		/* Stream sockets do not accept a destination address. */
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(&scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		/* The peer may have died or shut down reception meanwhile. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		scm_stat_add(other, skb);
		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (msg->msg_flags & MSG_OOB) {
		err = queue_oob(sock, msg, other);
		if (err)
			goto out_err;
		sent++;
	}
#endif

	scm_destroy(&scm);

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	/* Writing to a closed peer raises SIGPIPE unless suppressed. */
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(&scm);
	return sent ? : err;
}
2259 
/* ->sendpage() for SOCK_STREAM sockets: append a page fragment to the
 * peer's receive queue, coalescing into the tail skb when it carries
 * the same SCM state, otherwise queueing a fresh zero-length skb that
 * holds the fragment.
 *
 * NOTE: the "if (false) { alloc_skb: ... }" construct is a backward
 * jump target: when an allocation turns out to be needed, both locks
 * are dropped, the skb is allocated, and the lock/validate sequence
 * below is re-run from the top.
 */
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
				    int offset, size_t size, int flags)
{
	int err;
	bool send_sigpipe = false;
	bool init_scm = true;
	struct scm_cookie scm;
	struct sock *other, *sk = socket->sk;
	struct sk_buff *skb, *newskb = NULL, *tail = NULL;

	if (flags & MSG_OOB)
		return -EOPNOTSUPP;

	other = unix_peer(sk);
	if (!other || sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (false) {
alloc_skb:
		/* Drop both locks before a possibly sleeping allocation. */
		unix_state_unlock(other);
		mutex_unlock(&unix_sk(other)->iolock);
		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
					      &err, 0);
		if (!newskb)
			goto err;
	}

	/* we must acquire iolock as we modify already present
	 * skbs in the sk_receive_queue and mess with skb->len
	 */
	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
	if (err) {
		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
		goto err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_unlock;
	}

	unix_state_lock(other);

	if (sock_flag(other, SOCK_DEAD) ||
	    other->sk_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		send_sigpipe = true;
		goto err_state_unlock;
	}

	/* First pass only: snapshot our credentials for comparison. */
	if (init_scm) {
		err = maybe_init_creds(&scm, socket, other);
		if (err)
			goto err_state_unlock;
		init_scm = false;
	}

	/* Merge into the tail skb unless its SCM state differs. */
	skb = skb_peek_tail(&other->sk_receive_queue);
	if (tail && tail == skb) {
		skb = newskb;
	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
		if (newskb) {
			skb = newskb;
		} else {
			tail = skb;
			goto alloc_skb;
		}
	} else if (newskb) {
		/* this is fast path, we don't necessarily need to
		 * call to kfree_skb even though with newskb == NULL
		 * this - does no harm
		 */
		consume_skb(newskb);
		newskb = NULL;
	}

	if (skb_append_pagefrags(skb, page, offset, size)) {
		/* Fragment list full: retry with a freshly allocated skb. */
		tail = skb;
		goto alloc_skb;
	}

	skb->len += size;
	skb->data_len += size;
	skb->truesize += size;
	refcount_add(size, &sk->sk_wmem_alloc);

	if (newskb) {
		err = unix_scm_to_skb(&scm, skb, false);
		if (err)
			goto err_state_unlock;
		spin_lock(&other->sk_receive_queue.lock);
		__skb_queue_tail(&other->sk_receive_queue, newskb);
		spin_unlock(&other->sk_receive_queue.lock);
	}

	unix_state_unlock(other);
	mutex_unlock(&unix_sk(other)->iolock);

	other->sk_data_ready(other);
	scm_destroy(&scm);
	return size;

err_state_unlock:
	unix_state_unlock(other);
err_unlock:
	mutex_unlock(&unix_sk(other)->iolock);
err:
	kfree_skb(newskb);
	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	if (!init_scm)
		scm_destroy(&scm);
	return err;
}
2375 
2376 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2377 				  size_t len)
2378 {
2379 	int err;
2380 	struct sock *sk = sock->sk;
2381 
2382 	err = sock_error(sk);
2383 	if (err)
2384 		return err;
2385 
2386 	if (sk->sk_state != TCP_ESTABLISHED)
2387 		return -ENOTCONN;
2388 
2389 	if (msg->msg_namelen)
2390 		msg->msg_namelen = 0;
2391 
2392 	return unix_dgram_sendmsg(sock, msg, len);
2393 }
2394 
2395 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2396 				  size_t size, int flags)
2397 {
2398 	struct sock *sk = sock->sk;
2399 
2400 	if (sk->sk_state != TCP_ESTABLISHED)
2401 		return -ENOTCONN;
2402 
2403 	return unix_dgram_recvmsg(sock, msg, size, flags);
2404 }
2405 
2406 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2407 {
2408 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2409 
2410 	if (addr) {
2411 		msg->msg_namelen = addr->len;
2412 		memcpy(msg->msg_name, addr->name, addr->len);
2413 	}
2414 }
2415 
/* Core datagram receive path shared by SOCK_DGRAM and SOCK_SEQPACKET.
 *
 * Dequeues (or, with MSG_PEEK, peeks) one skb, copies up to @size
 * bytes into @msg and transfers any attached credentials/fds.  MSG_OOB
 * is not supported.  Returns bytes copied (or the full skb length with
 * MSG_TRUNC) or a negative errno.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			/* Leave the loop with iolock still held. */
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* A queue slot freed up: wake any sender blocked on our queue. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2522 
/* ->recvmsg() for SOCK_DGRAM: defers to a BPF-installed proto when the
 * socket's sk_prot has been replaced (sockmap), otherwise runs the
 * native datagram receive path.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}
2536 
2537 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2538 {
2539 	int copied = 0;
2540 
2541 	while (1) {
2542 		struct unix_sock *u = unix_sk(sk);
2543 		struct sk_buff *skb;
2544 		int used, err;
2545 
2546 		mutex_lock(&u->iolock);
2547 		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2548 		mutex_unlock(&u->iolock);
2549 		if (!skb)
2550 			return err;
2551 
2552 		used = recv_actor(sk, skb);
2553 		if (used <= 0) {
2554 			if (!copied)
2555 				copied = used;
2556 			kfree_skb(skb);
2557 			break;
2558 		} else if (used <= skb->len) {
2559 			copied += used;
2560 		}
2561 
2562 		kfree_skb(skb);
2563 		break;
2564 	}
2565 
2566 	return copied;
2567 }
2568 
2569 /*
2570  *	Sleep until more data has arrived. But check for races..
2571  */
2572 static long unix_stream_data_wait(struct sock *sk, long timeo,
2573 				  struct sk_buff *last, unsigned int last_len,
2574 				  bool freezable)
2575 {
2576 	struct sk_buff *tail;
2577 	DEFINE_WAIT(wait);
2578 
2579 	unix_state_lock(sk);
2580 
2581 	for (;;) {
2582 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2583 
2584 		tail = skb_peek_tail(&sk->sk_receive_queue);
2585 		if (tail != last ||
2586 		    (tail && tail->len != last_len) ||
2587 		    sk->sk_err ||
2588 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2589 		    signal_pending(current) ||
2590 		    !timeo)
2591 			break;
2592 
2593 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2594 		unix_state_unlock(sk);
2595 		if (freezable)
2596 			timeo = freezable_schedule_timeout(timeo);
2597 		else
2598 			timeo = schedule_timeout(timeo);
2599 		unix_state_lock(sk);
2600 
2601 		if (sock_flag(sk, SOCK_DEAD))
2602 			break;
2603 
2604 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2605 	}
2606 
2607 	finish_wait(sk_sleep(sk), &wait);
2608 	unix_state_unlock(sk);
2609 	return timeo;
2610 }
2611 
2612 static unsigned int unix_skb_len(const struct sk_buff *skb)
2613 {
2614 	return skb->len - UNIXCB(skb).consumed;
2615 }
2616 
/* Parameters shared by the generic stream reader and its callers
 * (recvmsg and splice_read differ only in the recv_actor and target).
 */
struct unix_stream_read_state {
	/* Per-chunk copy callback: (skb, skip, chunk, state); returns the
	 * number of bytes delivered or a negative error.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* recvmsg() destination; NULL for splice */
	struct pipe_inode_info *pipe;	/* splice destination pipe */
	size_t size;			/* total bytes requested */
	int flags;			/* MSG_* receive flags */
	unsigned int splice_flags;	/* SPLICE_F_* flags */
};
2627 
2628 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* Read the pending out-of-band byte (MSG_OOB receive path).
 *
 * Fails with -EINVAL when SO_OOBINLINE is set or no OOB byte is
 * pending.  With MSG_PEEK the OOB skb stays referenced by u->oob_skb
 * for a later read.  Returns 1 on success.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;
	struct sk_buff *oob_skb;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK))
		WRITE_ONCE(u->oob_skb, NULL);

	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK)) {
		/* Mark the byte consumed and drop the oob_skb reference
		 * taken when it was queued.
		 */
		UNIXCB(oob_skb).consumed += 1;
		kfree_skb(oob_skb);
	}

	mutex_unlock(&u->iolock);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
2668 
/* Decide how the skb at the head of the receive queue interacts with a
 * pending OOB byte during a normal (non-MSG_OOB) stream read.
 *
 * Returns the skb to read from, or NULL when the caller must re-peek
 * the queue or stop (in-band data never crosses the OOB mark once some
 * bytes were copied).  Called with unix_state_lock held.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		/* Fully consumed skb: unlink it; caller re-peeks the queue. */
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		if (skb == u->oob_skb) {
			if (copied) {
				/* Stop in-band data short of the OOB mark. */
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				if (!(flags & MSG_PEEK)) {
					/* SO_OOBINLINE: deliver the byte
					 * in-band; forget the mark.
					 */
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (!(flags & MSG_PEEK)) {
				/* Skip past the OOB byte; u->oob_skb keeps
				 * its own reference to the skb.
				 */
				skb_unlink(skb, &sk->sk_receive_queue);
				consume_skb(skb);
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}
	}
	return skb;
}
2696 #endif
2697 
2698 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2699 {
2700 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2701 		return -ENOTCONN;
2702 
2703 	return unix_read_skb(sk, recv_actor);
2704 }
2705 
/* Generic SOCK_STREAM/SOCK_SEQPACKET receive loop, shared by recvmsg()
 * and splice_read() through state->recv_actor.
 *
 * Consumes queued skbs until @state->size bytes are delivered, the
 * rcvlowat target is met, credentials change between skbs, or attached
 * fds force an early stop.  @freezable selects a freezer-friendly
 * sleep.  Returns bytes delivered or a negative errno.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Drop iolock so writers can progress while we
			 * sleep for more data.
			 */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Honour the peek offset: step over skbs already seen. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			/* NOTE(review): sunaddr is dead - declared and then
			 * cleared without ever being read; candidate for
			 * removal.
			 */
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Extra hold so a concurrent reader freeing the skb cannot
		 * invalidate it underneath the actor.
		 */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Received fds must be delivered now; stop here. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2919 
2920 static int unix_stream_read_actor(struct sk_buff *skb,
2921 				  int skip, int chunk,
2922 				  struct unix_stream_read_state *state)
2923 {
2924 	int ret;
2925 
2926 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2927 				    state->msg, chunk);
2928 	return ret ?: chunk;
2929 }
2930 
2931 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2932 			  size_t size, int flags)
2933 {
2934 	struct unix_stream_read_state state = {
2935 		.recv_actor = unix_stream_read_actor,
2936 		.socket = sk->sk_socket,
2937 		.msg = msg,
2938 		.size = size,
2939 		.flags = flags
2940 	};
2941 
2942 	return unix_stream_read_generic(&state, true);
2943 }
2944 
/* ->recvmsg() for SOCK_STREAM: defers to a BPF-installed proto when the
 * socket's sk_prot has been replaced (sockmap), otherwise runs the
 * generic stream reader.
 */
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}
2965 
2966 static int unix_stream_splice_actor(struct sk_buff *skb,
2967 				    int skip, int chunk,
2968 				    struct unix_stream_read_state *state)
2969 {
2970 	return skb_splice_bits(skb, state->socket->sk,
2971 			       UNIXCB(skb).consumed + skip,
2972 			       state->pipe, chunk, state->splice_flags);
2973 }
2974 
2975 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2976 				       struct pipe_inode_info *pipe,
2977 				       size_t size, unsigned int flags)
2978 {
2979 	struct unix_stream_read_state state = {
2980 		.recv_actor = unix_stream_splice_actor,
2981 		.socket = sock,
2982 		.pipe = pipe,
2983 		.size = size,
2984 		.splice_flags = flags,
2985 	};
2986 
2987 	if (unlikely(*ppos))
2988 		return -ESPIPE;
2989 
2990 	if (sock->file->f_flags & O_NONBLOCK ||
2991 	    flags & SPLICE_F_NONBLOCK)
2992 		state.flags = MSG_DONTWAIT;
2993 
2994 	return unix_stream_read_generic(&state, false);
2995 }
2996 
/* ->shutdown(): close one or both directions and mirror the
 * complementary shutdown onto a connected stream/seqpacket peer.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		/* NOTE(review): presumably lets a replaced proto (e.g. BPF
		 * sockmap) detach the peer - confirm.
		 */
		if (prot->unhash)
			prot->unhash(other);
		/* Our RCV shutdown blocks the peer's sends and vice versa. */
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
3045 
3046 long unix_inq_len(struct sock *sk)
3047 {
3048 	struct sk_buff *skb;
3049 	long amount = 0;
3050 
3051 	if (sk->sk_state == TCP_LISTEN)
3052 		return -EINVAL;
3053 
3054 	spin_lock(&sk->sk_receive_queue.lock);
3055 	if (sk->sk_type == SOCK_STREAM ||
3056 	    sk->sk_type == SOCK_SEQPACKET) {
3057 		skb_queue_walk(&sk->sk_receive_queue, skb)
3058 			amount += unix_skb_len(skb);
3059 	} else {
3060 		skb = skb_peek(&sk->sk_receive_queue);
3061 		if (skb)
3062 			amount = skb->len;
3063 	}
3064 	spin_unlock(&sk->sk_receive_queue.lock);
3065 
3066 	return amount;
3067 }
3068 EXPORT_SYMBOL_GPL(unix_inq_len);
3069 
/* SIOCOUTQ helper: bytes queued for write, not yet consumed by peer. */
long unix_outq_len(struct sock *sk)
{
	long pending = sk_wmem_alloc_get(sk);

	return pending;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3075 
/* SIOCUNIXFILE: open the socket's backing filesystem object and return
 * a new O_PATH, O_CLOEXEC file descriptor for it.  Requires
 * CAP_NET_ADMIN in the socket's user namespace.
 */
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* No published address means the socket is unbound. */
	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;	/* abstract-namespace socket: no file */

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
3111 
/* ioctl() handler: queue occupancy queries (SIOCOUTQ/SIOCINQ),
 * SIOCUNIXFILE, and SIOCATMARK for the OOB mark.
 */
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct sk_buff *skb;
			int answ = 0;

			/* At the mark iff the OOB skb is at the queue head. */
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
				answ = 1;
			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
3152 
3153 #ifdef CONFIG_COMPAT
/* Compat ioctl: every supported command takes a pointer argument, so
 * translating the user pointer is all that is needed.
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
3158 #endif
3159 
/* ->poll() for stream/seqpacket sockets; all state reads are lockless. */
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= EPOLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* A pending OOB byte counts as urgent data. */
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
3200 
/* ->poll() for datagram sockets.  Unlike unix_poll(), writability also
 * depends on the peer's receive queue; a would-block writer registers
 * on the peer's wait queue via unix_dgram_peer_wake_me().
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		/* Not writable when the (not connected-back) peer's queue
		 * is full; arrange to be woken when space frees up.
		 */
		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
3260 
3261 #ifdef CONFIG_PROC_FS
3262 
3263 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3264 
3265 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3266 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3267 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3268 
3269 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3270 {
3271 	unsigned long offset = get_offset(*pos);
3272 	unsigned long bucket = get_bucket(*pos);
3273 	unsigned long count = 0;
3274 	struct sock *sk;
3275 
3276 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3277 	     sk; sk = sk_next(sk)) {
3278 		if (++count == offset)
3279 			break;
3280 	}
3281 
3282 	return sk;
3283 }
3284 
/* Find the first socket at or after *pos.
 *
 * On success the corresponding bucket spin lock is left HELD; it is
 * released by unix_get_next() or unix_seq_stop().
 */
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		/* Advance to offset 1 of the next bucket. */
		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}
3305 
/* Advance to the next socket; when the current bucket is exhausted its
 * lock is dropped and the search continues in the following buckets.
 * Called with the current bucket lock held.
 */
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;


	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}
3322 
3323 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3324 {
3325 	if (!*pos)
3326 		return SEQ_START_TOKEN;
3327 
3328 	return unix_get_first(seq, pos);
3329 }
3330 
3331 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3332 {
3333 	++*pos;
3334 
3335 	if (v == SEQ_START_TOKEN)
3336 		return unix_get_first(seq, pos);
3337 
3338 	return unix_get_next(seq, v, pos);
3339 }
3340 
3341 static void unix_seq_stop(struct seq_file *seq, void *v)
3342 {
3343 	struct sock *sk = v;
3344 
3345 	if (sk)
3346 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3347 }
3348 
/* Emit the header line or one socket line of /proc/net/unix. */
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	// under a hash table lock here
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				/* Pathname socket: drop the trailing NUL. */
				len--;
			} else {
				/* Abstract name: lead with '@', and below
				 * print embedded NULs as '@' too.
				 */
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
3394 
/* seq_file operations backing /proc/net/unix. */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3401 
3402 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Private state of the bpf unix socket iterator. Sockets are batched one
 * hash bucket at a time (with a reference held on each) so the BPF program
 * can run without the bucket spinlock held.
 */
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;		/* next batch index to hand to show() */
	unsigned int end_sk;		/* number of sockets in the batch */
	unsigned int max_sk;		/* allocated capacity of @batch */
	struct sock **batch;		/* refcounted sockets of current bucket */
	bool st_bucket_done;		/* whole bucket fit into the batch */
};
3411 
/* Context structure seen by BPF programs attached to the unix iterator. */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
3417 
3418 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3419 			      struct unix_sock *unix_sk, uid_t uid)
3420 {
3421 	struct bpf_iter__unix ctx;
3422 
3423 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3424 	ctx.meta = meta;
3425 	ctx.unix_sk = unix_sk;
3426 	ctx.uid = uid;
3427 	return bpf_iter_run_prog(prog, &ctx);
3428 }
3429 
3430 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3431 
3432 {
3433 	struct bpf_unix_iter_state *iter = seq->private;
3434 	unsigned int expected = 1;
3435 	struct sock *sk;
3436 
3437 	sock_hold(start_sk);
3438 	iter->batch[iter->end_sk++] = start_sk;
3439 
3440 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3441 		if (iter->end_sk < iter->max_sk) {
3442 			sock_hold(sk);
3443 			iter->batch[iter->end_sk++] = sk;
3444 		}
3445 
3446 		expected++;
3447 	}
3448 
3449 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3450 
3451 	return expected;
3452 }
3453 
3454 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3455 {
3456 	while (iter->cur_sk < iter->end_sk)
3457 		sock_put(iter->batch[iter->cur_sk++]);
3458 }
3459 
3460 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3461 				       unsigned int new_batch_sz)
3462 {
3463 	struct sock **new_batch;
3464 
3465 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3466 			     GFP_USER | __GFP_NOWARN);
3467 	if (!new_batch)
3468 		return -ENOMEM;
3469 
3470 	bpf_iter_unix_put_batch(iter);
3471 	kvfree(iter->batch);
3472 	iter->batch = new_batch;
3473 	iter->max_sk = new_batch_sz;
3474 
3475 	return 0;
3476 }
3477 
/* Refill iter->batch with the sockets of the bucket that *pos points into,
 * holding a reference on each so show() can run lock-free. If the batch
 * array turns out too small, it is grown once (to 1.5x the needed size)
 * and the bucket re-walked; a second shortfall returns a partial batch
 * with st_bucket_done left false.
 */
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	/* Previous bucket fully consumed: advance to the next bucket. */
	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);	/* returns with bucket lock held */
	if (!sk)
		return NULL; /* Done */

	/* Drops the bucket lock taken by unix_get_first(). */
	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	/* Batch was too small for the bucket; grow it and retry once. */
	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
3512 
3513 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3514 {
3515 	if (!*pos)
3516 		return SEQ_START_TOKEN;
3517 
3518 	/* bpf iter does not support lseek, so it always
3519 	 * continue from where it was stop()-ped.
3520 	 */
3521 	return bpf_iter_unix_batch(seq, pos);
3522 }
3523 
3524 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3525 {
3526 	struct bpf_unix_iter_state *iter = seq->private;
3527 	struct sock *sk;
3528 
3529 	/* Whenever seq_next() is called, the iter->cur_sk is
3530 	 * done with seq_show(), so advance to the next sk in
3531 	 * the batch.
3532 	 */
3533 	if (iter->cur_sk < iter->end_sk)
3534 		sock_put(iter->batch[iter->cur_sk++]);
3535 
3536 	++*pos;
3537 
3538 	if (iter->cur_sk < iter->end_sk)
3539 		sk = iter->batch[iter->cur_sk];
3540 	else
3541 		sk = bpf_iter_unix_batch(seq, pos);
3542 
3543 	return sk;
3544 }
3545 
/* Run the attached BPF program on one batched socket. The socket is
 * locked with lock_sock_fast() so the program sees a stable state;
 * sockets that became unhashed after batching are skipped.
 */
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	/* The sock may have been closed since it was batched. */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
3573 
3574 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3575 {
3576 	struct bpf_unix_iter_state *iter = seq->private;
3577 	struct bpf_iter_meta meta;
3578 	struct bpf_prog *prog;
3579 
3580 	if (!v) {
3581 		meta.seq = seq;
3582 		prog = bpf_iter_get_info(&meta, true);
3583 		if (prog)
3584 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3585 	}
3586 
3587 	if (iter->cur_sk < iter->end_sk)
3588 		bpf_iter_unix_put_batch(iter);
3589 }
3590 
/* seq_file operations driving the bpf unix socket iterator. */
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
3597 #endif
3598 #endif
3599 
/* socket(2) family registration for PF_UNIX. */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
3605 
3606 
/* Per-netns init: register sysctls, create /proc/net/unix, and allocate
 * the per-netns hash table (UNIX_HASH_SIZE lock and bucket arrays).
 * Unwinds in reverse order through the goto chain; any failure is
 * reported as -ENOMEM.
 */
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;	/* default dgram recv-queue limit */
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
3650 
/* Per-netns teardown: release everything set up by unix_net_init().
 * NOTE(review): buckets are freed without draining — presumably all
 * sockets of the netns are already gone by the time exit runs.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
3658 
/* Per-network-namespace init/exit hooks. */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
3663 
3664 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3665 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3666 		     struct unix_sock *unix_sk, uid_t uid)
3667 
3668 #define INIT_BATCH_SZ 16
3669 
3670 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3671 {
3672 	struct bpf_unix_iter_state *iter = priv_data;
3673 	int err;
3674 
3675 	err = bpf_iter_init_seq_net(priv_data, aux);
3676 	if (err)
3677 		return err;
3678 
3679 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3680 	if (err) {
3681 		bpf_iter_fini_seq_net(priv_data);
3682 		return err;
3683 	}
3684 
3685 	return 0;
3686 }
3687 
/* Tear down per-iterator private state: drop the netns reference and free
 * the batch array (seq_stop() has already put any batched sockets).
 */
static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3695 
/* Glue telling the bpf_iter core how to drive this seq_file. */
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};
3702 
3703 static const struct bpf_func_proto *
3704 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3705 			     const struct bpf_prog *prog)
3706 {
3707 	switch (func_id) {
3708 	case BPF_FUNC_setsockopt:
3709 		return &bpf_sk_setsockopt_proto;
3710 	case BPF_FUNC_getsockopt:
3711 		return &bpf_sk_getsockopt_proto;
3712 	default:
3713 		return NULL;
3714 	}
3715 }
3716 
/* Registration record for the "unix" bpf_iter target. The single ctx arg
 * (unix_sk) may be NULL on the final stop() invocation, hence
 * PTR_TO_BTF_ID_OR_NULL. Its btf_id is filled in at init time by
 * bpf_iter_register().
 */
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto         = bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};
3727 
/* Resolve the unix_sock BTF id and register the "unix" iterator target.
 * Registration failure is non-fatal: only a warning is logged.
 */
static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
3734 #endif
3735 
3736 static int __init af_unix_init(void)
3737 {
3738 	int i, rc = -1;
3739 
3740 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3741 
3742 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3743 		spin_lock_init(&bsd_socket_locks[i]);
3744 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3745 	}
3746 
3747 	rc = proto_register(&unix_dgram_proto, 1);
3748 	if (rc != 0) {
3749 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3750 		goto out;
3751 	}
3752 
3753 	rc = proto_register(&unix_stream_proto, 1);
3754 	if (rc != 0) {
3755 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3756 		goto out;
3757 	}
3758 
3759 	sock_register(&unix_family_ops);
3760 	register_pernet_subsys(&unix_net_ops);
3761 	unix_bpf_build_proto();
3762 
3763 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3764 	bpf_iter_register();
3765 #endif
3766 
3767 out:
3768 	return rc;
3769 }
3770 
/* Module unload: undo the registrations made in af_unix_init(). */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3778 
3779 /* Earlier than device_initcall() so that other drivers invoking
3780    request_module() don't end up in a loop when modprobe tries
3781    to use a UNIX socket. But later than subsys_initcall() because
3782    we depend on stuff initialised there */
3783 fs_initcall(af_unix_init);
3784 module_exit(af_unix_exit);
3785 
3786 MODULE_LICENSE("GPL");
3787 MODULE_ALIAS_NETPROTO(PF_UNIX);
3788