xref: /linux/net/unix/af_unix.c (revision d99ff463ecf651437e9e4abe68f331dfb6b5bd9d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with a zero byte, so that this name space does not
75  *		  intersect with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 
120 #include "scm.h"
121 
122 static atomic_long_t unix_nr_socks;
123 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
124 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 
126 /* SMP locking strategy:
127  *    hash table is protected with a spinlock.
128  *    each socket state is protected by a separate spinlock.
129  */
130 
131 static unsigned int unix_unbound_hash(struct sock *sk)
132 {
133 	unsigned long hash = (unsigned long)sk;
134 
135 	hash ^= hash >> 16;
136 	hash ^= hash >> 8;
137 	hash ^= sk->sk_type;
138 
139 	return hash & UNIX_HASH_MOD;
140 }
141 
142 static unsigned int unix_bsd_hash(struct inode *i)
143 {
144 	return i->i_ino & UNIX_HASH_MOD;
145 }
146 
147 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
148 				       int addr_len, int type)
149 {
150 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
151 	unsigned int hash;
152 
153 	hash = (__force unsigned int)csum_fold(csum);
154 	hash ^= hash >> 8;
155 	hash ^= type;
156 
157 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
158 }
159 
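/* A note on the hash layout implemented by the three helpers above:
 * unix_unbound_hash() and unix_bsd_hash() both map into
 * [0, UNIX_HASH_MOD] (the latter keyed by inode number), while
 * unix_abstract_hash() maps into [UNIX_HASH_MOD + 1, UNIX_HASH_MOD * 2 + 1],
 * so abstract names never share a bucket with unbound or pathname
 * sockets.  Pathname sockets are additionally linked into the global
 * bsd_socket_buckets so unix_find_socket_byinode() can look them up.
 */
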
160 static void unix_table_double_lock(struct net *net,
161 				   unsigned int hash1, unsigned int hash2)
162 {
163 	if (hash1 == hash2) {
164 		spin_lock(&net->unx.table.locks[hash1]);
165 		return;
166 	}
167 
168 	if (hash1 > hash2)
169 		swap(hash1, hash2);
170 
171 	spin_lock(&net->unx.table.locks[hash1]);
172 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
173 }
174 
175 static void unix_table_double_unlock(struct net *net,
176 				     unsigned int hash1, unsigned int hash2)
177 {
178 	if (hash1 == hash2) {
179 		spin_unlock(&net->unx.table.locks[hash1]);
180 		return;
181 	}
182 
183 	spin_unlock(&net->unx.table.locks[hash1]);
184 	spin_unlock(&net->unx.table.locks[hash2]);
185 }
186 
187 #ifdef CONFIG_SECURITY_NETWORK
188 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 {
190 	UNIXCB(skb).secid = scm->secid;
191 }
192 
193 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 {
195 	scm->secid = UNIXCB(skb).secid;
196 }
197 
198 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 {
200 	return (scm->secid == UNIXCB(skb).secid);
201 }
202 #else
203 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
204 { }
205 
206 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
207 { }
208 
209 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
210 {
211 	return true;
212 }
213 #endif /* CONFIG_SECURITY_NETWORK */
214 
215 #define unix_peer(sk) (unix_sk(sk)->peer)
216 
217 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
218 {
219 	return unix_peer(osk) == sk;
220 }
221 
222 static inline int unix_may_send(struct sock *sk, struct sock *osk)
223 {
224 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
225 }
226 
227 static inline int unix_recvq_full(const struct sock *sk)
228 {
229 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
230 }
231 
232 static inline int unix_recvq_full_lockless(const struct sock *sk)
233 {
234 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
235 		READ_ONCE(sk->sk_max_ack_backlog);
236 }
237 
238 struct sock *unix_peer_get(struct sock *s)
239 {
240 	struct sock *peer;
241 
242 	unix_state_lock(s);
243 	peer = unix_peer(s);
244 	if (peer)
245 		sock_hold(peer);
246 	unix_state_unlock(s);
247 	return peer;
248 }
249 EXPORT_SYMBOL_GPL(unix_peer_get);
250 
251 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
252 					     int addr_len)
253 {
254 	struct unix_address *addr;
255 
256 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
257 	if (!addr)
258 		return NULL;
259 
260 	refcount_set(&addr->refcnt, 1);
261 	addr->len = addr_len;
262 	memcpy(addr->name, sunaddr, addr_len);
263 
264 	return addr;
265 }
266 
267 static inline void unix_release_addr(struct unix_address *addr)
268 {
269 	if (refcount_dec_and_test(&addr->refcnt))
270 		kfree(addr);
271 }
272 
273 /*
274  *	Check unix socket name:
275  *		- should not be zero length.
276  *		- if it does not start with a zero byte, it should be NULL terminated (FS object)
277  *		- if it starts with a zero byte, it is an abstract name.
278  */
279 
280 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
281 {
282 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
283 	    addr_len > sizeof(*sunaddr))
284 		return -EINVAL;
285 
286 	if (sunaddr->sun_family != AF_UNIX)
287 		return -EINVAL;
288 
289 	return 0;
290 }
291 
292 static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
293 {
294 	/* This may look like an off by one error but it is a bit more
295 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
296 	 * sun_path[108] doesn't as such exist.  However in kernel space
297 	 * we are guaranteed that it is a valid memory location in our
298 	 * kernel address buffer because syscall functions always pass
299 	 * a pointer to a struct sockaddr_storage which has a bigger buffer
300 	 * than 108.
301 	 */
302 	((char *)sunaddr)[addr_len] = 0;
303 }
304 
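/* Illustrative user-space view of the two address flavours handled by
 * unix_validate_addr()/unix_mkname_bsd() above (a sketch only, not part
 * of the kernel build; "fd" and the path/name are made up for the example):
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	// pathname socket: NUL-terminated path in sun_path
 *	strcpy(a.sun_path, "/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + strlen(a.sun_path) + 1);
 *
 *	// abstract socket: leading NUL byte, name is NOT NUL-terminated
 *	memcpy(a.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */
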
305 static void __unix_remove_socket(struct sock *sk)
306 {
307 	sk_del_node_init(sk);
308 }
309 
310 static void __unix_insert_socket(struct net *net, struct sock *sk)
311 {
312 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
313 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
314 }
315 
316 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
317 				 struct unix_address *addr, unsigned int hash)
318 {
319 	__unix_remove_socket(sk);
320 	smp_store_release(&unix_sk(sk)->addr, addr);
321 
322 	sk->sk_hash = hash;
323 	__unix_insert_socket(net, sk);
324 }
325 
326 static void unix_remove_socket(struct net *net, struct sock *sk)
327 {
328 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
329 	__unix_remove_socket(sk);
330 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
331 }
332 
333 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
334 {
335 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
336 	__unix_insert_socket(net, sk);
337 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
338 }
339 
340 static void unix_insert_bsd_socket(struct sock *sk)
341 {
342 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
343 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
344 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
345 }
346 
347 static void unix_remove_bsd_socket(struct sock *sk)
348 {
349 	if (!hlist_unhashed(&sk->sk_bind_node)) {
350 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
351 		__sk_del_bind_node(sk);
352 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
353 
354 		sk_node_init(&sk->sk_bind_node);
355 	}
356 }
357 
358 static struct sock *__unix_find_socket_byname(struct net *net,
359 					      struct sockaddr_un *sunname,
360 					      int len, unsigned int hash)
361 {
362 	struct sock *s;
363 
364 	sk_for_each(s, &net->unx.table.buckets[hash]) {
365 		struct unix_sock *u = unix_sk(s);
366 
367 		if (u->addr->len == len &&
368 		    !memcmp(u->addr->name, sunname, len))
369 			return s;
370 	}
371 	return NULL;
372 }
373 
374 static inline struct sock *unix_find_socket_byname(struct net *net,
375 						   struct sockaddr_un *sunname,
376 						   int len, unsigned int hash)
377 {
378 	struct sock *s;
379 
380 	spin_lock(&net->unx.table.locks[hash]);
381 	s = __unix_find_socket_byname(net, sunname, len, hash);
382 	if (s)
383 		sock_hold(s);
384 	spin_unlock(&net->unx.table.locks[hash]);
385 	return s;
386 }
387 
388 static struct sock *unix_find_socket_byinode(struct inode *i)
389 {
390 	unsigned int hash = unix_bsd_hash(i);
391 	struct sock *s;
392 
393 	spin_lock(&bsd_socket_locks[hash]);
394 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
395 		struct dentry *dentry = unix_sk(s)->path.dentry;
396 
397 		if (dentry && d_backing_inode(dentry) == i) {
398 			sock_hold(s);
399 			spin_unlock(&bsd_socket_locks[hash]);
400 			return s;
401 		}
402 	}
403 	spin_unlock(&bsd_socket_locks[hash]);
404 	return NULL;
405 }
406 
407 /* Support code for asymmetrically connected dgram sockets
408  *
409  * If a datagram socket is connected to a socket not itself connected
410  * to the first socket (eg, /dev/log), clients may only enqueue more
411  * messages if the present receive queue of the server socket is not
412  * "too large". This means there's a second writability condition
413  * poll and sendmsg need to test. The dgram recv code will do a wake
414  * up on the peer_wait wait queue of a socket upon reception of a
415  * datagram which needs to be propagated to sleeping would-be writers
416  * since these might not have sent anything so far. This can't be
417  * accomplished via poll_wait because the lifetime of the server
418  * socket might be less than that of its clients if these break their
419  * association with it or if the server socket is closed while clients
420  * are still connected to it and there's no way to inform "a polling
421  * implementation" that it should let go of a certain wait queue.
422  *
423  * In order to propagate a wake up, a wait_queue_entry_t of the client
424  * socket is enqueued on the peer_wait queue of the server socket
425  * whose wake function does a wake_up on the ordinary client socket
426  * wait queue. This connection is established whenever a write (or
427  * poll for write) hits the flow control condition and is broken when the
428  * association to the server socket is dissolved or after a wake up
429  * was relayed.
430  */
431 
432 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
433 				      void *key)
434 {
435 	struct unix_sock *u;
436 	wait_queue_head_t *u_sleep;
437 
438 	u = container_of(q, struct unix_sock, peer_wake);
439 
440 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
441 			    q);
442 	u->peer_wake.private = NULL;
443 
444 	/* relaying can only happen while the wq still exists */
445 	u_sleep = sk_sleep(&u->sk);
446 	if (u_sleep)
447 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
448 
449 	return 0;
450 }
451 
452 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
453 {
454 	struct unix_sock *u, *u_other;
455 	int rc;
456 
457 	u = unix_sk(sk);
458 	u_other = unix_sk(other);
459 	rc = 0;
460 	spin_lock(&u_other->peer_wait.lock);
461 
462 	if (!u->peer_wake.private) {
463 		u->peer_wake.private = other;
464 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
465 
466 		rc = 1;
467 	}
468 
469 	spin_unlock(&u_other->peer_wait.lock);
470 	return rc;
471 }
472 
473 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
474 					    struct sock *other)
475 {
476 	struct unix_sock *u, *u_other;
477 
478 	u = unix_sk(sk);
479 	u_other = unix_sk(other);
480 	spin_lock(&u_other->peer_wait.lock);
481 
482 	if (u->peer_wake.private == other) {
483 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
484 		u->peer_wake.private = NULL;
485 	}
486 
487 	spin_unlock(&u_other->peer_wait.lock);
488 }
489 
490 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
491 						   struct sock *other)
492 {
493 	unix_dgram_peer_wake_disconnect(sk, other);
494 	wake_up_interruptible_poll(sk_sleep(sk),
495 				   EPOLLOUT |
496 				   EPOLLWRNORM |
497 				   EPOLLWRBAND);
498 }
499 
500 /* preconditions:
501  *	- unix_peer(sk) == other
502  *	- association is stable
503  */
504 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
505 {
506 	int connected;
507 
508 	connected = unix_dgram_peer_wake_connect(sk, other);
509 
510 	/* If other is SOCK_DEAD, we want to make sure we signal
511 	 * POLLOUT, such that a subsequent write() can get a
512 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
513 	 * to other and it's full, we will hang waiting for POLLOUT.
514 	 */
515 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
516 		return 1;
517 
518 	if (connected)
519 		unix_dgram_peer_wake_disconnect(sk, other);
520 
521 	return 0;
522 }
523 
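/* A user-space sketch of what the relay machinery above buys us
 * (illustrative only; "fd", "buf" and "len" are placeholders): a
 * connected SOCK_DGRAM sender blocked because the peer's receive queue
 * is full gets woken once the receiver drains it, even though the
 * sender has nothing queued on the peer yet:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	poll(&pfd, 1, -1);	// returns once the peer's queue has room
 *	send(fd, buf, len, 0);
 */
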
524 static int unix_writable(const struct sock *sk)
525 {
526 	return sk->sk_state != TCP_LISTEN &&
527 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
528 }
529 
530 static void unix_write_space(struct sock *sk)
531 {
532 	struct socket_wq *wq;
533 
534 	rcu_read_lock();
535 	if (unix_writable(sk)) {
536 		wq = rcu_dereference(sk->sk_wq);
537 		if (skwq_has_sleeper(wq))
538 			wake_up_interruptible_sync_poll(&wq->wait,
539 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
540 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
541 	}
542 	rcu_read_unlock();
543 }
544 
545 /* When a dgram socket disconnects (or changes its peer), we clear its receive
546  * queue of packets that arrived from the previous peer. First, this allows
547  * flow control based only on wmem_alloc; second, an sk connected to a peer
548  * may receive messages only from that peer. */
549 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
550 {
551 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
552 		skb_queue_purge(&sk->sk_receive_queue);
553 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
554 
555 		/* If one link of a bidirectional dgram pipe is disconnected,
556 		 * we signal an error. Messages are lost. Do not do this
557 		 * when the peer was not connected to us.
558 		 */
559 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
560 			WRITE_ONCE(other->sk_err, ECONNRESET);
561 			sk_error_report(other);
562 		}
563 	}
564 	other->sk_state = TCP_CLOSE;
565 }
566 
567 static void unix_sock_destructor(struct sock *sk)
568 {
569 	struct unix_sock *u = unix_sk(sk);
570 
571 	skb_queue_purge(&sk->sk_receive_queue);
572 
573 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
574 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
575 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
576 	if (!sock_flag(sk, SOCK_DEAD)) {
577 		pr_info("Attempt to release alive unix socket: %p\n", sk);
578 		return;
579 	}
580 
581 	if (u->addr)
582 		unix_release_addr(u->addr);
583 
584 	atomic_long_dec(&unix_nr_socks);
585 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
586 #ifdef UNIX_REFCNT_DEBUG
587 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
588 		atomic_long_read(&unix_nr_socks));
589 #endif
590 }
591 
592 static void unix_release_sock(struct sock *sk, int embrion)
593 {
594 	struct unix_sock *u = unix_sk(sk);
595 	struct sock *skpair;
596 	struct sk_buff *skb;
597 	struct path path;
598 	int state;
599 
600 	unix_remove_socket(sock_net(sk), sk);
601 	unix_remove_bsd_socket(sk);
602 
603 	/* Clear state */
604 	unix_state_lock(sk);
605 	sock_orphan(sk);
606 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
607 	path	     = u->path;
608 	u->path.dentry = NULL;
609 	u->path.mnt = NULL;
610 	state = sk->sk_state;
611 	sk->sk_state = TCP_CLOSE;
612 
613 	skpair = unix_peer(sk);
614 	unix_peer(sk) = NULL;
615 
616 	unix_state_unlock(sk);
617 
618 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
619 	if (u->oob_skb) {
620 		kfree_skb(u->oob_skb);
621 		u->oob_skb = NULL;
622 	}
623 #endif
624 
625 	wake_up_interruptible_all(&u->peer_wait);
626 
627 	if (skpair != NULL) {
628 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
629 			unix_state_lock(skpair);
630 			/* No more writes */
631 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
632 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
633 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
634 			unix_state_unlock(skpair);
635 			skpair->sk_state_change(skpair);
636 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
637 		}
638 
639 		unix_dgram_peer_wake_disconnect(sk, skpair);
640 		sock_put(skpair); /* It may now die */
641 	}
642 
643 	/* Try to flush out this socket. Throw out buffers at least */
644 
645 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
646 		if (state == TCP_LISTEN)
647 			unix_release_sock(skb->sk, 1);
648 		/* passed fds are erased in the kfree_skb hook	      */
649 		UNIXCB(skb).consumed = skb->len;
650 		kfree_skb(skb);
651 	}
652 
653 	if (path.dentry)
654 		path_put(&path);
655 
656 	sock_put(sk);
657 
658 	/* ---- Socket is dead now and most probably destroyed ---- */
659 
660 	/*
661 	 * Fixme: BSD difference: In BSD all sockets connected to us get
662 	 *	  ECONNRESET and we die on the spot. In Linux we behave
663 	 *	  like files and pipes do and wait for the last
664 	 *	  dereference.
665 	 *
666 	 * Can't we simply set sock->err?
667 	 *
668 	 *	  What the above comment does talk about? --ANK(980817)
669 	 */
670 
671 	if (unix_tot_inflight)
672 		unix_gc();		/* Garbage collect fds */
673 }
674 
675 static void init_peercred(struct sock *sk)
676 {
677 	const struct cred *old_cred;
678 	struct pid *old_pid;
679 
680 	spin_lock(&sk->sk_peer_lock);
681 	old_pid = sk->sk_peer_pid;
682 	old_cred = sk->sk_peer_cred;
683 	sk->sk_peer_pid  = get_pid(task_tgid(current));
684 	sk->sk_peer_cred = get_current_cred();
685 	spin_unlock(&sk->sk_peer_lock);
686 
687 	put_pid(old_pid);
688 	put_cred(old_cred);
689 }
690 
691 static void copy_peercred(struct sock *sk, struct sock *peersk)
692 {
693 	const struct cred *old_cred;
694 	struct pid *old_pid;
695 
696 	if (sk < peersk) {
697 		spin_lock(&sk->sk_peer_lock);
698 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
699 	} else {
700 		spin_lock(&peersk->sk_peer_lock);
701 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
702 	}
703 	old_pid = sk->sk_peer_pid;
704 	old_cred = sk->sk_peer_cred;
705 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
706 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
707 
708 	spin_unlock(&sk->sk_peer_lock);
709 	spin_unlock(&peersk->sk_peer_lock);
710 
711 	put_pid(old_pid);
712 	put_cred(old_cred);
713 }
714 
715 static int unix_listen(struct socket *sock, int backlog)
716 {
717 	int err;
718 	struct sock *sk = sock->sk;
719 	struct unix_sock *u = unix_sk(sk);
720 
721 	err = -EOPNOTSUPP;
722 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
723 		goto out;	/* Only stream/seqpacket sockets accept */
724 	err = -EINVAL;
725 	if (!u->addr)
726 		goto out;	/* No listens on an unbound socket */
727 	unix_state_lock(sk);
728 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
729 		goto out_unlock;
730 	if (backlog > sk->sk_max_ack_backlog)
731 		wake_up_interruptible_all(&u->peer_wait);
732 	sk->sk_max_ack_backlog	= backlog;
733 	sk->sk_state		= TCP_LISTEN;
734 	/* set credentials so connect can copy them */
735 	init_peercred(sk);
736 	err = 0;
737 
738 out_unlock:
739 	unix_state_unlock(sk);
740 out:
741 	return err;
742 }
743 
744 static int unix_release(struct socket *);
745 static int unix_bind(struct socket *, struct sockaddr *, int);
746 static int unix_stream_connect(struct socket *, struct sockaddr *,
747 			       int addr_len, int flags);
748 static int unix_socketpair(struct socket *, struct socket *);
749 static int unix_accept(struct socket *, struct socket *, int, bool);
750 static int unix_getname(struct socket *, struct sockaddr *, int);
751 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
752 static __poll_t unix_dgram_poll(struct file *, struct socket *,
753 				    poll_table *);
754 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
755 #ifdef CONFIG_COMPAT
756 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
757 #endif
758 static int unix_shutdown(struct socket *, int);
759 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
760 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
761 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
762 				       struct pipe_inode_info *, size_t size,
763 				       unsigned int flags);
764 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
765 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
766 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
767 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
768 static int unix_dgram_connect(struct socket *, struct sockaddr *,
769 			      int, int);
770 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
771 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
772 				  int);
773 
774 static int unix_set_peek_off(struct sock *sk, int val)
775 {
776 	struct unix_sock *u = unix_sk(sk);
777 
778 	if (mutex_lock_interruptible(&u->iolock))
779 		return -EINTR;
780 
781 	sk->sk_peek_off = val;
782 	mutex_unlock(&u->iolock);
783 
784 	return 0;
785 }
786 
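/* Sketch of how the peek offset set by unix_set_peek_off() is used from
 * user space (illustrative, not part of the kernel build):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31, offset advances
 *	recv(fd, buf, 32, 0);		// consumes data, offset drops accordingly
 */
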
787 #ifdef CONFIG_PROC_FS
788 static int unix_count_nr_fds(struct sock *sk)
789 {
790 	struct sk_buff *skb;
791 	struct unix_sock *u;
792 	int nr_fds = 0;
793 
794 	spin_lock(&sk->sk_receive_queue.lock);
795 	skb = skb_peek(&sk->sk_receive_queue);
796 	while (skb) {
797 		u = unix_sk(skb->sk);
798 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
799 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
800 	}
801 	spin_unlock(&sk->sk_receive_queue.lock);
802 
803 	return nr_fds;
804 }
805 
806 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
807 {
808 	struct sock *sk = sock->sk;
809 	unsigned char s_state;
810 	struct unix_sock *u;
811 	int nr_fds = 0;
812 
813 	if (sk) {
814 		s_state = READ_ONCE(sk->sk_state);
815 		u = unix_sk(sk);
816 
817 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
818 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
819 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
820 		 */
821 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
822 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
823 		else if (s_state == TCP_LISTEN)
824 			nr_fds = unix_count_nr_fds(sk);
825 
826 		seq_printf(m, "scm_fds: %u\n", nr_fds);
827 	}
828 }
829 #else
830 #define unix_show_fdinfo NULL
831 #endif
832 
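/* With CONFIG_PROC_FS the hook above makes /proc/<pid>/fdinfo/<fd> of a
 * unix socket include a line such as
 *
 *	scm_fds: 3
 *
 * counting SCM_RIGHTS file descriptors queued on the socket (summed over
 * the embryo sockets for a listener) but not yet received.
 */
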
833 static const struct proto_ops unix_stream_ops = {
834 	.family =	PF_UNIX,
835 	.owner =	THIS_MODULE,
836 	.release =	unix_release,
837 	.bind =		unix_bind,
838 	.connect =	unix_stream_connect,
839 	.socketpair =	unix_socketpair,
840 	.accept =	unix_accept,
841 	.getname =	unix_getname,
842 	.poll =		unix_poll,
843 	.ioctl =	unix_ioctl,
844 #ifdef CONFIG_COMPAT
845 	.compat_ioctl =	unix_compat_ioctl,
846 #endif
847 	.listen =	unix_listen,
848 	.shutdown =	unix_shutdown,
849 	.sendmsg =	unix_stream_sendmsg,
850 	.recvmsg =	unix_stream_recvmsg,
851 	.read_skb =	unix_stream_read_skb,
852 	.mmap =		sock_no_mmap,
853 	.splice_read =	unix_stream_splice_read,
854 	.set_peek_off =	unix_set_peek_off,
855 	.show_fdinfo =	unix_show_fdinfo,
856 };
857 
858 static const struct proto_ops unix_dgram_ops = {
859 	.family =	PF_UNIX,
860 	.owner =	THIS_MODULE,
861 	.release =	unix_release,
862 	.bind =		unix_bind,
863 	.connect =	unix_dgram_connect,
864 	.socketpair =	unix_socketpair,
865 	.accept =	sock_no_accept,
866 	.getname =	unix_getname,
867 	.poll =		unix_dgram_poll,
868 	.ioctl =	unix_ioctl,
869 #ifdef CONFIG_COMPAT
870 	.compat_ioctl =	unix_compat_ioctl,
871 #endif
872 	.listen =	sock_no_listen,
873 	.shutdown =	unix_shutdown,
874 	.sendmsg =	unix_dgram_sendmsg,
875 	.read_skb =	unix_read_skb,
876 	.recvmsg =	unix_dgram_recvmsg,
877 	.mmap =		sock_no_mmap,
878 	.set_peek_off =	unix_set_peek_off,
879 	.show_fdinfo =	unix_show_fdinfo,
880 };
881 
882 static const struct proto_ops unix_seqpacket_ops = {
883 	.family =	PF_UNIX,
884 	.owner =	THIS_MODULE,
885 	.release =	unix_release,
886 	.bind =		unix_bind,
887 	.connect =	unix_stream_connect,
888 	.socketpair =	unix_socketpair,
889 	.accept =	unix_accept,
890 	.getname =	unix_getname,
891 	.poll =		unix_dgram_poll,
892 	.ioctl =	unix_ioctl,
893 #ifdef CONFIG_COMPAT
894 	.compat_ioctl =	unix_compat_ioctl,
895 #endif
896 	.listen =	unix_listen,
897 	.shutdown =	unix_shutdown,
898 	.sendmsg =	unix_seqpacket_sendmsg,
899 	.recvmsg =	unix_seqpacket_recvmsg,
900 	.mmap =		sock_no_mmap,
901 	.set_peek_off =	unix_set_peek_off,
902 	.show_fdinfo =	unix_show_fdinfo,
903 };
904 
905 static void unix_close(struct sock *sk, long timeout)
906 {
907 	/* Nothing to do here, unix socket does not need a ->close().
908 	 * This is merely for sockmap.
909 	 */
910 }
911 
912 static void unix_unhash(struct sock *sk)
913 {
914 	/* Nothing to do here, unix socket does not need a ->unhash().
915 	 * This is merely for sockmap.
916 	 */
917 }
918 
919 static bool unix_bpf_bypass_getsockopt(int level, int optname)
920 {
921 	if (level == SOL_SOCKET) {
922 		switch (optname) {
923 		case SO_PEERPIDFD:
924 			return true;
925 		default:
926 			return false;
927 		}
928 	}
929 
930 	return false;
931 }
932 
933 struct proto unix_dgram_proto = {
934 	.name			= "UNIX",
935 	.owner			= THIS_MODULE,
936 	.obj_size		= sizeof(struct unix_sock),
937 	.close			= unix_close,
938 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
939 #ifdef CONFIG_BPF_SYSCALL
940 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
941 #endif
942 };
943 
944 struct proto unix_stream_proto = {
945 	.name			= "UNIX-STREAM",
946 	.owner			= THIS_MODULE,
947 	.obj_size		= sizeof(struct unix_sock),
948 	.close			= unix_close,
949 	.unhash			= unix_unhash,
950 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
951 #ifdef CONFIG_BPF_SYSCALL
952 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
953 #endif
954 };
955 
956 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
957 {
958 	struct unix_sock *u;
959 	struct sock *sk;
960 	int err;
961 
962 	atomic_long_inc(&unix_nr_socks);
963 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
964 		err = -ENFILE;
965 		goto err;
966 	}
967 
968 	if (type == SOCK_STREAM)
969 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
970 	else /*dgram and  seqpacket */
971 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
972 
973 	if (!sk) {
974 		err = -ENOMEM;
975 		goto err;
976 	}
977 
978 	sock_init_data(sock, sk);
979 
980 	sk->sk_hash		= unix_unbound_hash(sk);
981 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
982 	sk->sk_write_space	= unix_write_space;
983 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
984 	sk->sk_destruct		= unix_sock_destructor;
985 	u	  = unix_sk(sk);
986 	u->path.dentry = NULL;
987 	u->path.mnt = NULL;
988 	spin_lock_init(&u->lock);
989 	atomic_long_set(&u->inflight, 0);
990 	INIT_LIST_HEAD(&u->link);
991 	mutex_init(&u->iolock); /* single task reading lock */
992 	mutex_init(&u->bindlock); /* single task binding lock */
993 	init_waitqueue_head(&u->peer_wait);
994 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
995 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
996 	unix_insert_unbound_socket(net, sk);
997 
998 	sock_prot_inuse_add(net, sk->sk_prot, 1);
999 
1000 	return sk;
1001 
1002 err:
1003 	atomic_long_dec(&unix_nr_socks);
1004 	return ERR_PTR(err);
1005 }
1006 
1007 static int unix_create(struct net *net, struct socket *sock, int protocol,
1008 		       int kern)
1009 {
1010 	struct sock *sk;
1011 
1012 	if (protocol && protocol != PF_UNIX)
1013 		return -EPROTONOSUPPORT;
1014 
1015 	sock->state = SS_UNCONNECTED;
1016 
1017 	switch (sock->type) {
1018 	case SOCK_STREAM:
1019 		sock->ops = &unix_stream_ops;
1020 		break;
1021 		/*
1022 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1023 		 *	nothing uses it.
1024 		 */
1025 	case SOCK_RAW:
1026 		sock->type = SOCK_DGRAM;
1027 		fallthrough;
1028 	case SOCK_DGRAM:
1029 		sock->ops = &unix_dgram_ops;
1030 		break;
1031 	case SOCK_SEQPACKET:
1032 		sock->ops = &unix_seqpacket_ops;
1033 		break;
1034 	default:
1035 		return -ESOCKTNOSUPPORT;
1036 	}
1037 
1038 	sk = unix_create1(net, sock, kern, sock->type);
1039 	if (IS_ERR(sk))
1040 		return PTR_ERR(sk);
1041 
1042 	return 0;
1043 }
1044 
1045 static int unix_release(struct socket *sock)
1046 {
1047 	struct sock *sk = sock->sk;
1048 
1049 	if (!sk)
1050 		return 0;
1051 
1052 	sk->sk_prot->close(sk, 0);
1053 	unix_release_sock(sk, 0);
1054 	sock->sk = NULL;
1055 
1056 	return 0;
1057 }
1058 
1059 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1060 				  int type)
1061 {
1062 	struct inode *inode;
1063 	struct path path;
1064 	struct sock *sk;
1065 	int err;
1066 
1067 	unix_mkname_bsd(sunaddr, addr_len);
1068 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1069 	if (err)
1070 		goto fail;
1071 
1072 	err = path_permission(&path, MAY_WRITE);
1073 	if (err)
1074 		goto path_put;
1075 
1076 	err = -ECONNREFUSED;
1077 	inode = d_backing_inode(path.dentry);
1078 	if (!S_ISSOCK(inode->i_mode))
1079 		goto path_put;
1080 
1081 	sk = unix_find_socket_byinode(inode);
1082 	if (!sk)
1083 		goto path_put;
1084 
1085 	err = -EPROTOTYPE;
1086 	if (sk->sk_type == type)
1087 		touch_atime(&path);
1088 	else
1089 		goto sock_put;
1090 
1091 	path_put(&path);
1092 
1093 	return sk;
1094 
1095 sock_put:
1096 	sock_put(sk);
1097 path_put:
1098 	path_put(&path);
1099 fail:
1100 	return ERR_PTR(err);
1101 }
1102 
1103 static struct sock *unix_find_abstract(struct net *net,
1104 				       struct sockaddr_un *sunaddr,
1105 				       int addr_len, int type)
1106 {
1107 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1108 	struct dentry *dentry;
1109 	struct sock *sk;
1110 
1111 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1112 	if (!sk)
1113 		return ERR_PTR(-ECONNREFUSED);
1114 
1115 	dentry = unix_sk(sk)->path.dentry;
1116 	if (dentry)
1117 		touch_atime(&unix_sk(sk)->path);
1118 
1119 	return sk;
1120 }
1121 
1122 static struct sock *unix_find_other(struct net *net,
1123 				    struct sockaddr_un *sunaddr,
1124 				    int addr_len, int type)
1125 {
1126 	struct sock *sk;
1127 
1128 	if (sunaddr->sun_path[0])
1129 		sk = unix_find_bsd(sunaddr, addr_len, type);
1130 	else
1131 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1132 
1133 	return sk;
1134 }
1135 
1136 static int unix_autobind(struct sock *sk)
1137 {
1138 	unsigned int new_hash, old_hash = sk->sk_hash;
1139 	struct unix_sock *u = unix_sk(sk);
1140 	struct net *net = sock_net(sk);
1141 	struct unix_address *addr;
1142 	u32 lastnum, ordernum;
1143 	int err;
1144 
1145 	err = mutex_lock_interruptible(&u->bindlock);
1146 	if (err)
1147 		return err;
1148 
1149 	if (u->addr)
1150 		goto out;
1151 
1152 	err = -ENOMEM;
1153 	addr = kzalloc(sizeof(*addr) +
1154 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1155 	if (!addr)
1156 		goto out;
1157 
1158 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1159 	addr->name->sun_family = AF_UNIX;
1160 	refcount_set(&addr->refcnt, 1);
1161 
1162 	ordernum = get_random_u32();
1163 	lastnum = ordernum & 0xFFFFF;
1164 retry:
1165 	ordernum = (ordernum + 1) & 0xFFFFF;
1166 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1167 
1168 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1169 	unix_table_double_lock(net, old_hash, new_hash);
1170 
1171 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1172 		unix_table_double_unlock(net, old_hash, new_hash);
1173 
1174 		/* __unix_find_socket_byname() may take a long time if many names
1175 		 * are already in use.
1176 		 */
1177 		cond_resched();
1178 
1179 		if (ordernum == lastnum) {
1180 			/* Give up if all names seem to be in use. */
1181 			err = -ENOSPC;
1182 			unix_release_addr(addr);
1183 			goto out;
1184 		}
1185 
1186 		goto retry;
1187 	}
1188 
1189 	__unix_set_addr_hash(net, sk, addr, new_hash);
1190 	unix_table_double_unlock(net, old_hash, new_hash);
1191 	err = 0;
1192 
1193 out:	mutex_unlock(&u->bindlock);
1194 	return err;
1195 }
1196 
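/* An autobound socket ends up with a five-hex-digit abstract name, shown
 * for example as "@00f3a" by ss(8) or in /proc/net/unix (the value here
 * is made up; the real one is random per the code above).
 */
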
1197 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1198 			 int addr_len)
1199 {
1200 	umode_t mode = S_IFSOCK |
1201 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1202 	unsigned int new_hash, old_hash = sk->sk_hash;
1203 	struct unix_sock *u = unix_sk(sk);
1204 	struct net *net = sock_net(sk);
1205 	struct mnt_idmap *idmap;
1206 	struct unix_address *addr;
1207 	struct dentry *dentry;
1208 	struct path parent;
1209 	int err;
1210 
1211 	unix_mkname_bsd(sunaddr, addr_len);
1212 	addr_len = strlen(sunaddr->sun_path) +
1213 		offsetof(struct sockaddr_un, sun_path) + 1;
1214 
1215 	addr = unix_create_addr(sunaddr, addr_len);
1216 	if (!addr)
1217 		return -ENOMEM;
1218 
1219 	/*
1220 	 * Get the parent directory, calculate the hash for last
1221 	 * component.
1222 	 */
1223 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1224 	if (IS_ERR(dentry)) {
1225 		err = PTR_ERR(dentry);
1226 		goto out;
1227 	}
1228 
1229 	/*
1230 	 * All right, let's create it.
1231 	 */
1232 	idmap = mnt_idmap(parent.mnt);
1233 	err = security_path_mknod(&parent, dentry, mode, 0);
1234 	if (!err)
1235 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1236 	if (err)
1237 		goto out_path;
1238 	err = mutex_lock_interruptible(&u->bindlock);
1239 	if (err)
1240 		goto out_unlink;
1241 	if (u->addr)
1242 		goto out_unlock;
1243 
1244 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1245 	unix_table_double_lock(net, old_hash, new_hash);
1246 	u->path.mnt = mntget(parent.mnt);
1247 	u->path.dentry = dget(dentry);
1248 	__unix_set_addr_hash(net, sk, addr, new_hash);
1249 	unix_table_double_unlock(net, old_hash, new_hash);
1250 	unix_insert_bsd_socket(sk);
1251 	mutex_unlock(&u->bindlock);
1252 	done_path_create(&parent, dentry);
1253 	return 0;
1254 
1255 out_unlock:
1256 	mutex_unlock(&u->bindlock);
1257 	err = -EINVAL;
1258 out_unlink:
1259 	/* failed after successful mknod?  unlink what we'd created... */
1260 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1261 out_path:
1262 	done_path_create(&parent, dentry);
1263 out:
1264 	unix_release_addr(addr);
1265 	return err == -EEXIST ? -EADDRINUSE : err;
1266 }
1267 
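/* Note for the user-space sketch earlier: binding again to an existing
 * path fails because vfs_mknod() returns -EEXIST, mapped to -EADDRINUSE
 * above, so a stale socket file has to be removed first (illustrative):
 *
 *	unlink("/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&a, addr_len);
 */
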
1268 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1269 			      int addr_len)
1270 {
1271 	unsigned int new_hash, old_hash = sk->sk_hash;
1272 	struct unix_sock *u = unix_sk(sk);
1273 	struct net *net = sock_net(sk);
1274 	struct unix_address *addr;
1275 	int err;
1276 
1277 	addr = unix_create_addr(sunaddr, addr_len);
1278 	if (!addr)
1279 		return -ENOMEM;
1280 
1281 	err = mutex_lock_interruptible(&u->bindlock);
1282 	if (err)
1283 		goto out;
1284 
1285 	if (u->addr) {
1286 		err = -EINVAL;
1287 		goto out_mutex;
1288 	}
1289 
1290 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1291 	unix_table_double_lock(net, old_hash, new_hash);
1292 
1293 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1294 		goto out_spin;
1295 
1296 	__unix_set_addr_hash(net, sk, addr, new_hash);
1297 	unix_table_double_unlock(net, old_hash, new_hash);
1298 	mutex_unlock(&u->bindlock);
1299 	return 0;
1300 
1301 out_spin:
1302 	unix_table_double_unlock(net, old_hash, new_hash);
1303 	err = -EADDRINUSE;
1304 out_mutex:
1305 	mutex_unlock(&u->bindlock);
1306 out:
1307 	unix_release_addr(addr);
1308 	return err;
1309 }
1310 
1311 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1312 {
1313 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1314 	struct sock *sk = sock->sk;
1315 	int err;
1316 
1317 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1318 	    sunaddr->sun_family == AF_UNIX)
1319 		return unix_autobind(sk);
1320 
1321 	err = unix_validate_addr(sunaddr, addr_len);
1322 	if (err)
1323 		return err;
1324 
1325 	if (sunaddr->sun_path[0])
1326 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1327 	else
1328 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1329 
1330 	return err;
1331 }
1332 
1333 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1334 {
1335 	if (unlikely(sk1 == sk2) || !sk2) {
1336 		unix_state_lock(sk1);
1337 		return;
1338 	}
1339 	if (sk1 < sk2) {
1340 		unix_state_lock(sk1);
1341 		unix_state_lock_nested(sk2);
1342 	} else {
1343 		unix_state_lock(sk2);
1344 		unix_state_lock_nested(sk1);
1345 	}
1346 }
1347 
1348 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1349 {
1350 	if (unlikely(sk1 == sk2) || !sk2) {
1351 		unix_state_unlock(sk1);
1352 		return;
1353 	}
1354 	unix_state_unlock(sk1);
1355 	unix_state_unlock(sk2);
1356 }
1357 
1358 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1359 			      int alen, int flags)
1360 {
1361 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1362 	struct sock *sk = sock->sk;
1363 	struct sock *other;
1364 	int err;
1365 
1366 	err = -EINVAL;
1367 	if (alen < offsetofend(struct sockaddr, sa_family))
1368 		goto out;
1369 
1370 	if (addr->sa_family != AF_UNSPEC) {
1371 		err = unix_validate_addr(sunaddr, alen);
1372 		if (err)
1373 			goto out;
1374 
1375 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1376 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1377 		    !unix_sk(sk)->addr) {
1378 			err = unix_autobind(sk);
1379 			if (err)
1380 				goto out;
1381 		}
1382 
1383 restart:
1384 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1385 		if (IS_ERR(other)) {
1386 			err = PTR_ERR(other);
1387 			goto out;
1388 		}
1389 
1390 		unix_state_double_lock(sk, other);
1391 
1392 		/* Apparently VFS overslept socket death. Retry. */
1393 		if (sock_flag(other, SOCK_DEAD)) {
1394 			unix_state_double_unlock(sk, other);
1395 			sock_put(other);
1396 			goto restart;
1397 		}
1398 
1399 		err = -EPERM;
1400 		if (!unix_may_send(sk, other))
1401 			goto out_unlock;
1402 
1403 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1404 		if (err)
1405 			goto out_unlock;
1406 
1407 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1408 	} else {
1409 		/*
1410 		 *	1003.1g breaking connected state with AF_UNSPEC
1411 		 */
1412 		other = NULL;
1413 		unix_state_double_lock(sk, other);
1414 	}
1415 
1416 	/*
1417 	 * If it was connected, reconnect.
1418 	 */
1419 	if (unix_peer(sk)) {
1420 		struct sock *old_peer = unix_peer(sk);
1421 
1422 		unix_peer(sk) = other;
1423 		if (!other)
1424 			sk->sk_state = TCP_CLOSE;
1425 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1426 
1427 		unix_state_double_unlock(sk, other);
1428 
1429 		if (other != old_peer)
1430 			unix_dgram_disconnected(sk, old_peer);
1431 		sock_put(old_peer);
1432 	} else {
1433 		unix_peer(sk) = other;
1434 		unix_state_double_unlock(sk, other);
1435 	}
1436 
1437 	return 0;
1438 
1439 out_unlock:
1440 	unix_state_double_unlock(sk, other);
1441 	sock_put(other);
1442 out:
1443 	return err;
1444 }
1445 
1446 static long unix_wait_for_peer(struct sock *other, long timeo)
1447 	__releases(&unix_sk(other)->lock)
1448 {
1449 	struct unix_sock *u = unix_sk(other);
1450 	int sched;
1451 	DEFINE_WAIT(wait);
1452 
1453 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1454 
1455 	sched = !sock_flag(other, SOCK_DEAD) &&
1456 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1457 		unix_recvq_full_lockless(other);
1458 
1459 	unix_state_unlock(other);
1460 
1461 	if (sched)
1462 		timeo = schedule_timeout(timeo);
1463 
1464 	finish_wait(&u->peer_wait, &wait);
1465 	return timeo;
1466 }
1467 
1468 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1469 			       int addr_len, int flags)
1470 {
1471 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1472 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1473 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1474 	struct net *net = sock_net(sk);
1475 	struct sk_buff *skb = NULL;
1476 	long timeo;
1477 	int err;
1478 	int st;
1479 
1480 	err = unix_validate_addr(sunaddr, addr_len);
1481 	if (err)
1482 		goto out;
1483 
1484 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1485 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1486 		err = unix_autobind(sk);
1487 		if (err)
1488 			goto out;
1489 	}
1490 
1491 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1492 
1493 	/* First of all, allocate resources.
1494 	   If we do it after the state is locked,
1495 	   we will have to recheck everything again in any case.
1496 	 */
1497 
1498 	/* create new sock for complete connection */
1499 	newsk = unix_create1(net, NULL, 0, sock->type);
1500 	if (IS_ERR(newsk)) {
1501 		err = PTR_ERR(newsk);
1502 		newsk = NULL;
1503 		goto out;
1504 	}
1505 
1506 	err = -ENOMEM;
1507 
1508 	/* Allocate skb for sending to listening sock */
1509 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1510 	if (skb == NULL)
1511 		goto out;
1512 
1513 restart:
1514 	/*  Find listening sock. */
1515 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1516 	if (IS_ERR(other)) {
1517 		err = PTR_ERR(other);
1518 		other = NULL;
1519 		goto out;
1520 	}
1521 
1522 	/* Latch state of peer */
1523 	unix_state_lock(other);
1524 
1525 	/* Apparently VFS overslept socket death. Retry. */
1526 	if (sock_flag(other, SOCK_DEAD)) {
1527 		unix_state_unlock(other);
1528 		sock_put(other);
1529 		goto restart;
1530 	}
1531 
1532 	err = -ECONNREFUSED;
1533 	if (other->sk_state != TCP_LISTEN)
1534 		goto out_unlock;
1535 	if (other->sk_shutdown & RCV_SHUTDOWN)
1536 		goto out_unlock;
1537 
1538 	if (unix_recvq_full(other)) {
1539 		err = -EAGAIN;
1540 		if (!timeo)
1541 			goto out_unlock;
1542 
1543 		timeo = unix_wait_for_peer(other, timeo);
1544 
1545 		err = sock_intr_errno(timeo);
1546 		if (signal_pending(current))
1547 			goto out;
1548 		sock_put(other);
1549 		goto restart;
1550 	}
1551 
1552 	/* Latch our state.
1553 
1554 	   This is a tricky place. We need to grab our state lock and cannot
1555 	   drop the lock on the peer. It is dangerous because deadlock is
1556 	   possible. The connect-to-self case and simultaneous
1557 	   attempts to connect are eliminated by checking socket
1558 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
1559 	   check this before attempting to grab the lock.
1560 
1561 	   Well, and we have to recheck the state after the socket is locked.
1562 	 */
1563 	st = sk->sk_state;
1564 
1565 	switch (st) {
1566 	case TCP_CLOSE:
1567 		/* This is ok... continue with connect */
1568 		break;
1569 	case TCP_ESTABLISHED:
1570 		/* Socket is already connected */
1571 		err = -EISCONN;
1572 		goto out_unlock;
1573 	default:
1574 		err = -EINVAL;
1575 		goto out_unlock;
1576 	}
1577 
1578 	unix_state_lock_nested(sk);
1579 
1580 	if (sk->sk_state != st) {
1581 		unix_state_unlock(sk);
1582 		unix_state_unlock(other);
1583 		sock_put(other);
1584 		goto restart;
1585 	}
1586 
1587 	err = security_unix_stream_connect(sk, other, newsk);
1588 	if (err) {
1589 		unix_state_unlock(sk);
1590 		goto out_unlock;
1591 	}
1592 
1593 	/* The way is open! Quickly set all the necessary fields... */
1594 
1595 	sock_hold(sk);
1596 	unix_peer(newsk)	= sk;
1597 	newsk->sk_state		= TCP_ESTABLISHED;
1598 	newsk->sk_type		= sk->sk_type;
1599 	init_peercred(newsk);
1600 	newu = unix_sk(newsk);
1601 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1602 	otheru = unix_sk(other);
1603 
1604 	/* copy address information from listening to new sock
1605 	 *
1606 	 * The contents of *(otheru->addr) and otheru->path
1607 	 * are seen fully set up here, since we have found
1608 	 * otheru in hash under its lock.  Insertion into the
1609 	 * hash chain we'd found it in had been done in an
1610 	 * earlier critical area protected by the chain's lock,
1611 	 * the same one where we'd set *(otheru->addr) contents,
1612 	 * as well as otheru->path and otheru->addr itself.
1613 	 *
1614 	 * Using smp_store_release() here to set newu->addr
1615 	 * is enough to make those stores, as well as stores
1616 	 * to newu->path visible to anyone who gets newu->addr
1617 	 * by smp_load_acquire().  IOW, the same warranties
1618 	 * as for unix_sock instances bound in unix_bind() or
1619 	 * in unix_autobind().
1620 	 */
1621 	if (otheru->path.dentry) {
1622 		path_get(&otheru->path);
1623 		newu->path = otheru->path;
1624 	}
1625 	refcount_inc(&otheru->addr->refcnt);
1626 	smp_store_release(&newu->addr, otheru->addr);
1627 
1628 	/* Set credentials */
1629 	copy_peercred(sk, other);
1630 
1631 	sock->state	= SS_CONNECTED;
1632 	sk->sk_state	= TCP_ESTABLISHED;
1633 	sock_hold(newsk);
1634 
1635 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1636 	unix_peer(sk)	= newsk;
1637 
1638 	unix_state_unlock(sk);
1639 
1640 	/* take ten and send info to listening sock */
1641 	spin_lock(&other->sk_receive_queue.lock);
1642 	__skb_queue_tail(&other->sk_receive_queue, skb);
1643 	spin_unlock(&other->sk_receive_queue.lock);
1644 	unix_state_unlock(other);
1645 	other->sk_data_ready(other);
1646 	sock_put(other);
1647 	return 0;
1648 
1649 out_unlock:
1650 	if (other)
1651 		unix_state_unlock(other);
1652 
1653 out:
1654 	kfree_skb(skb);
1655 	if (newsk)
1656 		unix_release_sock(newsk, 0);
1657 	if (other)
1658 		sock_put(other);
1659 	return err;
1660 }
1661 
1662 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1663 {
1664 	struct sock *ska = socka->sk, *skb = sockb->sk;
1665 
1666 	/* Join our sockets back to back */
1667 	sock_hold(ska);
1668 	sock_hold(skb);
1669 	unix_peer(ska) = skb;
1670 	unix_peer(skb) = ska;
1671 	init_peercred(ska);
1672 	init_peercred(skb);
1673 
1674 	ska->sk_state = TCP_ESTABLISHED;
1675 	skb->sk_state = TCP_ESTABLISHED;
1676 	socka->state  = SS_CONNECTED;
1677 	sockb->state  = SS_CONNECTED;
1678 	return 0;
1679 }
1680 
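/* Minimal user-space counterpart of unix_socketpair() above
 * (illustrative):
 *
 *	int sv[2];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	// sv[0] and sv[1] are now connected peers; SO_PEERCRED on either
 *	// end returns the credentials recorded by init_peercred().
 */
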
1681 static void unix_sock_inherit_flags(const struct socket *old,
1682 				    struct socket *new)
1683 {
1684 	if (test_bit(SOCK_PASSCRED, &old->flags))
1685 		set_bit(SOCK_PASSCRED, &new->flags);
1686 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1687 		set_bit(SOCK_PASSPIDFD, &new->flags);
1688 	if (test_bit(SOCK_PASSSEC, &old->flags))
1689 		set_bit(SOCK_PASSSEC, &new->flags);
1690 }
1691 
1692 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1693 		       bool kern)
1694 {
1695 	struct sock *sk = sock->sk;
1696 	struct sock *tsk;
1697 	struct sk_buff *skb;
1698 	int err;
1699 
1700 	err = -EOPNOTSUPP;
1701 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1702 		goto out;
1703 
1704 	err = -EINVAL;
1705 	if (sk->sk_state != TCP_LISTEN)
1706 		goto out;
1707 
1708 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1709 	 * so that no locks are necessary.
1710 	 */
1711 
1712 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1713 				&err);
1714 	if (!skb) {
1715 		/* This means receive shutdown. */
1716 		if (err == 0)
1717 			err = -EINVAL;
1718 		goto out;
1719 	}
1720 
1721 	tsk = skb->sk;
1722 	skb_free_datagram(sk, skb);
1723 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1724 
1725 	/* attach accepted sock to socket */
1726 	unix_state_lock(tsk);
1727 	newsock->state = SS_CONNECTED;
1728 	unix_sock_inherit_flags(sock, newsock);
1729 	sock_graft(tsk, newsock);
1730 	unix_state_unlock(tsk);
1731 	return 0;
1732 
1733 out:
1734 	return err;
1735 }
1736 
1737 
1738 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1739 {
1740 	struct sock *sk = sock->sk;
1741 	struct unix_address *addr;
1742 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1743 	int err = 0;
1744 
1745 	if (peer) {
1746 		sk = unix_peer_get(sk);
1747 
1748 		err = -ENOTCONN;
1749 		if (!sk)
1750 			goto out;
1751 		err = 0;
1752 	} else {
1753 		sock_hold(sk);
1754 	}
1755 
1756 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1757 	if (!addr) {
1758 		sunaddr->sun_family = AF_UNIX;
1759 		sunaddr->sun_path[0] = 0;
1760 		err = offsetof(struct sockaddr_un, sun_path);
1761 	} else {
1762 		err = addr->len;
1763 		memcpy(sunaddr, addr->name, addr->len);
1764 	}
1765 	sock_put(sk);
1766 out:
1767 	return err;
1768 }
1769 
1770 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1771 {
1772 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1773 
1774 	/*
1775 	 * Garbage collection of unix sockets starts by selecting a set of
1776 	 * candidate sockets which have reference only from being in flight
1777 	 * (total_refs == inflight_refs).  This condition is checked once during
1778 	 * the candidate collection phase, and candidates are marked as such, so
1779 	 * that non-candidates can later be ignored.  While inflight_refs is
1780 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1781 	 * is an instantaneous decision.
1782 	 *
1783 	 * Once a candidate, however, the socket must not be reinstalled into a
1784 	 * file descriptor while the garbage collection is in progress.
1785 	 *
1786 	 * If the above conditions are met, then the directed graph of
1787 	 * candidates (*) does not change while unix_gc_lock is held.
1788 	 *
1789 	 * Any operation that changes the file count through file descriptors
1790 	 * (dup, close, sendmsg) does not change the graph since candidates are
1791 	 * not installed in fds.
1792 	 *
1793 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1794 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1795 	 * serialized with garbage collection.
1796 	 *
1797 	 * MSG_PEEK is special in that it does not change the inflight count,
1798 	 * yet does install the socket into an fd.  The following lock/unlock
1799 	 * pair is to ensure serialization with garbage collection.  It must be
1800 	 * done between incrementing the file count and installing the file into
1801 	 * an fd.
1802 	 *
1803 	 * If garbage collection starts after the barrier provided by the
1804 	 * lock/unlock, then it will see the elevated refcount and not mark this
1805 	 * as a candidate.  If a garbage collection is already in progress
1806 	 * before the file count was incremented, then the lock/unlock pair will
1807 	 * ensure that garbage collection is finished before progressing to
1808 	 * installing the fd.
1809 	 *
1810 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1811 	 * which is on the queue of listening socket A.
1812 	 */
1813 	spin_lock(&unix_gc_lock);
1814 	spin_unlock(&unix_gc_lock);
1815 }
1816 
1817 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1818 {
1819 	int err = 0;
1820 
1821 	UNIXCB(skb).pid  = get_pid(scm->pid);
1822 	UNIXCB(skb).uid = scm->creds.uid;
1823 	UNIXCB(skb).gid = scm->creds.gid;
1824 	UNIXCB(skb).fp = NULL;
1825 	unix_get_secdata(scm, skb);
1826 	if (scm->fp && send_fds)
1827 		err = unix_attach_fds(scm, skb);
1828 
1829 	skb->destructor = unix_destruct_scm;
1830 	return err;
1831 }
1832 
1833 static bool unix_passcred_enabled(const struct socket *sock,
1834 				  const struct sock *other)
1835 {
1836 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1837 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1838 	       !other->sk_socket ||
1839 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1840 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1841 }
1842 
1843 /*
1844  * Some apps rely on write() giving SCM_CREDENTIALS.
1845  * We include credentials if source or destination socket
1846  * asserted SOCK_PASSCRED.
1847  */
1848 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1849 			    const struct sock *other)
1850 {
1851 	if (UNIXCB(skb).pid)
1852 		return;
1853 	if (unix_passcred_enabled(sock, other)) {
1854 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1855 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1856 	}
1857 }
1858 
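/* Receiving-side sketch that makes maybe_add_creds() above matter
 * (user space, illustrative):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &(int){1}, sizeof(int));
 *	// subsequent recvmsg() calls carry an SCM_CREDENTIALS control
 *	// message with the sender's struct ucred { pid, uid, gid },
 *	// even when the sender used plain write().
 */
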
1859 static bool unix_skb_scm_eq(struct sk_buff *skb,
1860 			    struct scm_cookie *scm)
1861 {
1862 	return UNIXCB(skb).pid == scm->pid &&
1863 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1864 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1865 	       unix_secdata_eq(scm, skb);
1866 }
1867 
1868 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1869 {
1870 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1871 	struct unix_sock *u = unix_sk(sk);
1872 
1873 	if (unlikely(fp && fp->count))
1874 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1875 }
1876 
1877 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1878 {
1879 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1880 	struct unix_sock *u = unix_sk(sk);
1881 
1882 	if (unlikely(fp && fp->count))
1883 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1884 }
1885 
1886 /*
1887  *	Send AF_UNIX data.
1888  */
1889 
1890 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1891 			      size_t len)
1892 {
1893 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1894 	struct sock *sk = sock->sk, *other = NULL;
1895 	struct unix_sock *u = unix_sk(sk);
1896 	struct scm_cookie scm;
1897 	struct sk_buff *skb;
1898 	int data_len = 0;
1899 	int sk_locked;
1900 	long timeo;
1901 	int err;
1902 
1903 	wait_for_unix_gc();
1904 	err = scm_send(sock, msg, &scm, false);
1905 	if (err < 0)
1906 		return err;
1907 
1908 	err = -EOPNOTSUPP;
1909 	if (msg->msg_flags&MSG_OOB)
1910 		goto out;
1911 
1912 	if (msg->msg_namelen) {
1913 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1914 		if (err)
1915 			goto out;
1916 	} else {
1917 		sunaddr = NULL;
1918 		err = -ENOTCONN;
1919 		other = unix_peer_get(sk);
1920 		if (!other)
1921 			goto out;
1922 	}
1923 
1924 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1925 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1926 		err = unix_autobind(sk);
1927 		if (err)
1928 			goto out;
1929 	}
1930 
1931 	err = -EMSGSIZE;
1932 	if (len > sk->sk_sndbuf - 32)
1933 		goto out;
1934 
1935 	if (len > SKB_MAX_ALLOC) {
1936 		data_len = min_t(size_t,
1937 				 len - SKB_MAX_ALLOC,
1938 				 MAX_SKB_FRAGS * PAGE_SIZE);
1939 		data_len = PAGE_ALIGN(data_len);
1940 
1941 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1942 	}
1943 
1944 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1945 				   msg->msg_flags & MSG_DONTWAIT, &err,
1946 				   PAGE_ALLOC_COSTLY_ORDER);
1947 	if (skb == NULL)
1948 		goto out;
1949 
1950 	err = unix_scm_to_skb(&scm, skb, true);
1951 	if (err < 0)
1952 		goto out_free;
1953 
1954 	skb_put(skb, len - data_len);
1955 	skb->data_len = data_len;
1956 	skb->len = len;
1957 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1958 	if (err)
1959 		goto out_free;
1960 
1961 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1962 
1963 restart:
1964 	if (!other) {
1965 		err = -ECONNRESET;
1966 		if (sunaddr == NULL)
1967 			goto out_free;
1968 
1969 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1970 					sk->sk_type);
1971 		if (IS_ERR(other)) {
1972 			err = PTR_ERR(other);
1973 			other = NULL;
1974 			goto out_free;
1975 		}
1976 	}
1977 
1978 	if (sk_filter(other, skb) < 0) {
1979 		/* Toss the packet but do not return any error to the sender */
1980 		err = len;
1981 		goto out_free;
1982 	}
1983 
1984 	sk_locked = 0;
1985 	unix_state_lock(other);
1986 restart_locked:
1987 	err = -EPERM;
1988 	if (!unix_may_send(sk, other))
1989 		goto out_unlock;
1990 
1991 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1992 		/*
1993 		 *	Check with 1003.1g - what should
1994 		 *	a datagram error return here?
1995 		 */
1996 		unix_state_unlock(other);
1997 		sock_put(other);
1998 
1999 		if (!sk_locked)
2000 			unix_state_lock(sk);
2001 
2002 		err = 0;
2003 		if (sk->sk_type == SOCK_SEQPACKET) {
2004 			/* We are here only when racing with unix_release_sock(),
2005 			 * which is clearing @other. Unlike SOCK_DGRAM, never
2006 			 * change the state to TCP_CLOSE.
2007 			 */
2008 			unix_state_unlock(sk);
2009 			err = -EPIPE;
2010 		} else if (unix_peer(sk) == other) {
2011 			unix_peer(sk) = NULL;
2012 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2013 
2014 			sk->sk_state = TCP_CLOSE;
2015 			unix_state_unlock(sk);
2016 
2017 			unix_dgram_disconnected(sk, other);
2018 			sock_put(other);
2019 			err = -ECONNREFUSED;
2020 		} else {
2021 			unix_state_unlock(sk);
2022 		}
2023 
2024 		other = NULL;
2025 		if (err)
2026 			goto out_free;
2027 		goto restart;
2028 	}
2029 
2030 	err = -EPIPE;
2031 	if (other->sk_shutdown & RCV_SHUTDOWN)
2032 		goto out_unlock;
2033 
2034 	if (sk->sk_type != SOCK_SEQPACKET) {
2035 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2036 		if (err)
2037 			goto out_unlock;
2038 	}
2039 
2040 	/* other == sk && unix_peer(other) != sk if
2041 	 * - unix_peer(sk) == NULL, destination address bound to sk
2042 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2043 	 */
2044 	if (other != sk &&
2045 	    unlikely(unix_peer(other) != sk &&
2046 	    unix_recvq_full_lockless(other))) {
2047 		if (timeo) {
2048 			timeo = unix_wait_for_peer(other, timeo);
2049 
2050 			err = sock_intr_errno(timeo);
2051 			if (signal_pending(current))
2052 				goto out_free;
2053 
2054 			goto restart;
2055 		}
2056 
2057 		if (!sk_locked) {
2058 			unix_state_unlock(other);
2059 			unix_state_double_lock(sk, other);
2060 		}
2061 
2062 		if (unix_peer(sk) != other ||
2063 		    unix_dgram_peer_wake_me(sk, other)) {
2064 			err = -EAGAIN;
2065 			sk_locked = 1;
2066 			goto out_unlock;
2067 		}
2068 
2069 		if (!sk_locked) {
2070 			sk_locked = 1;
2071 			goto restart_locked;
2072 		}
2073 	}
2074 
2075 	if (unlikely(sk_locked))
2076 		unix_state_unlock(sk);
2077 
2078 	if (sock_flag(other, SOCK_RCVTSTAMP))
2079 		__net_timestamp(skb);
2080 	maybe_add_creds(skb, sock, other);
2081 	scm_stat_add(other, skb);
2082 	skb_queue_tail(&other->sk_receive_queue, skb);
2083 	unix_state_unlock(other);
2084 	other->sk_data_ready(other);
2085 	sock_put(other);
2086 	scm_destroy(&scm);
2087 	return len;
2088 
2089 out_unlock:
2090 	if (sk_locked)
2091 		unix_state_unlock(sk);
2092 	unix_state_unlock(other);
2093 out_free:
2094 	kfree_skb(skb);
2095 out:
2096 	if (other)
2097 		sock_put(other);
2098 	scm_destroy(&scm);
2099 	return err;
2100 }
2101 
2102 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2103  * bytes with a minimum of a full page.
2104  */
2105 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2106 
2107 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
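/*
 * Queue the MSG_OOB byte: it travels as its own one-byte skb on @other's
 * receive queue and is also remembered in ousk->oob_skb so the receiver
 * can find the mark; SIGURG is signalled to @other.
 */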
2108 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2109 		     struct scm_cookie *scm, bool fds_sent)
2110 {
2111 	struct unix_sock *ousk = unix_sk(other);
2112 	struct sk_buff *skb;
2113 	int err = 0;
2114 
2115 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2116 
2117 	if (!skb)
2118 		return err;
2119 
2120 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2121 	if (err < 0) {
2122 		kfree_skb(skb);
2123 		return err;
2124 	}
2125 	skb_put(skb, 1);
2126 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2127 
2128 	if (err) {
2129 		kfree_skb(skb);
2130 		return err;
2131 	}
2132 
2133 	unix_state_lock(other);
2134 
2135 	if (sock_flag(other, SOCK_DEAD) ||
2136 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2137 		unix_state_unlock(other);
2138 		kfree_skb(skb);
2139 		return -EPIPE;
2140 	}
2141 
2142 	maybe_add_creds(skb, sock, other);
2143 	skb_get(skb);
2144 
2145 	if (ousk->oob_skb)
2146 		consume_skb(ousk->oob_skb);
2147 
2148 	WRITE_ONCE(ousk->oob_skb, skb);
2149 
2150 	scm_stat_add(other, skb);
2151 	skb_queue_tail(&other->sk_receive_queue, skb);
2152 	sk_send_sigurg(other);
2153 	unix_state_unlock(other);
2154 	other->sk_data_ready(other);
2155 
2156 	return err;
2157 }
2158 #endif
2159 
2160 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2161 			       size_t len)
2162 {
2163 	struct sock *sk = sock->sk;
2164 	struct sock *other = NULL;
2165 	int err, size;
2166 	struct sk_buff *skb;
2167 	int sent = 0;
2168 	struct scm_cookie scm;
2169 	bool fds_sent = false;
2170 	int data_len;
2171 
2172 	wait_for_unix_gc();
2173 	err = scm_send(sock, msg, &scm, false);
2174 	if (err < 0)
2175 		return err;
2176 
2177 	err = -EOPNOTSUPP;
2178 	if (msg->msg_flags & MSG_OOB) {
2179 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2180 		if (len)
2181 			len--;
2182 		else
2183 #endif
2184 			goto out_err;
2185 	}
2186 
2187 	if (msg->msg_namelen) {
2188 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2189 		goto out_err;
2190 	} else {
2191 		err = -ENOTCONN;
2192 		other = unix_peer(sk);
2193 		if (!other)
2194 			goto out_err;
2195 	}
2196 
2197 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2198 		goto pipe_err;
2199 
2200 	while (sent < len) {
2201 		size = len - sent;
2202 
2203 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2204 			skb = sock_alloc_send_pskb(sk, 0, 0,
2205 						   msg->msg_flags & MSG_DONTWAIT,
2206 						   &err, 0);
2207 		} else {
2208 			/* Keep two messages in the pipe so it schedules better */
2209 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2210 
2211 			/* allow fallback to order-0 allocations */
2212 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2213 
2214 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2215 
2216 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2217 
2218 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2219 						   msg->msg_flags & MSG_DONTWAIT, &err,
2220 						   get_order(UNIX_SKB_FRAGS_SZ));
2221 		}
2222 		if (!skb)
2223 			goto out_err;
2224 
2225 		/* Only send the fds in the first buffer */
2226 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2227 		if (err < 0) {
2228 			kfree_skb(skb);
2229 			goto out_err;
2230 		}
2231 		fds_sent = true;
2232 
2233 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2234 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2235 						   sk->sk_allocation);
2236 			if (err < 0) {
2237 				kfree_skb(skb);
2238 				goto out_err;
2239 			}
2240 			size = err;
2241 			refcount_add(size, &sk->sk_wmem_alloc);
2242 		} else {
2243 			skb_put(skb, size - data_len);
2244 			skb->data_len = data_len;
2245 			skb->len = size;
2246 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2247 			if (err) {
2248 				kfree_skb(skb);
2249 				goto out_err;
2250 			}
2251 		}
2252 
2253 		unix_state_lock(other);
2254 
2255 		if (sock_flag(other, SOCK_DEAD) ||
2256 		    (other->sk_shutdown & RCV_SHUTDOWN))
2257 			goto pipe_err_free;
2258 
2259 		maybe_add_creds(skb, sock, other);
2260 		scm_stat_add(other, skb);
2261 		skb_queue_tail(&other->sk_receive_queue, skb);
2262 		unix_state_unlock(other);
2263 		other->sk_data_ready(other);
2264 		sent += size;
2265 	}
2266 
2267 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2268 	if (msg->msg_flags & MSG_OOB) {
2269 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2270 		if (err)
2271 			goto out_err;
2272 		sent++;
2273 	}
2274 #endif
2275 
2276 	scm_destroy(&scm);
2277 
2278 	return sent;
2279 
2280 pipe_err_free:
2281 	unix_state_unlock(other);
2282 	kfree_skb(skb);
2283 pipe_err:
2284 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2285 		send_sig(SIGPIPE, current, 0);
2286 	err = -EPIPE;
2287 out_err:
2288 	scm_destroy(&scm);
2289 	return sent ? : err;
2290 }
2291 
2292 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2293 				  size_t len)
2294 {
2295 	int err;
2296 	struct sock *sk = sock->sk;
2297 
2298 	err = sock_error(sk);
2299 	if (err)
2300 		return err;
2301 
2302 	if (sk->sk_state != TCP_ESTABLISHED)
2303 		return -ENOTCONN;
2304 
2305 	if (msg->msg_namelen)
2306 		msg->msg_namelen = 0;
2307 
2308 	return unix_dgram_sendmsg(sock, msg, len);
2309 }
2310 
2311 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2312 				  size_t size, int flags)
2313 {
2314 	struct sock *sk = sock->sk;
2315 
2316 	if (sk->sk_state != TCP_ESTABLISHED)
2317 		return -ENOTCONN;
2318 
2319 	return unix_dgram_recvmsg(sock, msg, size, flags);
2320 }
2321 
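/* Copy the sending socket's bound address (if any) into msg_name. */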
2322 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2323 {
2324 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2325 
2326 	if (addr) {
2327 		msg->msg_namelen = addr->len;
2328 		memcpy(msg->msg_name, addr->name, addr->len);
2329 	}
2330 }
2331 
2332 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2333 			 int flags)
2334 {
2335 	struct scm_cookie scm;
2336 	struct socket *sock = sk->sk_socket;
2337 	struct unix_sock *u = unix_sk(sk);
2338 	struct sk_buff *skb, *last;
2339 	long timeo;
2340 	int skip;
2341 	int err;
2342 
2343 	err = -EOPNOTSUPP;
2344 	if (flags&MSG_OOB)
2345 		goto out;
2346 
2347 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2348 
2349 	do {
2350 		mutex_lock(&u->iolock);
2351 
2352 		skip = sk_peek_offset(sk, flags);
2353 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2354 					      &skip, &err, &last);
2355 		if (skb) {
2356 			if (!(flags & MSG_PEEK))
2357 				scm_stat_del(sk, skb);
2358 			break;
2359 		}
2360 
2361 		mutex_unlock(&u->iolock);
2362 
2363 		if (err != -EAGAIN)
2364 			break;
2365 	} while (timeo &&
2366 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2367 					      &err, &timeo, last));
2368 
2369 	if (!skb) { /* implies iolock unlocked */
2370 		unix_state_lock(sk);
2371 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2372 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2373 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2374 			err = 0;
2375 		unix_state_unlock(sk);
2376 		goto out;
2377 	}
2378 
2379 	if (wq_has_sleeper(&u->peer_wait))
2380 		wake_up_interruptible_sync_poll(&u->peer_wait,
2381 						EPOLLOUT | EPOLLWRNORM |
2382 						EPOLLWRBAND);
2383 
2384 	if (msg->msg_name)
2385 		unix_copy_addr(msg, skb->sk);
2386 
2387 	if (size > skb->len - skip)
2388 		size = skb->len - skip;
2389 	else if (size < skb->len - skip)
2390 		msg->msg_flags |= MSG_TRUNC;
2391 
2392 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2393 	if (err)
2394 		goto out_free;
2395 
2396 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2397 		__sock_recv_timestamp(msg, sk, skb);
2398 
2399 	memset(&scm, 0, sizeof(scm));
2400 
2401 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2402 	unix_set_secdata(&scm, skb);
2403 
2404 	if (!(flags & MSG_PEEK)) {
2405 		if (UNIXCB(skb).fp)
2406 			unix_detach_fds(&scm, skb);
2407 
2408 		sk_peek_offset_bwd(sk, skb->len);
2409 	} else {
2410 		/* It is questionable: on PEEK we could:
2411 		   - not return fds - good, but too simple 8)
2412 		   - return fds, and not return them on read (old strategy,
2413 		     apparently wrong)
2414 		   - clone fds (I chose it for now, it is the most universal
2415 		     solution)
2416 
2417 		   POSIX 1003.1g does not actually define this clearly
2418 		   at all. POSIX 1003.1g doesn't define a lot of things
2419 		   clearly however!
2420 
2421 		*/
2422 
2423 		sk_peek_offset_fwd(sk, size);
2424 
2425 		if (UNIXCB(skb).fp)
2426 			unix_peek_fds(&scm, skb);
2427 	}
2428 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2429 
2430 	scm_recv_unix(sock, msg, &scm, flags);
2431 
2432 out_free:
2433 	skb_free_datagram(sk, skb);
2434 	mutex_unlock(&u->iolock);
2435 out:
2436 	return err;
2437 }
2438 
2439 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2440 			      int flags)
2441 {
2442 	struct sock *sk = sock->sk;
2443 
2444 #ifdef CONFIG_BPF_SYSCALL
2445 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2446 
2447 	if (prot != &unix_dgram_proto)
2448 		return prot->recvmsg(sk, msg, size, flags, NULL);
2449 #endif
2450 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2451 }
2452 
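/*
 * Pop one datagram from the receive queue without blocking and hand it
 * to @recv_actor.
 */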
2453 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2454 {
2455 	struct unix_sock *u = unix_sk(sk);
2456 	struct sk_buff *skb;
2457 	int err;
2458 
2459 	mutex_lock(&u->iolock);
2460 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2461 	mutex_unlock(&u->iolock);
2462 	if (!skb)
2463 		return err;
2464 
2465 	return recv_actor(sk, skb);
2466 }
2467 
2468 /*
2469  *	Sleep until more data has arrived. But check for races.
2470  */
2471 static long unix_stream_data_wait(struct sock *sk, long timeo,
2472 				  struct sk_buff *last, unsigned int last_len,
2473 				  bool freezable)
2474 {
2475 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2476 	struct sk_buff *tail;
2477 	DEFINE_WAIT(wait);
2478 
2479 	unix_state_lock(sk);
2480 
2481 	for (;;) {
2482 		prepare_to_wait(sk_sleep(sk), &wait, state);
2483 
2484 		tail = skb_peek_tail(&sk->sk_receive_queue);
2485 		if (tail != last ||
2486 		    (tail && tail->len != last_len) ||
2487 		    sk->sk_err ||
2488 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2489 		    signal_pending(current) ||
2490 		    !timeo)
2491 			break;
2492 
2493 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2494 		unix_state_unlock(sk);
2495 		timeo = schedule_timeout(timeo);
2496 		unix_state_lock(sk);
2497 
2498 		if (sock_flag(sk, SOCK_DEAD))
2499 			break;
2500 
2501 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2502 	}
2503 
2504 	finish_wait(sk_sleep(sk), &wait);
2505 	unix_state_unlock(sk);
2506 	return timeo;
2507 }
2508 
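/* Number of bytes in @skb that the stream reader has not consumed yet. */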
2509 static unsigned int unix_skb_len(const struct sk_buff *skb)
2510 {
2511 	return skb->len - UNIXCB(skb).consumed;
2512 }
2513 
2514 struct unix_stream_read_state {
2515 	int (*recv_actor)(struct sk_buff *, int, int,
2516 			  struct unix_stream_read_state *);
2517 	struct socket *socket;
2518 	struct msghdr *msg;
2519 	struct pipe_inode_info *pipe;
2520 	size_t size;
2521 	int flags;
2522 	unsigned int splice_flags;
2523 };
2524 
2525 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2526 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2527 {
2528 	struct socket *sock = state->socket;
2529 	struct sock *sk = sock->sk;
2530 	struct unix_sock *u = unix_sk(sk);
2531 	int chunk = 1;
2532 	struct sk_buff *oob_skb;
2533 
2534 	mutex_lock(&u->iolock);
2535 	unix_state_lock(sk);
2536 
2537 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2538 		unix_state_unlock(sk);
2539 		mutex_unlock(&u->iolock);
2540 		return -EINVAL;
2541 	}
2542 
2543 	oob_skb = u->oob_skb;
2544 
2545 	if (!(state->flags & MSG_PEEK))
2546 		WRITE_ONCE(u->oob_skb, NULL);
2547 
2548 	unix_state_unlock(sk);
2549 
2550 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2551 
2552 	if (!(state->flags & MSG_PEEK)) {
2553 		UNIXCB(oob_skb).consumed += 1;
2554 		kfree_skb(oob_skb);
2555 	}
2556 
2557 	mutex_unlock(&u->iolock);
2558 
2559 	if (chunk < 0)
2560 		return -EFAULT;
2561 
2562 	state->msg->msg_flags |= MSG_OOB;
2563 	return 1;
2564 }
2565 
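/*
 * Decide which skb, if any, the stream reader may consume next: a fully
 * read skb is unlinked (unless peeking), and the OOB skb is skipped,
 * returned for inline reading or unlinked depending on SO_OOBINLINE,
 * MSG_PEEK and whether normal data has already been copied.
 */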
2566 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2567 				  int flags, int copied)
2568 {
2569 	struct unix_sock *u = unix_sk(sk);
2570 
2571 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2572 		skb_unlink(skb, &sk->sk_receive_queue);
2573 		consume_skb(skb);
2574 		skb = NULL;
2575 	} else {
2576 		if (skb == u->oob_skb) {
2577 			if (copied) {
2578 				skb = NULL;
2579 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2580 				if (!(flags & MSG_PEEK)) {
2581 					WRITE_ONCE(u->oob_skb, NULL);
2582 					consume_skb(skb);
2583 				}
2584 			} else if (!(flags & MSG_PEEK)) {
2585 				skb_unlink(skb, &sk->sk_receive_queue);
2586 				consume_skb(skb);
2587 				skb = skb_peek(&sk->sk_receive_queue);
2588 			}
2589 		}
2590 	}
2591 	return skb;
2592 }
2593 #endif
2594 
2595 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2596 {
2597 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2598 		return -ENOTCONN;
2599 
2600 	return unix_read_skb(sk, recv_actor);
2601 }
2602 
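/*
 * Common receive path for SOCK_STREAM/SOCK_SEQPACKET recvmsg() and
 * splice_read(): walk the receive queue and feed each chunk of data to
 * state->recv_actor.
 */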
2603 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2604 				    bool freezable)
2605 {
2606 	struct scm_cookie scm;
2607 	struct socket *sock = state->socket;
2608 	struct sock *sk = sock->sk;
2609 	struct unix_sock *u = unix_sk(sk);
2610 	int copied = 0;
2611 	int flags = state->flags;
2612 	int noblock = flags & MSG_DONTWAIT;
2613 	bool check_creds = false;
2614 	int target;
2615 	int err = 0;
2616 	long timeo;
2617 	int skip;
2618 	size_t size = state->size;
2619 	unsigned int last_len;
2620 
2621 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2622 		err = -EINVAL;
2623 		goto out;
2624 	}
2625 
2626 	if (unlikely(flags & MSG_OOB)) {
2627 		err = -EOPNOTSUPP;
2628 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2629 		err = unix_stream_recv_urg(state);
2630 #endif
2631 		goto out;
2632 	}
2633 
2634 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2635 	timeo = sock_rcvtimeo(sk, noblock);
2636 
2637 	memset(&scm, 0, sizeof(scm));
2638 
2639 	/* Lock the socket to prevent queue disordering
2640 	 * while sleeping in memcpy_to_msg().
2641 	 */
2642 	mutex_lock(&u->iolock);
2643 
2644 	skip = max(sk_peek_offset(sk, flags), 0);
2645 
2646 	do {
2647 		int chunk;
2648 		bool drop_skb;
2649 		struct sk_buff *skb, *last;
2650 
2651 redo:
2652 		unix_state_lock(sk);
2653 		if (sock_flag(sk, SOCK_DEAD)) {
2654 			err = -ECONNRESET;
2655 			goto unlock;
2656 		}
2657 		last = skb = skb_peek(&sk->sk_receive_queue);
2658 		last_len = last ? last->len : 0;
2659 
2660 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2661 		if (skb) {
2662 			skb = manage_oob(skb, sk, flags, copied);
2663 			if (!skb) {
2664 				unix_state_unlock(sk);
2665 				if (copied)
2666 					break;
2667 				goto redo;
2668 			}
2669 		}
2670 #endif
2671 again:
2672 		if (skb == NULL) {
2673 			if (copied >= target)
2674 				goto unlock;
2675 
2676 			/*
2677 			 *	POSIX 1003.1g mandates this order.
2678 			 */
2679 
2680 			err = sock_error(sk);
2681 			if (err)
2682 				goto unlock;
2683 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2684 				goto unlock;
2685 
2686 			unix_state_unlock(sk);
2687 			if (!timeo) {
2688 				err = -EAGAIN;
2689 				break;
2690 			}
2691 
2692 			mutex_unlock(&u->iolock);
2693 
2694 			timeo = unix_stream_data_wait(sk, timeo, last,
2695 						      last_len, freezable);
2696 
2697 			if (signal_pending(current)) {
2698 				err = sock_intr_errno(timeo);
2699 				scm_destroy(&scm);
2700 				goto out;
2701 			}
2702 
2703 			mutex_lock(&u->iolock);
2704 			goto redo;
2705 unlock:
2706 			unix_state_unlock(sk);
2707 			break;
2708 		}
2709 
2710 		while (skip >= unix_skb_len(skb)) {
2711 			skip -= unix_skb_len(skb);
2712 			last = skb;
2713 			last_len = skb->len;
2714 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2715 			if (!skb)
2716 				goto again;
2717 		}
2718 
2719 		unix_state_unlock(sk);
2720 
2721 		if (check_creds) {
2722 			/* Never glue messages from different writers */
2723 			if (!unix_skb_scm_eq(skb, &scm))
2724 				break;
2725 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2726 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2727 			/* Copy credentials */
2728 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2729 			unix_set_secdata(&scm, skb);
2730 			check_creds = true;
2731 		}
2732 
2733 		/* Copy address just once */
2734 		if (state->msg && state->msg->msg_name) {
2735 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2736 					 state->msg->msg_name);
2737 			unix_copy_addr(state->msg, skb->sk);
2738 			sunaddr = NULL;
2739 		}
2740 
2741 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2742 		skb_get(skb);
2743 		chunk = state->recv_actor(skb, skip, chunk, state);
2744 		drop_skb = !unix_skb_len(skb);
2745 		/* skb is only safe to use if !drop_skb */
2746 		consume_skb(skb);
2747 		if (chunk < 0) {
2748 			if (copied == 0)
2749 				copied = -EFAULT;
2750 			break;
2751 		}
2752 		copied += chunk;
2753 		size -= chunk;
2754 
2755 		if (drop_skb) {
2756 			/* the skb was touched by a concurrent reader;
2757 			/* The skb was touched by a concurrent reader;
2758 			 * we should not expect anything more from it
2759 			 * and must assume it is invalid - we can be
2760 			 *
2761 			 * let's report a short read
2762 			 */
2763 			err = 0;
2764 			break;
2765 		}
2766 
2767 		/* Mark read part of skb as used */
2768 		if (!(flags & MSG_PEEK)) {
2769 			UNIXCB(skb).consumed += chunk;
2770 
2771 			sk_peek_offset_bwd(sk, chunk);
2772 
2773 			if (UNIXCB(skb).fp) {
2774 				scm_stat_del(sk, skb);
2775 				unix_detach_fds(&scm, skb);
2776 			}
2777 
2778 			if (unix_skb_len(skb))
2779 				break;
2780 
2781 			skb_unlink(skb, &sk->sk_receive_queue);
2782 			consume_skb(skb);
2783 
2784 			if (scm.fp)
2785 				break;
2786 		} else {
2787 			/* It is questionable, see note in unix_dgram_recvmsg.
2788 			 */
2789 			if (UNIXCB(skb).fp)
2790 				unix_peek_fds(&scm, skb);
2791 
2792 			sk_peek_offset_fwd(sk, chunk);
2793 
2794 			if (UNIXCB(skb).fp)
2795 				break;
2796 
2797 			skip = 0;
2798 			last = skb;
2799 			last_len = skb->len;
2800 			unix_state_lock(sk);
2801 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2802 			if (skb)
2803 				goto again;
2804 			unix_state_unlock(sk);
2805 			break;
2806 		}
2807 	} while (size);
2808 
2809 	mutex_unlock(&u->iolock);
2810 	if (state->msg)
2811 		scm_recv_unix(sock, state->msg, &scm, flags);
2812 	else
2813 		scm_destroy(&scm);
2814 out:
2815 	return copied ? : err;
2816 }
2817 
2818 static int unix_stream_read_actor(struct sk_buff *skb,
2819 				  int skip, int chunk,
2820 				  struct unix_stream_read_state *state)
2821 {
2822 	int ret;
2823 
2824 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2825 				    state->msg, chunk);
2826 	return ret ?: chunk;
2827 }
2828 
2829 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2830 			  size_t size, int flags)
2831 {
2832 	struct unix_stream_read_state state = {
2833 		.recv_actor = unix_stream_read_actor,
2834 		.socket = sk->sk_socket,
2835 		.msg = msg,
2836 		.size = size,
2837 		.flags = flags
2838 	};
2839 
2840 	return unix_stream_read_generic(&state, true);
2841 }
2842 
2843 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2844 			       size_t size, int flags)
2845 {
2846 	struct unix_stream_read_state state = {
2847 		.recv_actor = unix_stream_read_actor,
2848 		.socket = sock,
2849 		.msg = msg,
2850 		.size = size,
2851 		.flags = flags
2852 	};
2853 
2854 #ifdef CONFIG_BPF_SYSCALL
2855 	struct sock *sk = sock->sk;
2856 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2857 
2858 	if (prot != &unix_stream_proto)
2859 		return prot->recvmsg(sk, msg, size, flags, NULL);
2860 #endif
2861 	return unix_stream_read_generic(&state, true);
2862 }
2863 
2864 static int unix_stream_splice_actor(struct sk_buff *skb,
2865 				    int skip, int chunk,
2866 				    struct unix_stream_read_state *state)
2867 {
2868 	return skb_splice_bits(skb, state->socket->sk,
2869 			       UNIXCB(skb).consumed + skip,
2870 			       state->pipe, chunk, state->splice_flags);
2871 }
2872 
2873 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2874 				       struct pipe_inode_info *pipe,
2875 				       size_t size, unsigned int flags)
2876 {
2877 	struct unix_stream_read_state state = {
2878 		.recv_actor = unix_stream_splice_actor,
2879 		.socket = sock,
2880 		.pipe = pipe,
2881 		.size = size,
2882 		.splice_flags = flags,
2883 	};
2884 
2885 	if (unlikely(*ppos))
2886 		return -ESPIPE;
2887 
2888 	if (sock->file->f_flags & O_NONBLOCK ||
2889 	    flags & SPLICE_F_NONBLOCK)
2890 		state.flags = MSG_DONTWAIT;
2891 
2892 	return unix_stream_read_generic(&state, false);
2893 }
2894 
2895 static int unix_shutdown(struct socket *sock, int mode)
2896 {
2897 	struct sock *sk = sock->sk;
2898 	struct sock *other;
2899 
2900 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2901 		return -EINVAL;
2902 	/* This maps:
2903 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2904 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2905 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2906 	 */
2907 	++mode;
2908 
2909 	unix_state_lock(sk);
2910 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
2911 	other = unix_peer(sk);
2912 	if (other)
2913 		sock_hold(other);
2914 	unix_state_unlock(sk);
2915 	sk->sk_state_change(sk);
2916 
2917 	if (other &&
2918 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2919 
2920 		int peer_mode = 0;
2921 		const struct proto *prot = READ_ONCE(other->sk_prot);
2922 
2923 		if (prot->unhash)
2924 			prot->unhash(other);
2925 		if (mode&RCV_SHUTDOWN)
2926 			peer_mode |= SEND_SHUTDOWN;
2927 		if (mode&SEND_SHUTDOWN)
2928 			peer_mode |= RCV_SHUTDOWN;
2929 		unix_state_lock(other);
2930 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
2931 		unix_state_unlock(other);
2932 		other->sk_state_change(other);
2933 		if (peer_mode == SHUTDOWN_MASK)
2934 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2935 		else if (peer_mode & RCV_SHUTDOWN)
2936 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2937 	}
2938 	if (other)
2939 		sock_put(other);
2940 
2941 	return 0;
2942 }
2943 
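/*
 * Bytes readable on the socket (SIOCINQ): the sum of unread bytes for
 * SOCK_STREAM/SOCK_SEQPACKET, or the length of the next datagram
 * otherwise.
 */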
2944 long unix_inq_len(struct sock *sk)
2945 {
2946 	struct sk_buff *skb;
2947 	long amount = 0;
2948 
2949 	if (sk->sk_state == TCP_LISTEN)
2950 		return -EINVAL;
2951 
2952 	spin_lock(&sk->sk_receive_queue.lock);
2953 	if (sk->sk_type == SOCK_STREAM ||
2954 	    sk->sk_type == SOCK_SEQPACKET) {
2955 		skb_queue_walk(&sk->sk_receive_queue, skb)
2956 			amount += unix_skb_len(skb);
2957 	} else {
2958 		skb = skb_peek(&sk->sk_receive_queue);
2959 		if (skb)
2960 			amount = skb->len;
2961 	}
2962 	spin_unlock(&sk->sk_receive_queue.lock);
2963 
2964 	return amount;
2965 }
2966 EXPORT_SYMBOL_GPL(unix_inq_len);
2967 
2968 long unix_outq_len(struct sock *sk)
2969 {
2970 	return sk_wmem_alloc_get(sk);
2971 }
2972 EXPORT_SYMBOL_GPL(unix_outq_len);
2973 
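/*
 * SIOCUNIXFILE: open the filesystem object this socket is bound to and
 * return it as a new O_PATH, O_CLOEXEC file descriptor.  Requires
 * CAP_NET_ADMIN over the socket's network namespace.
 */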
2974 static int unix_open_file(struct sock *sk)
2975 {
2976 	struct path path;
2977 	struct file *f;
2978 	int fd;
2979 
2980 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2981 		return -EPERM;
2982 
2983 	if (!smp_load_acquire(&unix_sk(sk)->addr))
2984 		return -ENOENT;
2985 
2986 	path = unix_sk(sk)->path;
2987 	if (!path.dentry)
2988 		return -ENOENT;
2989 
2990 	path_get(&path);
2991 
2992 	fd = get_unused_fd_flags(O_CLOEXEC);
2993 	if (fd < 0)
2994 		goto out;
2995 
2996 	f = dentry_open(&path, O_PATH, current_cred());
2997 	if (IS_ERR(f)) {
2998 		put_unused_fd(fd);
2999 		fd = PTR_ERR(f);
3000 		goto out;
3001 	}
3002 
3003 	fd_install(fd, f);
3004 out:
3005 	path_put(&path);
3006 
3007 	return fd;
3008 }
3009 
3010 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3011 {
3012 	struct sock *sk = sock->sk;
3013 	long amount = 0;
3014 	int err;
3015 
3016 	switch (cmd) {
3017 	case SIOCOUTQ:
3018 		amount = unix_outq_len(sk);
3019 		err = put_user(amount, (int __user *)arg);
3020 		break;
3021 	case SIOCINQ:
3022 		amount = unix_inq_len(sk);
3023 		if (amount < 0)
3024 			err = amount;
3025 		else
3026 			err = put_user(amount, (int __user *)arg);
3027 		break;
3028 	case SIOCUNIXFILE:
3029 		err = unix_open_file(sk);
3030 		break;
3031 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3032 	case SIOCATMARK:
3033 		{
3034 			struct sk_buff *skb;
3035 			int answ = 0;
3036 
3037 			skb = skb_peek(&sk->sk_receive_queue);
3038 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3039 				answ = 1;
3040 			err = put_user(answ, (int __user *)arg);
3041 		}
3042 		break;
3043 #endif
3044 	default:
3045 		err = -ENOIOCTLCMD;
3046 		break;
3047 	}
3048 	return err;
3049 }
3050 
3051 #ifdef CONFIG_COMPAT
3052 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3053 {
3054 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3055 }
3056 #endif
3057 
3058 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3059 {
3060 	struct sock *sk = sock->sk;
3061 	__poll_t mask;
3062 	u8 shutdown;
3063 
3064 	sock_poll_wait(file, sock, wait);
3065 	mask = 0;
3066 	shutdown = READ_ONCE(sk->sk_shutdown);
3067 
3068 	/* exceptional events? */
3069 	if (READ_ONCE(sk->sk_err))
3070 		mask |= EPOLLERR;
3071 	if (shutdown == SHUTDOWN_MASK)
3072 		mask |= EPOLLHUP;
3073 	if (shutdown & RCV_SHUTDOWN)
3074 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3075 
3076 	/* readable? */
3077 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3078 		mask |= EPOLLIN | EPOLLRDNORM;
3079 	if (sk_is_readable(sk))
3080 		mask |= EPOLLIN | EPOLLRDNORM;
3081 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3082 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3083 		mask |= EPOLLPRI;
3084 #endif
3085 
3086 	/* Connection-based sockets need to check for termination and startup */
3087 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3088 	    sk->sk_state == TCP_CLOSE)
3089 		mask |= EPOLLHUP;
3090 
3091 	/*
3092 	 * we set writable also when the other side has shut down the
3093 	 * connection. This prevents stuck sockets.
3094 	 */
3095 	if (unix_writable(sk))
3096 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3097 
3098 	return mask;
3099 }
3100 
3101 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3102 				    poll_table *wait)
3103 {
3104 	struct sock *sk = sock->sk, *other;
3105 	unsigned int writable;
3106 	__poll_t mask;
3107 	u8 shutdown;
3108 
3109 	sock_poll_wait(file, sock, wait);
3110 	mask = 0;
3111 	shutdown = READ_ONCE(sk->sk_shutdown);
3112 
3113 	/* exceptional events? */
3114 	if (READ_ONCE(sk->sk_err) ||
3115 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3116 		mask |= EPOLLERR |
3117 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3118 
3119 	if (shutdown & RCV_SHUTDOWN)
3120 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3121 	if (shutdown == SHUTDOWN_MASK)
3122 		mask |= EPOLLHUP;
3123 
3124 	/* readable? */
3125 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3126 		mask |= EPOLLIN | EPOLLRDNORM;
3127 	if (sk_is_readable(sk))
3128 		mask |= EPOLLIN | EPOLLRDNORM;
3129 
3130 	/* Connection-based sockets need to check for termination and startup */
3131 	if (sk->sk_type == SOCK_SEQPACKET) {
3132 		if (sk->sk_state == TCP_CLOSE)
3133 			mask |= EPOLLHUP;
3134 		/* connection hasn't started yet? */
3135 		if (sk->sk_state == TCP_SYN_SENT)
3136 			return mask;
3137 	}
3138 
3139 	/* No write status requested, avoid expensive OUT tests. */
3140 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3141 		return mask;
3142 
3143 	writable = unix_writable(sk);
3144 	if (writable) {
3145 		unix_state_lock(sk);
3146 
3147 		other = unix_peer(sk);
3148 		if (other && unix_peer(other) != sk &&
3149 		    unix_recvq_full_lockless(other) &&
3150 		    unix_dgram_peer_wake_me(sk, other))
3151 			writable = 0;
3152 
3153 		unix_state_unlock(sk);
3154 	}
3155 
3156 	if (writable)
3157 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3158 	else
3159 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3160 
3161 	return mask;
3162 }
3163 
3164 #ifdef CONFIG_PROC_FS
3165 
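/*
 * The seq_file position packs the hash bucket into its upper bits and a
 * 1-based offset within that bucket into its lower bits.
 */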
3166 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3167 
3168 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3169 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3170 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3171 
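/*
 * Return the socket at the offset encoded in *pos within its bucket, or
 * NULL if the bucket holds fewer entries.  The caller holds the bucket
 * lock.
 */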
3172 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3173 {
3174 	unsigned long offset = get_offset(*pos);
3175 	unsigned long bucket = get_bucket(*pos);
3176 	unsigned long count = 0;
3177 	struct sock *sk;
3178 
3179 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3180 	     sk; sk = sk_next(sk)) {
3181 		if (++count == offset)
3182 			break;
3183 	}
3184 
3185 	return sk;
3186 }
3187 
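/*
 * Find the first socket at or after the bucket encoded in *pos, taking
 * that bucket's lock.  The lock is released by unix_get_next(),
 * unix_seq_stop() or the bpf iterator's batching code.
 */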
3188 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3189 {
3190 	unsigned long bucket = get_bucket(*pos);
3191 	struct net *net = seq_file_net(seq);
3192 	struct sock *sk;
3193 
3194 	while (bucket < UNIX_HASH_SIZE) {
3195 		spin_lock(&net->unx.table.locks[bucket]);
3196 
3197 		sk = unix_from_bucket(seq, pos);
3198 		if (sk)
3199 			return sk;
3200 
3201 		spin_unlock(&net->unx.table.locks[bucket]);
3202 
3203 		*pos = set_bucket_offset(++bucket, 1);
3204 	}
3205 
3206 	return NULL;
3207 }
3208 
3209 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3210 				  loff_t *pos)
3211 {
3212 	unsigned long bucket = get_bucket(*pos);
3213 
3214 	sk = sk_next(sk);
3215 	if (sk)
3216 		return sk;
3217 
3219 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3220 
3221 	*pos = set_bucket_offset(++bucket, 1);
3222 
3223 	return unix_get_first(seq, pos);
3224 }
3225 
3226 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3227 {
3228 	if (!*pos)
3229 		return SEQ_START_TOKEN;
3230 
3231 	return unix_get_first(seq, pos);
3232 }
3233 
3234 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3235 {
3236 	++*pos;
3237 
3238 	if (v == SEQ_START_TOKEN)
3239 		return unix_get_first(seq, pos);
3240 
3241 	return unix_get_next(seq, v, pos);
3242 }
3243 
3244 static void unix_seq_stop(struct seq_file *seq, void *v)
3245 {
3246 	struct sock *sk = v;
3247 
3248 	if (sk)
3249 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3250 }
3251 
3252 static int unix_seq_show(struct seq_file *seq, void *v)
3253 {
3254 
3255 	if (v == SEQ_START_TOKEN)
3256 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3257 			 "Inode Path\n");
3258 	else {
3259 		struct sock *s = v;
3260 		struct unix_sock *u = unix_sk(s);
3261 		unix_state_lock(s);
3262 
3263 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3264 			s,
3265 			refcount_read(&s->sk_refcnt),
3266 			0,
3267 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3268 			s->sk_type,
3269 			s->sk_socket ?
3270 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3271 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3272 			sock_i_ino(s));
3273 
3274 		if (u->addr) {	/* under a hash table lock here */
3275 			int i, len;
3276 			seq_putc(seq, ' ');
3277 
3278 			i = 0;
3279 			len = u->addr->len -
3280 				offsetof(struct sockaddr_un, sun_path);
3281 			if (u->addr->name->sun_path[0]) {
3282 				len--;
3283 			} else {
3284 				seq_putc(seq, '@');
3285 				i++;
3286 			}
3287 			for ( ; i < len; i++)
3288 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3289 					 '@');
3290 		}
3291 		unix_state_unlock(s);
3292 		seq_putc(seq, '\n');
3293 	}
3294 
3295 	return 0;
3296 }
3297 
3298 static const struct seq_operations unix_seq_ops = {
3299 	.start  = unix_seq_start,
3300 	.next   = unix_seq_next,
3301 	.stop   = unix_seq_stop,
3302 	.show   = unix_seq_show,
3303 };
3304 
3305 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3306 struct bpf_unix_iter_state {
3307 	struct seq_net_private p;
3308 	unsigned int cur_sk;
3309 	unsigned int end_sk;
3310 	unsigned int max_sk;
3311 	struct sock **batch;
3312 	bool st_bucket_done;
3313 };
3314 
3315 struct bpf_iter__unix {
3316 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3317 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3318 	uid_t uid __aligned(8);
3319 };
3320 
3321 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3322 			      struct unix_sock *unix_sk, uid_t uid)
3323 {
3324 	struct bpf_iter__unix ctx;
3325 
3326 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3327 	ctx.meta = meta;
3328 	ctx.unix_sk = unix_sk;
3329 	ctx.uid = uid;
3330 	return bpf_iter_run_prog(prog, &ctx);
3331 }
3332 
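/*
 * Take a reference on every socket in @start_sk's hash chain (up to
 * iter->max_sk of them) so the bucket lock can be dropped while the
 * batch is walked.  Returns the total number of sockets in the chain so
 * the caller can grow the batch and retry if not all of them fit.
 */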
3333 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3335 {
3336 	struct bpf_unix_iter_state *iter = seq->private;
3337 	unsigned int expected = 1;
3338 	struct sock *sk;
3339 
3340 	sock_hold(start_sk);
3341 	iter->batch[iter->end_sk++] = start_sk;
3342 
3343 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3344 		if (iter->end_sk < iter->max_sk) {
3345 			sock_hold(sk);
3346 			iter->batch[iter->end_sk++] = sk;
3347 		}
3348 
3349 		expected++;
3350 	}
3351 
3352 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3353 
3354 	return expected;
3355 }
3356 
3357 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3358 {
3359 	while (iter->cur_sk < iter->end_sk)
3360 		sock_put(iter->batch[iter->cur_sk++]);
3361 }
3362 
3363 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3364 				       unsigned int new_batch_sz)
3365 {
3366 	struct sock **new_batch;
3367 
3368 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3369 			     GFP_USER | __GFP_NOWARN);
3370 	if (!new_batch)
3371 		return -ENOMEM;
3372 
3373 	bpf_iter_unix_put_batch(iter);
3374 	kvfree(iter->batch);
3375 	iter->batch = new_batch;
3376 	iter->max_sk = new_batch_sz;
3377 
3378 	return 0;
3379 }
3380 
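/*
 * Fill iter->batch with the sockets of the next non-empty bucket,
 * growing the batch array once if the whole bucket did not fit.
 */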
3381 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3382 					loff_t *pos)
3383 {
3384 	struct bpf_unix_iter_state *iter = seq->private;
3385 	unsigned int expected;
3386 	bool resized = false;
3387 	struct sock *sk;
3388 
3389 	if (iter->st_bucket_done)
3390 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3391 
3392 again:
3393 	/* Get a new batch */
3394 	iter->cur_sk = 0;
3395 	iter->end_sk = 0;
3396 
3397 	sk = unix_get_first(seq, pos);
3398 	if (!sk)
3399 		return NULL; /* Done */
3400 
3401 	expected = bpf_iter_unix_hold_batch(seq, sk);
3402 
3403 	if (iter->end_sk == expected) {
3404 		iter->st_bucket_done = true;
3405 		return sk;
3406 	}
3407 
3408 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3409 		resized = true;
3410 		goto again;
3411 	}
3412 
3413 	return sk;
3414 }
3415 
3416 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3417 {
3418 	if (!*pos)
3419 		return SEQ_START_TOKEN;
3420 
3421 	/* bpf iter does not support lseek, so it always
3422 	 * continues from where it was stop()-ped.
3423 	 */
3424 	return bpf_iter_unix_batch(seq, pos);
3425 }
3426 
3427 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3428 {
3429 	struct bpf_unix_iter_state *iter = seq->private;
3430 	struct sock *sk;
3431 
3432 	/* Whenever seq_next() is called, iter->batch[iter->cur_sk] is
3433 	 * done with seq_show(), so advance to the next sk in
3434 	 * the batch.
3435 	 */
3436 	if (iter->cur_sk < iter->end_sk)
3437 		sock_put(iter->batch[iter->cur_sk++]);
3438 
3439 	++*pos;
3440 
3441 	if (iter->cur_sk < iter->end_sk)
3442 		sk = iter->batch[iter->cur_sk];
3443 	else
3444 		sk = bpf_iter_unix_batch(seq, pos);
3445 
3446 	return sk;
3447 }
3448 
3449 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3450 {
3451 	struct bpf_iter_meta meta;
3452 	struct bpf_prog *prog;
3453 	struct sock *sk = v;
3454 	uid_t uid;
3455 	bool slow;
3456 	int ret;
3457 
3458 	if (v == SEQ_START_TOKEN)
3459 		return 0;
3460 
3461 	slow = lock_sock_fast(sk);
3462 
3463 	if (unlikely(sk_unhashed(sk))) {
3464 		ret = SEQ_SKIP;
3465 		goto unlock;
3466 	}
3467 
3468 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3469 	meta.seq = seq;
3470 	prog = bpf_iter_get_info(&meta, false);
3471 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3472 unlock:
3473 	unlock_sock_fast(sk, slow);
3474 	return ret;
3475 }
3476 
3477 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3478 {
3479 	struct bpf_unix_iter_state *iter = seq->private;
3480 	struct bpf_iter_meta meta;
3481 	struct bpf_prog *prog;
3482 
3483 	if (!v) {
3484 		meta.seq = seq;
3485 		prog = bpf_iter_get_info(&meta, true);
3486 		if (prog)
3487 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3488 	}
3489 
3490 	if (iter->cur_sk < iter->end_sk)
3491 		bpf_iter_unix_put_batch(iter);
3492 }
3493 
3494 static const struct seq_operations bpf_iter_unix_seq_ops = {
3495 	.start	= bpf_iter_unix_seq_start,
3496 	.next	= bpf_iter_unix_seq_next,
3497 	.stop	= bpf_iter_unix_seq_stop,
3498 	.show	= bpf_iter_unix_seq_show,
3499 };
3500 #endif
3501 #endif
3502 
3503 static const struct net_proto_family unix_family_ops = {
3504 	.family = PF_UNIX,
3505 	.create = unix_create,
3506 	.owner	= THIS_MODULE,
3507 };
3508 
3509 
3510 static int __net_init unix_net_init(struct net *net)
3511 {
3512 	int i;
3513 
3514 	net->unx.sysctl_max_dgram_qlen = 10;
3515 	if (unix_sysctl_register(net))
3516 		goto out;
3517 
3518 #ifdef CONFIG_PROC_FS
3519 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3520 			     sizeof(struct seq_net_private)))
3521 		goto err_sysctl;
3522 #endif
3523 
3524 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3525 					      sizeof(spinlock_t), GFP_KERNEL);
3526 	if (!net->unx.table.locks)
3527 		goto err_proc;
3528 
3529 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3530 						sizeof(struct hlist_head),
3531 						GFP_KERNEL);
3532 	if (!net->unx.table.buckets)
3533 		goto free_locks;
3534 
3535 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3536 		spin_lock_init(&net->unx.table.locks[i]);
3537 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3538 	}
3539 
3540 	return 0;
3541 
3542 free_locks:
3543 	kvfree(net->unx.table.locks);
3544 err_proc:
3545 #ifdef CONFIG_PROC_FS
3546 	remove_proc_entry("unix", net->proc_net);
3547 err_sysctl:
3548 #endif
3549 	unix_sysctl_unregister(net);
3550 out:
3551 	return -ENOMEM;
3552 }
3553 
3554 static void __net_exit unix_net_exit(struct net *net)
3555 {
3556 	kvfree(net->unx.table.buckets);
3557 	kvfree(net->unx.table.locks);
3558 	unix_sysctl_unregister(net);
3559 	remove_proc_entry("unix", net->proc_net);
3560 }
3561 
3562 static struct pernet_operations unix_net_ops = {
3563 	.init = unix_net_init,
3564 	.exit = unix_net_exit,
3565 };
3566 
3567 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3568 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3569 		     struct unix_sock *unix_sk, uid_t uid)
3570 
3571 #define INIT_BATCH_SZ 16
3572 
3573 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3574 {
3575 	struct bpf_unix_iter_state *iter = priv_data;
3576 	int err;
3577 
3578 	err = bpf_iter_init_seq_net(priv_data, aux);
3579 	if (err)
3580 		return err;
3581 
3582 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3583 	if (err) {
3584 		bpf_iter_fini_seq_net(priv_data);
3585 		return err;
3586 	}
3587 
3588 	return 0;
3589 }
3590 
3591 static void bpf_iter_fini_unix(void *priv_data)
3592 {
3593 	struct bpf_unix_iter_state *iter = priv_data;
3594 
3595 	bpf_iter_fini_seq_net(priv_data);
3596 	kvfree(iter->batch);
3597 }
3598 
3599 static const struct bpf_iter_seq_info unix_seq_info = {
3600 	.seq_ops		= &bpf_iter_unix_seq_ops,
3601 	.init_seq_private	= bpf_iter_init_unix,
3602 	.fini_seq_private	= bpf_iter_fini_unix,
3603 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3604 };
3605 
3606 static const struct bpf_func_proto *
3607 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3608 			     const struct bpf_prog *prog)
3609 {
3610 	switch (func_id) {
3611 	case BPF_FUNC_setsockopt:
3612 		return &bpf_sk_setsockopt_proto;
3613 	case BPF_FUNC_getsockopt:
3614 		return &bpf_sk_getsockopt_proto;
3615 	default:
3616 		return NULL;
3617 	}
3618 }
3619 
3620 static struct bpf_iter_reg unix_reg_info = {
3621 	.target			= "unix",
3622 	.ctx_arg_info_size	= 1,
3623 	.ctx_arg_info		= {
3624 		{ offsetof(struct bpf_iter__unix, unix_sk),
3625 		  PTR_TO_BTF_ID_OR_NULL },
3626 	},
3627 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3628 	.seq_info		= &unix_seq_info,
3629 };
3630 
3631 static void __init bpf_iter_register(void)
3632 {
3633 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3634 	if (bpf_iter_reg_target(&unix_reg_info))
3635 		pr_warn("Warning: could not register bpf iterator unix\n");
3636 }
3637 #endif
3638 
3639 static int __init af_unix_init(void)
3640 {
3641 	int i, rc = -1;
3642 
3643 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3644 
3645 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3646 		spin_lock_init(&bsd_socket_locks[i]);
3647 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3648 	}
3649 
3650 	rc = proto_register(&unix_dgram_proto, 1);
3651 	if (rc != 0) {
3652 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3653 		goto out;
3654 	}
3655 
3656 	rc = proto_register(&unix_stream_proto, 1);
3657 	if (rc != 0) {
3658 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3659 		proto_unregister(&unix_dgram_proto);
3660 		goto out;
3661 	}
3662 
3663 	sock_register(&unix_family_ops);
3664 	register_pernet_subsys(&unix_net_ops);
3665 	unix_bpf_build_proto();
3666 
3667 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3668 	bpf_iter_register();
3669 #endif
3670 
3671 out:
3672 	return rc;
3673 }
3674 
3675 static void __exit af_unix_exit(void)
3676 {
3677 	sock_unregister(PF_UNIX);
3678 	proto_unregister(&unix_dgram_proto);
3679 	proto_unregister(&unix_stream_proto);
3680 	unregister_pernet_subsys(&unix_net_ops);
3681 }
3682 
3683 /* Earlier than device_initcall() so that other drivers invoking
3684    request_module() don't end up in a loop when modprobe tries
3685    to use a UNIX socket. But later than subsys_initcall() because
3686    we depend on stuff initialised there. */
3687 fs_initcall(af_unix_init);
3688 module_exit(af_unix_exit);
3689 
3690 MODULE_LICENSE("GPL");
3691 MODULE_ALIAS_NETPROTO(PF_UNIX);
3692