xref: /linux/net/unix/af_unix.c (revision 4359a011e259a4608afc7fb3635370c9d4ba5943)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amount
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/freezer.h>
116 #include <linux/file.h>
117 #include <linux/btf_ids.h>
118 
119 #include "scm.h"
120 
/* Count of AF_UNIX sockets: incremented in unix_create1() and decremented
 * again in its error path and in unix_sock_destructor().
 */
static atomic_long_t unix_nr_socks;
/* File-wide (not per-netns) hash table of sockets linked through
 * sk_bind_node; unix_find_socket_byinode() searches it by inode.
 * bsd_socket_locks[n] protects bsd_socket_buckets[n].
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	if (hash1 == hash2) {
163 		spin_lock(&net->unx.table.locks[hash1]);
164 		return;
165 	}
166 
167 	if (hash1 > hash2)
168 		swap(hash1, hash2);
169 
170 	spin_lock(&net->unx.table.locks[hash1]);
171 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173 
174 static void unix_table_double_unlock(struct net *net,
175 				     unsigned int hash1, unsigned int hash2)
176 {
177 	if (hash1 == hash2) {
178 		spin_unlock(&net->unx.table.locks[hash1]);
179 		return;
180 	}
181 
182 	spin_unlock(&net->unx.table.locks[hash1]);
183 	spin_unlock(&net->unx.table.locks[hash2]);
184 }
185 
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189 	UNIXCB(skb).secid = scm->secid;
190 }
191 
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194 	scm->secid = UNIXCB(skb).secid;
195 }
196 
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199 	return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204 
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207 
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210 	return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213 
/* Peer pointer of a unix socket.  Expands to an lvalue: it is read in
 * unix_peer_get() and assigned through in unix_release_sock().
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
215 
/* Non-zero when @osk has @sk recorded as its peer. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
220 
/* Non-zero when @osk either has no peer at all or has @sk as its peer. */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	if (!unix_peer(osk))
		return 1;

	return unix_our_peer(sk, osk);
}
225 
226 static inline int unix_recvq_full(const struct sock *sk)
227 {
228 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
229 }
230 
231 static inline int unix_recvq_full_lockless(const struct sock *sk)
232 {
233 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
234 		READ_ONCE(sk->sk_max_ack_backlog);
235 }
236 
237 struct sock *unix_peer_get(struct sock *s)
238 {
239 	struct sock *peer;
240 
241 	unix_state_lock(s);
242 	peer = unix_peer(s);
243 	if (peer)
244 		sock_hold(peer);
245 	unix_state_unlock(s);
246 	return peer;
247 }
248 EXPORT_SYMBOL_GPL(unix_peer_get);
249 
250 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
251 					     int addr_len)
252 {
253 	struct unix_address *addr;
254 
255 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
256 	if (!addr)
257 		return NULL;
258 
259 	refcount_set(&addr->refcnt, 1);
260 	addr->len = addr_len;
261 	memcpy(addr->name, sunaddr, addr_len);
262 
263 	return addr;
264 }
265 
266 static inline void unix_release_addr(struct unix_address *addr)
267 {
268 	if (refcount_dec_and_test(&addr->refcnt))
269 		kfree(addr);
270 }
271 
/*
 *	Sanity-check a unix socket address:
 *		- @addr_len must cover at least one byte of sun_path and must
 *		  not exceed sizeof(struct sockaddr_un);
 *		- sun_family must be AF_UNIX.
 *	A name whose first byte is non-zero is a filesystem object (it gets
 *	NUL-terminated by unix_mkname_bsd()); a name starting with a zero
 *	byte is an abstract name.
 *
 *	Returns 0 if the address is acceptable, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path))
		return -EINVAL;

	if (addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
290 
/* NUL-terminate the address by writing a zero byte at offset @addr_len. */
static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	char *raw = (char *)sunaddr;

	/* Not an off-by-one, although it may look like one.  The longest
	 * valid AF_UNIX path for a binding is 108 bytes, so sun_path[108]
	 * does not exist as such.  In kernel space the byte is nevertheless
	 * valid memory inside our kernel address buffer: syscall functions
	 * always hand us a pointer to a struct sockaddr_storage, whose
	 * buffer is bigger than 108.
	 */
	raw[addr_len] = '\0';
}
303 
/* Unhash @sk and re-initialise its node.  Takes no lock itself; see
 * unix_remove_socket() for the locked variant.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
308 
309 static void __unix_insert_socket(struct net *net, struct sock *sk)
310 {
311 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
312 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
313 }
314 
/* Give @sk the address @addr and move it to bucket @hash of @net's table.
 *
 * The socket is unhashed first, then the address is published, then
 * sk_hash is updated and the socket is hashed again.  No lock is taken
 * here.
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
				 struct unix_address *addr, unsigned int hash)
{
	__unix_remove_socket(sk);
	/* release store: NOTE(review) presumably paired with acquire loads
	 * of ->addr elsewhere in the file - confirm against the readers.
	 */
	smp_store_release(&unix_sk(sk)->addr, addr);

	sk->sk_hash = hash;
	__unix_insert_socket(net, sk);
}
324 
325 static void unix_remove_socket(struct net *net, struct sock *sk)
326 {
327 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
328 	__unix_remove_socket(sk);
329 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
330 }
331 
332 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
333 {
334 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
335 	__unix_insert_socket(net, sk);
336 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
337 }
338 
339 static void unix_insert_bsd_socket(struct sock *sk)
340 {
341 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
342 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
343 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
344 }
345 
346 static void unix_remove_bsd_socket(struct sock *sk)
347 {
348 	if (!hlist_unhashed(&sk->sk_bind_node)) {
349 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
350 		__sk_del_bind_node(sk);
351 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
352 
353 		sk_node_init(&sk->sk_bind_node);
354 	}
355 }
356 
357 static struct sock *__unix_find_socket_byname(struct net *net,
358 					      struct sockaddr_un *sunname,
359 					      int len, unsigned int hash)
360 {
361 	struct sock *s;
362 
363 	sk_for_each(s, &net->unx.table.buckets[hash]) {
364 		struct unix_sock *u = unix_sk(s);
365 
366 		if (u->addr->len == len &&
367 		    !memcmp(u->addr->name, sunname, len))
368 			return s;
369 	}
370 	return NULL;
371 }
372 
373 static inline struct sock *unix_find_socket_byname(struct net *net,
374 						   struct sockaddr_un *sunname,
375 						   int len, unsigned int hash)
376 {
377 	struct sock *s;
378 
379 	spin_lock(&net->unx.table.locks[hash]);
380 	s = __unix_find_socket_byname(net, sunname, len, hash);
381 	if (s)
382 		sock_hold(s);
383 	spin_unlock(&net->unx.table.locks[hash]);
384 	return s;
385 }
386 
387 static struct sock *unix_find_socket_byinode(struct inode *i)
388 {
389 	unsigned int hash = unix_bsd_hash(i);
390 	struct sock *s;
391 
392 	spin_lock(&bsd_socket_locks[hash]);
393 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
394 		struct dentry *dentry = unix_sk(s)->path.dentry;
395 
396 		if (dentry && d_backing_inode(dentry) == i) {
397 			sock_hold(s);
398 			spin_unlock(&bsd_socket_locks[hash]);
399 			return s;
400 		}
401 	}
402 	spin_unlock(&bsd_socket_locks[hash]);
403 	return NULL;
404 }
405 
406 /* Support code for asymmetrically connected dgram sockets
407  *
408  * If a datagram socket is connected to a socket not itself connected
409  * to the first socket (eg, /dev/log), clients may only enqueue more
410  * messages if the present receive queue of the server socket is not
411  * "too large". This means there's a second writeability condition
412  * poll and sendmsg need to test. The dgram recv code will do a wake
413  * up on the peer_wait wait queue of a socket upon reception of a
414  * datagram which needs to be propagated to sleeping would-be writers
415  * since these might not have sent anything so far. This can't be
416  * accomplished via poll_wait because the lifetime of the server
417  * socket might be less than that of its clients if these break their
418  * association with it or if the server socket is closed while clients
419  * are still connected to it and there's no way to inform "a polling
420  * implementation" that it should let go of a certain wait queue
421  *
422  * In order to propagate a wake up, a wait_queue_entry_t of the client
423  * socket is enqueued on the peer_wait queue of the server socket
424  * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
429  */
430 
/* Wake function of u->peer_wake (installed with
 * init_waitqueue_func_entry() in unix_create1()).
 *
 * The entry removes itself from the peer_wait queue it was added to,
 * clears peer_wake.private and passes the wake-up on to the sleepers of
 * its own socket.  Always returns 0.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	/* q is the peer_wake member embedded in the waiting unix_sock */
	u = container_of(q, struct unix_sock, peer_wake);

	/* peer_wake.private is the socket whose peer_wait queue holds q.
	 * NOTE(review): the unlocked __remove_wait_queue() presumably relies
	 * on the waker holding that queue's lock - confirm.
	 */
	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key_to_poll(key));

	return 0;
}
450 
451 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
452 {
453 	struct unix_sock *u, *u_other;
454 	int rc;
455 
456 	u = unix_sk(sk);
457 	u_other = unix_sk(other);
458 	rc = 0;
459 	spin_lock(&u_other->peer_wait.lock);
460 
461 	if (!u->peer_wake.private) {
462 		u->peer_wake.private = other;
463 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
464 
465 		rc = 1;
466 	}
467 
468 	spin_unlock(&u_other->peer_wait.lock);
469 	return rc;
470 }
471 
472 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
473 					    struct sock *other)
474 {
475 	struct unix_sock *u, *u_other;
476 
477 	u = unix_sk(sk);
478 	u_other = unix_sk(other);
479 	spin_lock(&u_other->peer_wait.lock);
480 
481 	if (u->peer_wake.private == other) {
482 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
483 		u->peer_wake.private = NULL;
484 	}
485 
486 	spin_unlock(&u_other->peer_wait.lock);
487 }
488 
489 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
490 						   struct sock *other)
491 {
492 	unix_dgram_peer_wake_disconnect(sk, other);
493 	wake_up_interruptible_poll(sk_sleep(sk),
494 				   EPOLLOUT |
495 				   EPOLLWRNORM |
496 				   EPOLLWRBAND);
497 }
498 
499 /* preconditions:
500  *	- unix_peer(sk) == other
501  *	- association is stable
502  */
503 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
504 {
505 	int connected;
506 
507 	connected = unix_dgram_peer_wake_connect(sk, other);
508 
509 	/* If other is SOCK_DEAD, we want to make sure we signal
510 	 * POLLOUT, such that a subsequent write() can get a
511 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
512 	 * to other and its full, we will hang waiting for POLLOUT.
513 	 */
514 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
515 		return 1;
516 
517 	if (connected)
518 		unix_dgram_peer_wake_disconnect(sk, other);
519 
520 	return 0;
521 }
522 
523 static int unix_writable(const struct sock *sk)
524 {
525 	return sk->sk_state != TCP_LISTEN &&
526 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
527 }
528 
529 static void unix_write_space(struct sock *sk)
530 {
531 	struct socket_wq *wq;
532 
533 	rcu_read_lock();
534 	if (unix_writable(sk)) {
535 		wq = rcu_dereference(sk->sk_wq);
536 		if (skwq_has_sleeper(wq))
537 			wake_up_interruptible_sync_poll(&wq->wait,
538 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
539 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
540 	}
541 	rcu_read_unlock();
542 }
543 
544 /* When dgram socket disconnects (or changes its peer), we clear its receive
545  * queue of packets arrived from previous peer. First, it allows to do
546  * flow control based only on wmem_alloc; second, sk connected to peer
547  * may receive messages only from that peer. */
548 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
549 {
550 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
551 		skb_queue_purge(&sk->sk_receive_queue);
552 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
553 
554 		/* If one link of bidirectional dgram pipe is disconnected,
555 		 * we signal error. Messages are lost. Do not make this,
556 		 * when peer was not connected to us.
557 		 */
558 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
559 			other->sk_err = ECONNRESET;
560 			sk_error_report(other);
561 		}
562 	}
563 	other->sk_state = TCP_CLOSE;
564 }
565 
/* sk_destruct callback (installed in unix_create1()): drops whatever is
 * still queued for the socket, releases its address and undoes the
 * accounting done at creation time.
 */
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	/* the out-of-band skb is tracked separately from the receive queue */
	if (u->oob_skb) {
		kfree_skb(u->oob_skb);
		u->oob_skb = NULL;
	}
#endif
	/* debug-only sanity checks: no write memory left, socket unhashed
	 * and no longer attached to a struct socket
	 */
	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		/* not marked dead: log it and skip the rest of the teardown */
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	/* drop this socket's reference on its address, if it has one */
	if (u->addr)
		unix_release_addr(u->addr);

	/* balance the accounting done in unix_create1() */
	atomic_long_dec(&unix_nr_socks);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
596 
/* Tear down @sk: unhash it, detach it from its peer and path, flush its
 * receive queue and drop a reference on it.
 *
 * @embrion is non-zero when this is called for a socket that was still
 * sitting in a listener's receive queue (see the recursive call below);
 * in that case the peer always gets ECONNRESET.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct sock *skpair;
	struct sk_buff *skb;
	struct path path;
	int state;

	/* make the socket unreachable through both hash tables first */
	unix_remove_socket(sock_net(sk), sk);
	unix_remove_bsd_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	/* take over the path reference; it is put after the lock is dropped */
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	/* remember the old state: a listener needs its queue handled specially */
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;

	skpair = unix_peer(sk);
	unix_peer(sk) = NULL;

	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			/* unread data (or an embryo) turns the close into a reset */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* on a listener each queued skb stands for a pending socket
		 * (skb->sk), which is released as an embryo
		 */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
672 
673 static void init_peercred(struct sock *sk)
674 {
675 	const struct cred *old_cred;
676 	struct pid *old_pid;
677 
678 	spin_lock(&sk->sk_peer_lock);
679 	old_pid = sk->sk_peer_pid;
680 	old_cred = sk->sk_peer_cred;
681 	sk->sk_peer_pid  = get_pid(task_tgid(current));
682 	sk->sk_peer_cred = get_current_cred();
683 	spin_unlock(&sk->sk_peer_lock);
684 
685 	put_pid(old_pid);
686 	put_cred(old_cred);
687 }
688 
689 static void copy_peercred(struct sock *sk, struct sock *peersk)
690 {
691 	const struct cred *old_cred;
692 	struct pid *old_pid;
693 
694 	if (sk < peersk) {
695 		spin_lock(&sk->sk_peer_lock);
696 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
697 	} else {
698 		spin_lock(&peersk->sk_peer_lock);
699 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
700 	}
701 	old_pid = sk->sk_peer_pid;
702 	old_cred = sk->sk_peer_cred;
703 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
704 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
705 
706 	spin_unlock(&sk->sk_peer_lock);
707 	spin_unlock(&peersk->sk_peer_lock);
708 
709 	put_pid(old_pid);
710 	put_cred(old_cred);
711 }
712 
713 static int unix_listen(struct socket *sock, int backlog)
714 {
715 	int err;
716 	struct sock *sk = sock->sk;
717 	struct unix_sock *u = unix_sk(sk);
718 
719 	err = -EOPNOTSUPP;
720 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
721 		goto out;	/* Only stream/seqpacket sockets accept */
722 	err = -EINVAL;
723 	if (!u->addr)
724 		goto out;	/* No listens on an unbound socket */
725 	unix_state_lock(sk);
726 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
727 		goto out_unlock;
728 	if (backlog > sk->sk_max_ack_backlog)
729 		wake_up_interruptible_all(&u->peer_wait);
730 	sk->sk_max_ack_backlog	= backlog;
731 	sk->sk_state		= TCP_LISTEN;
732 	/* set credentials so connect can copy them */
733 	init_peercred(sk);
734 	err = 0;
735 
736 out_unlock:
737 	unix_state_unlock(sk);
738 out:
739 	return err;
740 }
741 
/* Forward declarations of the handlers that the proto_ops tables below
 * (unix_stream_ops, unix_dgram_ops, unix_seqpacket_ops) refer to.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int, bool);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
773 
774 static int unix_set_peek_off(struct sock *sk, int val)
775 {
776 	struct unix_sock *u = unix_sk(sk);
777 
778 	if (mutex_lock_interruptible(&u->iolock))
779 		return -EINTR;
780 
781 	sk->sk_peek_off = val;
782 	mutex_unlock(&u->iolock);
783 
784 	return 0;
785 }
786 
#ifdef CONFIG_PROC_FS
/* show_fdinfo handler: print the socket's scm_stat.nr_fds counter as an
 * "scm_fds:" line.  Prints nothing when the socket has no sock attached.
 */
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return;

	seq_printf(m, "scm_fds: %u\n",
		   atomic_read(&unix_sk(sk)->scm_stat.nr_fds));
}
#else
#define unix_show_fdinfo NULL
#endif
802 
803 static const struct proto_ops unix_stream_ops = {
804 	.family =	PF_UNIX,
805 	.owner =	THIS_MODULE,
806 	.release =	unix_release,
807 	.bind =		unix_bind,
808 	.connect =	unix_stream_connect,
809 	.socketpair =	unix_socketpair,
810 	.accept =	unix_accept,
811 	.getname =	unix_getname,
812 	.poll =		unix_poll,
813 	.ioctl =	unix_ioctl,
814 #ifdef CONFIG_COMPAT
815 	.compat_ioctl =	unix_compat_ioctl,
816 #endif
817 	.listen =	unix_listen,
818 	.shutdown =	unix_shutdown,
819 	.sendmsg =	unix_stream_sendmsg,
820 	.recvmsg =	unix_stream_recvmsg,
821 	.read_skb =	unix_stream_read_skb,
822 	.mmap =		sock_no_mmap,
823 	.sendpage =	unix_stream_sendpage,
824 	.splice_read =	unix_stream_splice_read,
825 	.set_peek_off =	unix_set_peek_off,
826 	.show_fdinfo =	unix_show_fdinfo,
827 };
828 
829 static const struct proto_ops unix_dgram_ops = {
830 	.family =	PF_UNIX,
831 	.owner =	THIS_MODULE,
832 	.release =	unix_release,
833 	.bind =		unix_bind,
834 	.connect =	unix_dgram_connect,
835 	.socketpair =	unix_socketpair,
836 	.accept =	sock_no_accept,
837 	.getname =	unix_getname,
838 	.poll =		unix_dgram_poll,
839 	.ioctl =	unix_ioctl,
840 #ifdef CONFIG_COMPAT
841 	.compat_ioctl =	unix_compat_ioctl,
842 #endif
843 	.listen =	sock_no_listen,
844 	.shutdown =	unix_shutdown,
845 	.sendmsg =	unix_dgram_sendmsg,
846 	.read_skb =	unix_read_skb,
847 	.recvmsg =	unix_dgram_recvmsg,
848 	.mmap =		sock_no_mmap,
849 	.sendpage =	sock_no_sendpage,
850 	.set_peek_off =	unix_set_peek_off,
851 	.show_fdinfo =	unix_show_fdinfo,
852 };
853 
854 static const struct proto_ops unix_seqpacket_ops = {
855 	.family =	PF_UNIX,
856 	.owner =	THIS_MODULE,
857 	.release =	unix_release,
858 	.bind =		unix_bind,
859 	.connect =	unix_stream_connect,
860 	.socketpair =	unix_socketpair,
861 	.accept =	unix_accept,
862 	.getname =	unix_getname,
863 	.poll =		unix_dgram_poll,
864 	.ioctl =	unix_ioctl,
865 #ifdef CONFIG_COMPAT
866 	.compat_ioctl =	unix_compat_ioctl,
867 #endif
868 	.listen =	unix_listen,
869 	.shutdown =	unix_shutdown,
870 	.sendmsg =	unix_seqpacket_sendmsg,
871 	.recvmsg =	unix_seqpacket_recvmsg,
872 	.mmap =		sock_no_mmap,
873 	.sendpage =	sock_no_sendpage,
874 	.set_peek_off =	unix_set_peek_off,
875 	.show_fdinfo =	unix_show_fdinfo,
876 };
877 
/* ->close() hook of the unix protos.  Intentionally empty. */
static void unix_close(struct sock *sk, long timeout)
{
	/* A unix socket has no work to do in ->close(); the hook exists
	 * merely for sockmap.
	 */
}
884 
/* ->unhash() hook of unix_stream_proto.  Intentionally empty. */
static void unix_unhash(struct sock *sk)
{
	/* A unix socket has no work to do in ->unhash(); the hook exists
	 * merely for sockmap.
	 */
}
891 
892 struct proto unix_dgram_proto = {
893 	.name			= "UNIX",
894 	.owner			= THIS_MODULE,
895 	.obj_size		= sizeof(struct unix_sock),
896 	.close			= unix_close,
897 #ifdef CONFIG_BPF_SYSCALL
898 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
899 #endif
900 };
901 
902 struct proto unix_stream_proto = {
903 	.name			= "UNIX-STREAM",
904 	.owner			= THIS_MODULE,
905 	.obj_size		= sizeof(struct unix_sock),
906 	.close			= unix_close,
907 	.unhash			= unix_unhash,
908 #ifdef CONFIG_BPF_SYSCALL
909 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
910 #endif
911 };
912 
913 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
914 {
915 	struct unix_sock *u;
916 	struct sock *sk;
917 	int err;
918 
919 	atomic_long_inc(&unix_nr_socks);
920 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
921 		err = -ENFILE;
922 		goto err;
923 	}
924 
925 	if (type == SOCK_STREAM)
926 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
927 	else /*dgram and  seqpacket */
928 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
929 
930 	if (!sk) {
931 		err = -ENOMEM;
932 		goto err;
933 	}
934 
935 	sock_init_data(sock, sk);
936 
937 	sk->sk_hash		= unix_unbound_hash(sk);
938 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
939 	sk->sk_write_space	= unix_write_space;
940 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
941 	sk->sk_destruct		= unix_sock_destructor;
942 	u	  = unix_sk(sk);
943 	u->path.dentry = NULL;
944 	u->path.mnt = NULL;
945 	spin_lock_init(&u->lock);
946 	atomic_long_set(&u->inflight, 0);
947 	INIT_LIST_HEAD(&u->link);
948 	mutex_init(&u->iolock); /* single task reading lock */
949 	mutex_init(&u->bindlock); /* single task binding lock */
950 	init_waitqueue_head(&u->peer_wait);
951 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
952 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
953 	unix_insert_unbound_socket(net, sk);
954 
955 	sock_prot_inuse_add(net, sk->sk_prot, 1);
956 
957 	return sk;
958 
959 err:
960 	atomic_long_dec(&unix_nr_socks);
961 	return ERR_PTR(err);
962 }
963 
964 static int unix_create(struct net *net, struct socket *sock, int protocol,
965 		       int kern)
966 {
967 	struct sock *sk;
968 
969 	if (protocol && protocol != PF_UNIX)
970 		return -EPROTONOSUPPORT;
971 
972 	sock->state = SS_UNCONNECTED;
973 
974 	switch (sock->type) {
975 	case SOCK_STREAM:
976 		sock->ops = &unix_stream_ops;
977 		break;
978 		/*
979 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
980 		 *	nothing uses it.
981 		 */
982 	case SOCK_RAW:
983 		sock->type = SOCK_DGRAM;
984 		fallthrough;
985 	case SOCK_DGRAM:
986 		sock->ops = &unix_dgram_ops;
987 		break;
988 	case SOCK_SEQPACKET:
989 		sock->ops = &unix_seqpacket_ops;
990 		break;
991 	default:
992 		return -ESOCKTNOSUPPORT;
993 	}
994 
995 	sk = unix_create1(net, sock, kern, sock->type);
996 	if (IS_ERR(sk))
997 		return PTR_ERR(sk);
998 
999 	return 0;
1000 }
1001 
1002 static int unix_release(struct socket *sock)
1003 {
1004 	struct sock *sk = sock->sk;
1005 
1006 	if (!sk)
1007 		return 0;
1008 
1009 	sk->sk_prot->close(sk, 0);
1010 	unix_release_sock(sk, 0);
1011 	sock->sk = NULL;
1012 
1013 	return 0;
1014 }
1015 
/* Resolve a filesystem-path address to the socket bound to it.
 *
 * The path is NUL-terminated, looked up (following symlinks) and checked
 * for write permission.  On success the socket is returned with a
 * reference held and the path's atime is touched.
 *
 * Errors (as ERR_PTR): whatever kern_path() or path_permission() report,
 * -ECONNREFUSED if the inode is not a socket or no socket is bound to it,
 * -EPROTOTYPE if the socket's type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);

	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		return ERR_PTR(err);

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto out_path;

	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode)) {
		err = -ECONNREFUSED;
		goto out_path;
	}

	sk = unix_find_socket_byinode(inode);
	if (!sk) {
		err = -ECONNREFUSED;
		goto out_path;
	}

	if (sk->sk_type != type) {
		err = -EPROTOTYPE;
		goto out_sock;
	}

	touch_atime(&path);
	path_put(&path);

	return sk;

out_sock:
	sock_put(sk);
out_path:
	path_put(&path);
	return ERR_PTR(err);
}
1059 
/*
 * Look up a socket by abstract name in @net.
 *
 * Returns the socket from unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when no socket carries that name.  If the socket
 * found has a filesystem path attached, its access time is touched.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	struct unix_sock *u;
	struct sock *sk;
	unsigned int hash;

	hash = unix_abstract_hash(sunaddr, addr_len, type);
	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	u = unix_sk(sk);
	if (u->path.dentry)
		touch_atime(&u->path);

	return sk;
}
1078 
1079 static struct sock *unix_find_other(struct net *net,
1080 				    struct sockaddr_un *sunaddr,
1081 				    int addr_len, int type)
1082 {
1083 	struct sock *sk;
1084 
1085 	if (sunaddr->sun_path[0])
1086 		sk = unix_find_bsd(sunaddr, addr_len, type);
1087 	else
1088 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1089 
1090 	return sk;
1091 }
1092 
/*
 * unix_autobind - bind @sk to an automatically generated abstract name.
 *
 * The generated address has sun_path[0] == '\0' (the buffer comes from
 * kzalloc()) followed by five lowercase hex digits, for an address length
 * of offsetof(struct sockaddr_un, sun_path) + 6.  Candidates are tried
 * starting just after a random 20-bit value and advance modulo 0x100000
 * until a name is found that is not yet in the hash table.
 *
 * The whole operation runs under u->bindlock, taken interruptibly.
 *
 * Returns 0 on success or if the socket already has an address, the error
 * from mutex_lock_interruptible(), -ENOMEM if the address cannot be
 * allocated, or -ENOSPC once the scan has wrapped around to its start.
 */
1093 static int unix_autobind(struct sock *sk)
1094 {
1095 	unsigned int new_hash, old_hash = sk->sk_hash;
1096 	struct unix_sock *u = unix_sk(sk);
1097 	struct net *net = sock_net(sk);
1098 	struct unix_address *addr;
1099 	u32 lastnum, ordernum;
1100 	int err;
1101 
1102 	err = mutex_lock_interruptible(&u->bindlock);
1103 	if (err)
1104 		return err;
1105 
	/* Already bound: nothing to do, err is still 0 at this point. */
1106 	if (u->addr)
1107 		goto out;
1108 
1109 	err = -ENOMEM;
	/*
	 * The 16 spare bytes have to hold the leading NUL of the abstract
	 * name, the five hex digits and the terminator sprintf() appends.
	 */
1110 	addr = kzalloc(sizeof(*addr) +
1111 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1112 	if (!addr)
1113 		goto out;
1114 
1115 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1116 	addr->name->sun_family = AF_UNIX;
1117 	refcount_set(&addr->refcnt, 1);
1118 
	/* Remember where the scan starts so a full wrap-around is detected. */
1119 	ordernum = prandom_u32();
1120 	lastnum = ordernum & 0xFFFFF;
1121 retry:
1122 	ordernum = (ordernum + 1) & 0xFFFFF;
1123 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1124 
	/*
	 * Both the bucket the socket currently sits in and the bucket of
	 * the candidate name are locked for the lookup and the rehash.
	 */
1125 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1126 	unix_table_double_lock(net, old_hash, new_hash);
1127 
1128 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1129 		unix_table_double_unlock(net, old_hash, new_hash);
1130 
1131 		/* __unix_find_socket_byname() may take long time if many names
1132 		 * are already in use.
1133 		 */
1134 		cond_resched();
1135 
1136 		if (ordernum == lastnum) {
1137 			/* Give up if all names seems to be in use. */
1138 			err = -ENOSPC;
1139 			unix_release_addr(addr);
1140 			goto out;
1141 		}
1142 
1143 		goto retry;
1144 	}
1145 
	/* Name is free: attach it to the socket while the table is locked. */
1146 	__unix_set_addr_hash(net, sk, addr, new_hash);
1147 	unix_table_double_unlock(net, old_hash, new_hash);
1148 	err = 0;
1149 
1150 out:	mutex_unlock(&u->bindlock);
1151 	return err;
1152 }
1153 
/*
 * unix_bind_bsd - bind @sk to a pathname (filesystem) address.
 *
 * Creates the socket inode at the given path with vfs_mknod() and then,
 * under u->bindlock and the hash-table locks, records the path and the
 * address on the socket and rehashes it by the new inode.
 *
 * The mode of the new inode is S_IFSOCK plus the mode bits of the socket's
 * own inode with the caller's umask removed.
 *
 * Returns 0 on success.  On failure the address is released and the error
 * is returned, with -EEXIST reported as -EADDRINUSE.  -EINVAL is returned
 * if the socket turned out to be bound already; in that case, and when
 * taking bindlock is interrupted, the inode just created is unlinked again.
 */
1154 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1155 			 int addr_len)
1156 {
1157 	umode_t mode = S_IFSOCK |
1158 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1159 	unsigned int new_hash, old_hash = sk->sk_hash;
1160 	struct unix_sock *u = unix_sk(sk);
1161 	struct net *net = sock_net(sk);
1162 	struct user_namespace *ns; // barf...
1163 	struct unix_address *addr;
1164 	struct dentry *dentry;
1165 	struct path parent;
1166 	int err;
1167 
	/*
	 * Recompute the length from the string itself: header, path and
	 * its trailing NUL.  NOTE(review): this presumably relies on
	 * unix_mkname_bsd() having NUL-terminated sun_path - confirm there.
	 */
1168 	unix_mkname_bsd(sunaddr, addr_len);
1169 	addr_len = strlen(sunaddr->sun_path) +
1170 		offsetof(struct sockaddr_un, sun_path) + 1;
1171 
1172 	addr = unix_create_addr(sunaddr, addr_len);
1173 	if (!addr)
1174 		return -ENOMEM;
1175 
1176 	/*
1177 	 * Get the parent directory, calculate the hash for last
1178 	 * component.
1179 	 */
1180 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1181 	if (IS_ERR(dentry)) {
1182 		err = PTR_ERR(dentry);
1183 		goto out;
1184 	}
1185 
1186 	/*
1187 	 * All right, let's create it.
1188 	 */
1189 	ns = mnt_user_ns(parent.mnt);
1190 	err = security_path_mknod(&parent, dentry, mode, 0);
1191 	if (!err)
1192 		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1193 	if (err)
1194 		goto out_path;
	/*
	 * bindlock is only taken once the inode exists, so every failure
	 * from here on has to undo the mknod (out_unlink).
	 */
1195 	err = mutex_lock_interruptible(&u->bindlock);
1196 	if (err)
1197 		goto out_unlink;
1198 	if (u->addr)
1199 		goto out_unlock;
1200 
	/* Publish path and address while both hash buckets are locked. */
1201 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1202 	unix_table_double_lock(net, old_hash, new_hash);
1203 	u->path.mnt = mntget(parent.mnt);
1204 	u->path.dentry = dget(dentry);
1205 	__unix_set_addr_hash(net, sk, addr, new_hash);
1206 	unix_table_double_unlock(net, old_hash, new_hash);
1207 	unix_insert_bsd_socket(sk);
1208 	mutex_unlock(&u->bindlock);
1209 	done_path_create(&parent, dentry);
1210 	return 0;
1211 
1212 out_unlock:
	/* Reached only when the socket already had an address. */
1213 	mutex_unlock(&u->bindlock);
1214 	err = -EINVAL;
1215 out_unlink:
1216 	/* failed after successful mknod?  unlink what we'd created... */
1217 	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1218 out_path:
1219 	done_path_create(&parent, dentry);
1220 out:
1221 	unix_release_addr(addr);
1222 	return err == -EEXIST ? -EADDRINUSE : err;
1223 }
1224 
1225 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1226 			      int addr_len)
1227 {
1228 	unsigned int new_hash, old_hash = sk->sk_hash;
1229 	struct unix_sock *u = unix_sk(sk);
1230 	struct net *net = sock_net(sk);
1231 	struct unix_address *addr;
1232 	int err;
1233 
1234 	addr = unix_create_addr(sunaddr, addr_len);
1235 	if (!addr)
1236 		return -ENOMEM;
1237 
1238 	err = mutex_lock_interruptible(&u->bindlock);
1239 	if (err)
1240 		goto out;
1241 
1242 	if (u->addr) {
1243 		err = -EINVAL;
1244 		goto out_mutex;
1245 	}
1246 
1247 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1248 	unix_table_double_lock(net, old_hash, new_hash);
1249 
1250 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1251 		goto out_spin;
1252 
1253 	__unix_set_addr_hash(net, sk, addr, new_hash);
1254 	unix_table_double_unlock(net, old_hash, new_hash);
1255 	mutex_unlock(&u->bindlock);
1256 	return 0;
1257 
1258 out_spin:
1259 	unix_table_double_unlock(net, old_hash, new_hash);
1260 	err = -EADDRINUSE;
1261 out_mutex:
1262 	mutex_unlock(&u->bindlock);
1263 out:
1264 	unix_release_addr(addr);
1265 	return err;
1266 }
1267 
1268 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1269 {
1270 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1271 	struct sock *sk = sock->sk;
1272 	int err;
1273 
1274 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1275 	    sunaddr->sun_family == AF_UNIX)
1276 		return unix_autobind(sk);
1277 
1278 	err = unix_validate_addr(sunaddr, addr_len);
1279 	if (err)
1280 		return err;
1281 
1282 	if (sunaddr->sun_path[0])
1283 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1284 	else
1285 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1286 
1287 	return err;
1288 }
1289 
/*
 * Take the state lock of @sk1 and, if it is a distinct non-NULL socket,
 * of @sk2 as well.  Two distinct sockets are always locked lower address
 * first, the second one with the nested lock variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1304 
1305 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1306 {
1307 	if (unlikely(sk1 == sk2) || !sk2) {
1308 		unix_state_unlock(sk1);
1309 		return;
1310 	}
1311 	unix_state_unlock(sk1);
1312 	unix_state_unlock(sk2);
1313 }
1314 
/*
 * unix_dgram_connect - connect(2) for datagram sockets.
 *
 * With an address family other than AF_UNSPEC the destination is looked
 * up and installed as the peer of @sk, and both sockets are marked
 * TCP_ESTABLISHED.  With AF_UNSPEC the peer pointer is cleared; if a peer
 * was set, sk_state is put back to TCP_CLOSE.
 *
 * Returns 0 on success; -EINVAL if @alen does not cover sa_family, the
 * error from unix_validate_addr(), unix_autobind() or unix_find_other(),
 * -EPERM if unix_may_send() refuses the peer, or the error from
 * security_unix_may_send().
 */
1315 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1316 			      int alen, int flags)
1317 {
1318 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1319 	struct sock *sk = sock->sk;
1320 	struct sock *other;
1321 	int err;
1322 
1323 	err = -EINVAL;
1324 	if (alen < offsetofend(struct sockaddr, sa_family))
1325 		goto out;
1326 
1327 	if (addr->sa_family != AF_UNSPEC) {
1328 		err = unix_validate_addr(sunaddr, alen);
1329 		if (err)
1330 			goto out;
1331 
		/* SOCK_PASSCRED is set and there is no address yet: autobind. */
1332 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1333 		    !unix_sk(sk)->addr) {
1334 			err = unix_autobind(sk);
1335 			if (err)
1336 				goto out;
1337 		}
1338 
1339 restart:
1340 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1341 		if (IS_ERR(other)) {
1342 			err = PTR_ERR(other);
1343 			goto out;
1344 		}
1345 
1346 		unix_state_double_lock(sk, other);
1347 
1348 		/* Apparently VFS overslept socket death. Retry. */
1349 		if (sock_flag(other, SOCK_DEAD)) {
1350 			unix_state_double_unlock(sk, other);
1351 			sock_put(other);
1352 			goto restart;
1353 		}
1354 
1355 		err = -EPERM;
1356 		if (!unix_may_send(sk, other))
1357 			goto out_unlock;
1358 
1359 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1360 		if (err)
1361 			goto out_unlock;
1362 
1363 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1364 	} else {
1365 		/*
1366 		 *	1003.1g breaking connected state with AF_UNSPEC
1367 		 */
1368 		other = NULL;
1369 		unix_state_double_lock(sk, other);
1370 	}
1371 
	/*
	 * From here on the reference on @other (if any) is not dropped by
	 * this function: it stays with the peer pointer installed below.
	 */
1372 	/*
1373 	 * If it was connected, reconnect.
1374 	 */
1375 	if (unix_peer(sk)) {
1376 		struct sock *old_peer = unix_peer(sk);
1377 
1378 		unix_peer(sk) = other;
1379 		if (!other)
1380 			sk->sk_state = TCP_CLOSE;
1381 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1382 
1383 		unix_state_double_unlock(sk, other);
1384 
		/*
		 * Done after the locks are released, and only when the
		 * peer really changed.  The reference that the old peer
		 * pointer held is dropped in either case.
		 */
1385 		if (other != old_peer)
1386 			unix_dgram_disconnected(sk, old_peer);
1387 		sock_put(old_peer);
1388 	} else {
1389 		unix_peer(sk) = other;
1390 		unix_state_double_unlock(sk, other);
1391 	}
1392 
1393 	return 0;
1394 
1395 out_unlock:
1396 	unix_state_double_unlock(sk, other);
1397 	sock_put(other);
1398 out:
1399 	return err;
1400 }
1401 
1402 static long unix_wait_for_peer(struct sock *other, long timeo)
1403 	__releases(&unix_sk(other)->lock)
1404 {
1405 	struct unix_sock *u = unix_sk(other);
1406 	int sched;
1407 	DEFINE_WAIT(wait);
1408 
1409 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1410 
1411 	sched = !sock_flag(other, SOCK_DEAD) &&
1412 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1413 		unix_recvq_full(other);
1414 
1415 	unix_state_unlock(other);
1416 
1417 	if (sched)
1418 		timeo = schedule_timeout(timeo);
1419 
1420 	finish_wait(&u->peer_wait, &wait);
1421 	return timeo;
1422 }
1423 
/*
 * unix_stream_connect - connect @sock to a listening socket.
 *
 * Up front, a new sock (newsk) that will become the server-side endpoint
 * and a one-byte skb allocated against it are created.  The listener is
 * then looked up and checked; with both the listener's and our own state
 * lock held, newsk and sk are made each other's peer, the listener's
 * address and path are copied to newsk, and the skb is queued on the
 * listener's receive queue.  unix_accept() later takes the sock to graft
 * from the ->sk field of that skb.
 *
 * Returns 0 on success, otherwise a negative errno:
 *   - the error from unix_validate_addr(), unix_autobind(),
 *     unix_create1() or unix_find_other(),
 *   - -ENOMEM if the skb cannot be allocated,
 *   - -ECONNREFUSED if the target is not listening or has RCV_SHUTDOWN set,
 *   - -EAGAIN if the listener's queue is full and timeo is 0,
 *   - sock_intr_errno(timeo) if a signal is pending after waiting,
 *   - -EISCONN / -EINVAL if @sk is not in TCP_CLOSE state,
 *   - the error from security_unix_stream_connect().
 */
1424 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1425 			       int addr_len, int flags)
1426 {
1427 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1428 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1429 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1430 	struct net *net = sock_net(sk);
1431 	struct sk_buff *skb = NULL;
1432 	long timeo;
1433 	int err;
1434 	int st;
1435 
1436 	err = unix_validate_addr(sunaddr, addr_len);
1437 	if (err)
1438 		goto out;
1439 
1440 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1441 		err = unix_autobind(sk);
1442 		if (err)
1443 			goto out;
1444 	}
1445 
1446 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1447 
1448 	/* First of all allocate resources.
1449 	   If we will make it after state is locked,
1450 	   we will have to recheck all again in any case.
1451 	 */
1452 
1453 	/* create new sock for complete connection */
1454 	newsk = unix_create1(net, NULL, 0, sock->type);
1455 	if (IS_ERR(newsk)) {
1456 		err = PTR_ERR(newsk);
1457 		newsk = NULL;
1458 		goto out;
1459 	}
1460 
1461 	err = -ENOMEM;
1462 
1463 	/* Allocate skb for sending to listening sock */
1464 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1465 	if (skb == NULL)
1466 		goto out;
1467 
1468 restart:
1469 	/*  Find listening sock. */
1470 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1471 	if (IS_ERR(other)) {
1472 		err = PTR_ERR(other);
1473 		other = NULL;
1474 		goto out;
1475 	}
1476 
1477 	/* Latch state of peer */
1478 	unix_state_lock(other);
1479 
1480 	/* Apparently VFS overslept socket death. Retry. */
1481 	if (sock_flag(other, SOCK_DEAD)) {
1482 		unix_state_unlock(other);
1483 		sock_put(other);
1484 		goto restart;
1485 	}
1486 
1487 	err = -ECONNREFUSED;
1488 	if (other->sk_state != TCP_LISTEN)
1489 		goto out_unlock;
1490 	if (other->sk_shutdown & RCV_SHUTDOWN)
1491 		goto out_unlock;
1492 
	/*
	 * Listener's queue is full: fail with -EAGAIN when timeo is 0,
	 * otherwise wait and redo the whole lookup.  unix_wait_for_peer()
	 * releases other's state lock, which is why the signal case jumps
	 * to out rather than out_unlock.
	 */
1493 	if (unix_recvq_full(other)) {
1494 		err = -EAGAIN;
1495 		if (!timeo)
1496 			goto out_unlock;
1497 
1498 		timeo = unix_wait_for_peer(other, timeo);
1499 
1500 		err = sock_intr_errno(timeo);
1501 		if (signal_pending(current))
1502 			goto out;
1503 		sock_put(other);
1504 		goto restart;
1505 	}
1506 
1507 	/* Latch our state.
1508 
1509 	   It is tricky place. We need to grab our state lock and cannot
1510 	   drop lock on peer. It is dangerous because deadlock is
1511 	   possible. Connect to self case and simultaneous
1512 	   attempt to connect are eliminated by checking socket
1513 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1514 	   check this before attempt to grab lock.
1515 
1516 	   Well, and we have to recheck the state after socket locked.
1517 	 */
1518 	st = sk->sk_state;
1519 
1520 	switch (st) {
1521 	case TCP_CLOSE:
1522 		/* This is ok... continue with connect */
1523 		break;
1524 	case TCP_ESTABLISHED:
1525 		/* Socket is already connected */
1526 		err = -EISCONN;
1527 		goto out_unlock;
1528 	default:
1529 		err = -EINVAL;
1530 		goto out_unlock;
1531 	}
1532 
1533 	unix_state_lock_nested(sk);
1534 
	/* State changed between the unlocked read and the lock: start over. */
1535 	if (sk->sk_state != st) {
1536 		unix_state_unlock(sk);
1537 		unix_state_unlock(other);
1538 		sock_put(other);
1539 		goto restart;
1540 	}
1541 
1542 	err = security_unix_stream_connect(sk, other, newsk);
1543 	if (err) {
1544 		unix_state_unlock(sk);
1545 		goto out_unlock;
1546 	}
1547 
1548 	/* The way is open! Fastly set all the necessary fields... */
1549 
1550 	sock_hold(sk);
1551 	unix_peer(newsk)	= sk;
1552 	newsk->sk_state		= TCP_ESTABLISHED;
1553 	newsk->sk_type		= sk->sk_type;
1554 	init_peercred(newsk);
1555 	newu = unix_sk(newsk);
1556 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1557 	otheru = unix_sk(other);
1558 
1559 	/* copy address information from listening to new sock
1560 	 *
1561 	 * The contents of *(otheru->addr) and otheru->path
1562 	 * are seen fully set up here, since we have found
1563 	 * otheru in hash under its lock.  Insertion into the
1564 	 * hash chain we'd found it in had been done in an
1565 	 * earlier critical area protected by the chain's lock,
1566 	 * the same one where we'd set *(otheru->addr) contents,
1567 	 * as well as otheru->path and otheru->addr itself.
1568 	 *
1569 	 * Using smp_store_release() here to set newu->addr
1570 	 * is enough to make those stores, as well as stores
1571 	 * to newu->path visible to anyone who gets newu->addr
1572 	 * by smp_load_acquire().  IOW, the same warranties
1573 	 * as for unix_sock instances bound in unix_bind() or
1574 	 * in unix_autobind().
1575 	 */
1576 	if (otheru->path.dentry) {
1577 		path_get(&otheru->path);
1578 		newu->path = otheru->path;
1579 	}
1580 	refcount_inc(&otheru->addr->refcnt);
1581 	smp_store_release(&newu->addr, otheru->addr);
1582 
1583 	/* Set credentials */
1584 	copy_peercred(sk, other);
1585 
1586 	sock->state	= SS_CONNECTED;
1587 	sk->sk_state	= TCP_ESTABLISHED;
1588 	sock_hold(newsk);
1589 
1590 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1591 	unix_peer(sk)	= newsk;
1592 
1593 	unix_state_unlock(sk);
1594 
1595 	/* take ten and send info to listening sock */
1596 	spin_lock(&other->sk_receive_queue.lock);
1597 	__skb_queue_tail(&other->sk_receive_queue, skb);
1598 	spin_unlock(&other->sk_receive_queue.lock);
1599 	unix_state_unlock(other);
1600 	other->sk_data_ready(other);
1601 	sock_put(other);
1602 	return 0;
1603 
1604 out_unlock:
1605 	if (other)
1606 		unix_state_unlock(other);
1607 
1608 out:
	/*
	 * skb, newsk and other may each still be NULL here, depending on
	 * how far setup got; skb is handed to kfree_skb() unconditionally.
	 */
1609 	kfree_skb(skb);
1610 	if (newsk)
1611 		unix_release_sock(newsk, 0);
1612 	if (other)
1613 		sock_put(other);
1614 	return err;
1615 }
1616 
1617 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1618 {
1619 	struct sock *ska = socka->sk, *skb = sockb->sk;
1620 
1621 	/* Join our sockets back to back */
1622 	sock_hold(ska);
1623 	sock_hold(skb);
1624 	unix_peer(ska) = skb;
1625 	unix_peer(skb) = ska;
1626 	init_peercred(ska);
1627 	init_peercred(skb);
1628 
1629 	ska->sk_state = TCP_ESTABLISHED;
1630 	skb->sk_state = TCP_ESTABLISHED;
1631 	socka->state  = SS_CONNECTED;
1632 	sockb->state  = SS_CONNECTED;
1633 	return 0;
1634 }
1635 
1636 static void unix_sock_inherit_flags(const struct socket *old,
1637 				    struct socket *new)
1638 {
1639 	if (test_bit(SOCK_PASSCRED, &old->flags))
1640 		set_bit(SOCK_PASSCRED, &new->flags);
1641 	if (test_bit(SOCK_PASSSEC, &old->flags))
1642 		set_bit(SOCK_PASSSEC, &new->flags);
1643 }
1644 
1645 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1646 		       bool kern)
1647 {
1648 	struct sock *sk = sock->sk;
1649 	struct sock *tsk;
1650 	struct sk_buff *skb;
1651 	int err;
1652 
1653 	err = -EOPNOTSUPP;
1654 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1655 		goto out;
1656 
1657 	err = -EINVAL;
1658 	if (sk->sk_state != TCP_LISTEN)
1659 		goto out;
1660 
1661 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1662 	 * so that no locks are necessary.
1663 	 */
1664 
1665 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1666 				&err);
1667 	if (!skb) {
1668 		/* This means receive shutdown. */
1669 		if (err == 0)
1670 			err = -EINVAL;
1671 		goto out;
1672 	}
1673 
1674 	tsk = skb->sk;
1675 	skb_free_datagram(sk, skb);
1676 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1677 
1678 	/* attach accepted sock to socket */
1679 	unix_state_lock(tsk);
1680 	newsock->state = SS_CONNECTED;
1681 	unix_sock_inherit_flags(sock, newsock);
1682 	sock_graft(tsk, newsock);
1683 	unix_state_unlock(tsk);
1684 	return 0;
1685 
1686 out:
1687 	return err;
1688 }
1689 
1690 
1691 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1692 {
1693 	struct sock *sk = sock->sk;
1694 	struct unix_address *addr;
1695 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1696 	int err = 0;
1697 
1698 	if (peer) {
1699 		sk = unix_peer_get(sk);
1700 
1701 		err = -ENOTCONN;
1702 		if (!sk)
1703 			goto out;
1704 		err = 0;
1705 	} else {
1706 		sock_hold(sk);
1707 	}
1708 
1709 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1710 	if (!addr) {
1711 		sunaddr->sun_family = AF_UNIX;
1712 		sunaddr->sun_path[0] = 0;
1713 		err = offsetof(struct sockaddr_un, sun_path);
1714 	} else {
1715 		err = addr->len;
1716 		memcpy(sunaddr, addr->name, addr->len);
1717 	}
1718 	sock_put(sk);
1719 out:
1720 	return err;
1721 }
1722 
/*
 * unix_peek_fds - give @scm a duplicate of the fd list attached to @skb
 * (see the MSG_PEEK discussion below), then cycle unix_gc_lock so that
 * this cannot overlap a garbage-collection run already in progress.
 */
1723 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1724 {
1725 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1726 
1727 	/*
1728 	 * Garbage collection of unix sockets starts by selecting a set of
1729 	 * candidate sockets which have reference only from being in flight
1730 	 * (total_refs == inflight_refs).  This condition is checked once during
1731 	 * the candidate collection phase, and candidates are marked as such, so
1732 	 * that non-candidates can later be ignored.  While inflight_refs is
1733 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1734 	 * is an instantaneous decision.
1735 	 *
1736 	 * Once a candidate, however, the socket must not be reinstalled into a
1737 	 * file descriptor while the garbage collection is in progress.
1738 	 *
1739 	 * If the above conditions are met, then the directed graph of
1740 	 * candidates (*) does not change while unix_gc_lock is held.
1741 	 *
1742 	 * Any operation that changes the file count through file descriptors
1743 	 * (dup, close, sendmsg) does not change the graph since candidates are
1744 	 * not installed in fds.
1745 	 *
1746 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1747 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1748 	 * serialized with garbage collection.
1749 	 *
1750 	 * MSG_PEEK is special in that it does not change the inflight count,
1751 	 * yet does install the socket into an fd.  The following lock/unlock
1752 	 * pair is to ensure serialization with garbage collection.  It must be
1753 	 * done between incrementing the file count and installing the file into
1754 	 * an fd.
1755 	 *
1756 	 * If garbage collection starts after the barrier provided by the
1757 	 * lock/unlock, then it will see the elevated refcount and not mark this
1758 	 * as a candidate.  If a garbage collection is already in progress
1759 	 * before the file count was incremented, then the lock/unlock pair will
1760 	 * ensure that garbage collection is finished before progressing to
1761 	 * installing the fd.
1762 	 *
1763 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1764 	 * which is on the queue of listening socket A.
1765 	 */
1766 	spin_lock(&unix_gc_lock);
1767 	spin_unlock(&unix_gc_lock);
1768 }
1769 
1770 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1771 {
1772 	int err = 0;
1773 
1774 	UNIXCB(skb).pid  = get_pid(scm->pid);
1775 	UNIXCB(skb).uid = scm->creds.uid;
1776 	UNIXCB(skb).gid = scm->creds.gid;
1777 	UNIXCB(skb).fp = NULL;
1778 	unix_get_secdata(scm, skb);
1779 	if (scm->fp && send_fds)
1780 		err = unix_attach_fds(scm, skb);
1781 
1782 	skb->destructor = unix_destruct_scm;
1783 	return err;
1784 }
1785 
1786 static bool unix_passcred_enabled(const struct socket *sock,
1787 				  const struct sock *other)
1788 {
1789 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1790 	       !other->sk_socket ||
1791 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1792 }
1793 
1794 /*
1795  * Some apps rely on write() giving SCM_CREDENTIALS
1796  * We include credentials if source or destination socket
1797  * asserted SOCK_PASSCRED.
1798  */
1799 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1800 			    const struct sock *other)
1801 {
1802 	if (UNIXCB(skb).pid)
1803 		return;
1804 	if (unix_passcred_enabled(sock, other)) {
1805 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1806 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1807 	}
1808 }
1809 
1810 static int maybe_init_creds(struct scm_cookie *scm,
1811 			    struct socket *socket,
1812 			    const struct sock *other)
1813 {
1814 	int err;
1815 	struct msghdr msg = { .msg_controllen = 0 };
1816 
1817 	err = scm_send(socket, &msg, scm, false);
1818 	if (err)
1819 		return err;
1820 
1821 	if (unix_passcred_enabled(socket, other)) {
1822 		scm->pid = get_pid(task_tgid(current));
1823 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1824 	}
1825 	return err;
1826 }
1827 
1828 static bool unix_skb_scm_eq(struct sk_buff *skb,
1829 			    struct scm_cookie *scm)
1830 {
1831 	return UNIXCB(skb).pid == scm->pid &&
1832 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1833 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1834 	       unix_secdata_eq(scm, skb);
1835 }
1836 
1837 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1838 {
1839 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1840 	struct unix_sock *u = unix_sk(sk);
1841 
1842 	if (unlikely(fp && fp->count))
1843 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1844 }
1845 
1846 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1847 {
1848 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1849 	struct unix_sock *u = unix_sk(sk);
1850 
1851 	if (unlikely(fp && fp->count))
1852 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1853 }
1854 
1855 /*
1856  *	Send AF_UNIX data.
1857  */
1858 
/*
 * unix_dgram_sendmsg - send one datagram of @len bytes.
 *
 * The destination is the address in msg->msg_name when one is given,
 * otherwise the connected peer.  The skb is built and filled before the
 * destination is locked; delivery appends it to the destination's receive
 * queue and calls its sk_data_ready().
 *
 * Returns @len on success, and also when the destination's socket filter
 * drops the packet.  Errors include: the result of scm_send(),
 * -EOPNOTSUPP for MSG_OOB, the error from unix_validate_addr(),
 * unix_autobind(), the skb allocation, unix_scm_to_skb(), the data copy
 * or unix_find_other(), -ENOTCONN (no address and no peer), -EMSGSIZE,
 * -ECONNRESET, -ECONNREFUSED (connected peer found dead), -EPERM, -EPIPE
 * (receiver has RCV_SHUTDOWN), the error from security_unix_may_send(),
 * sock_intr_errno() on a signal while waiting, and -EAGAIN when the
 * receiver is full and timeo is 0.
 */
1859 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1860 			      size_t len)
1861 {
1862 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1863 	struct sock *sk = sock->sk, *other = NULL;
1864 	struct unix_sock *u = unix_sk(sk);
1865 	struct scm_cookie scm;
1866 	struct sk_buff *skb;
1867 	int data_len = 0;
1868 	int sk_locked;
1869 	long timeo;
1870 	int err;
1871 
1872 	wait_for_unix_gc();
1873 	err = scm_send(sock, msg, &scm, false);
1874 	if (err < 0)
1875 		return err;
1876 
1877 	err = -EOPNOTSUPP;
1878 	if (msg->msg_flags&MSG_OOB)
1879 		goto out;
1880 
1881 	if (msg->msg_namelen) {
1882 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1883 		if (err)
1884 			goto out;
1885 	} else {
1886 		sunaddr = NULL;
1887 		err = -ENOTCONN;
1888 		other = unix_peer_get(sk);
1889 		if (!other)
1890 			goto out;
1891 	}
1892 
1893 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1894 		err = unix_autobind(sk);
1895 		if (err)
1896 			goto out;
1897 	}
1898 
1899 	err = -EMSGSIZE;
1900 	if (len > sk->sk_sndbuf - 32)
1901 		goto out;
1902 
	/*
	 * Anything beyond SKB_MAX_ALLOC goes into paged data: at most
	 * MAX_SKB_FRAGS pages, rounded up to a whole page.
	 */
1903 	if (len > SKB_MAX_ALLOC) {
1904 		data_len = min_t(size_t,
1905 				 len - SKB_MAX_ALLOC,
1906 				 MAX_SKB_FRAGS * PAGE_SIZE);
1907 		data_len = PAGE_ALIGN(data_len);
1908 
1909 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1910 	}
1911 
1912 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1913 				   msg->msg_flags & MSG_DONTWAIT, &err,
1914 				   PAGE_ALLOC_COSTLY_ORDER);
1915 	if (skb == NULL)
1916 		goto out;
1917 
1918 	err = unix_scm_to_skb(&scm, skb, true);
1919 	if (err < 0)
1920 		goto out_free;
1921 
1922 	skb_put(skb, len - data_len);
1923 	skb->data_len = data_len;
1924 	skb->len = len;
1925 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1926 	if (err)
1927 		goto out_free;
1928 
1929 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1930 
1931 restart:
	/*
	 * other is NULL on the first pass of an addressed send, or after
	 * a dead destination was dropped below; look it up (again) by
	 * address.  Without an address there is nothing to look up.
	 */
1932 	if (!other) {
1933 		err = -ECONNRESET;
1934 		if (sunaddr == NULL)
1935 			goto out_free;
1936 
1937 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1938 					sk->sk_type);
1939 		if (IS_ERR(other)) {
1940 			err = PTR_ERR(other);
1941 			other = NULL;
1942 			goto out_free;
1943 		}
1944 	}
1945 
1946 	if (sk_filter(other, skb) < 0) {
1947 		/* Toss the packet but do not return any error to the sender */
1948 		err = len;
1949 		goto out_free;
1950 	}
1951 
	/* sk_locked records whether sk's own state lock is held as well. */
1952 	sk_locked = 0;
1953 	unix_state_lock(other);
1954 restart_locked:
1955 	err = -EPERM;
1956 	if (!unix_may_send(sk, other))
1957 		goto out_unlock;
1958 
1959 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1960 		/*
1961 		 *	Check with 1003.1g - what should
1962 		 *	datagram error
1963 		 */
1964 		unix_state_unlock(other);
1965 		sock_put(other);
1966 
1967 		if (!sk_locked)
1968 			unix_state_lock(sk);
1969 
		/*
		 * If the dead socket was our connected peer, disconnect
		 * from it and fail with -ECONNREFUSED; otherwise retry
		 * from restart with other cleared.
		 */
1970 		err = 0;
1971 		if (unix_peer(sk) == other) {
1972 			unix_peer(sk) = NULL;
1973 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1974 
1975 			unix_state_unlock(sk);
1976 
1977 			sk->sk_state = TCP_CLOSE;
1978 			unix_dgram_disconnected(sk, other);
1979 			sock_put(other);
1980 			err = -ECONNREFUSED;
1981 		} else {
1982 			unix_state_unlock(sk);
1983 		}
1984 
1985 		other = NULL;
1986 		if (err)
1987 			goto out_free;
1988 		goto restart;
1989 	}
1990 
1991 	err = -EPIPE;
1992 	if (other->sk_shutdown & RCV_SHUTDOWN)
1993 		goto out_unlock;
1994 
1995 	if (sk->sk_type != SOCK_SEQPACKET) {
1996 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1997 		if (err)
1998 			goto out_unlock;
1999 	}
2000 
2001 	/* other == sk && unix_peer(other) != sk if
2002 	 * - unix_peer(sk) == NULL, destination address bound to sk
2003 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2004 	 */
2005 	if (other != sk &&
2006 	    unlikely(unix_peer(other) != sk &&
2007 	    unix_recvq_full_lockless(other))) {
		/*
		 * Receiver's queue is full.  With time left, sleep (this
		 * releases other's state lock) and retry from restart;
		 * other is still referenced, so it is not looked up again.
		 */
2008 		if (timeo) {
2009 			timeo = unix_wait_for_peer(other, timeo);
2010 
2011 			err = sock_intr_errno(timeo);
2012 			if (signal_pending(current))
2013 				goto out_free;
2014 
2015 			goto restart;
2016 		}
2017 
		/*
		 * No time left: take both state locks, dropping other's
		 * first so unix_state_double_lock() can take them in its
		 * own order.
		 */
2018 		if (!sk_locked) {
2019 			unix_state_unlock(other);
2020 			unix_state_double_lock(sk, other);
2021 		}
2022 
2023 		if (unix_peer(sk) != other ||
2024 		    unix_dgram_peer_wake_me(sk, other)) {
2025 			err = -EAGAIN;
2026 			sk_locked = 1;
2027 			goto out_unlock;
2028 		}
2029 
		/*
		 * other's lock was dropped above, so every check made
		 * under it has to be repeated.
		 */
2030 		if (!sk_locked) {
2031 			sk_locked = 1;
2032 			goto restart_locked;
2033 		}
2034 	}
2035 
2036 	if (unlikely(sk_locked))
2037 		unix_state_unlock(sk);
2038 
2039 	if (sock_flag(other, SOCK_RCVTSTAMP))
2040 		__net_timestamp(skb);
2041 	maybe_add_creds(skb, sock, other);
2042 	scm_stat_add(other, skb);
2043 	skb_queue_tail(&other->sk_receive_queue, skb);
2044 	unix_state_unlock(other);
2045 	other->sk_data_ready(other);
2046 	sock_put(other);
2047 	scm_destroy(&scm);
2048 	return len;
2049 
2050 out_unlock:
2051 	if (sk_locked)
2052 		unix_state_unlock(sk);
2053 	unix_state_unlock(other);
2054 out_free:
2055 	kfree_skb(skb);
2056 out:
2057 	if (other)
2058 		sock_put(other);
2059 	scm_destroy(&scm);
2060 	return err;
2061 }
2062 
2063 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2064  * bytes, and a minimum of a full page.
2065  */
/* unix_stream_sendmsg() uses this both as the per-skb cap on paged data
 * (on top of SKB_MAX_HEAD(0)) and, through get_order(), as the maximum
 * page order passed to sock_alloc_send_pskb().
 */
2066 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2067 
2068 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2069 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
2070 {
2071 	struct unix_sock *ousk = unix_sk(other);
2072 	struct sk_buff *skb;
2073 	int err = 0;
2074 
2075 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2076 
2077 	if (!skb)
2078 		return err;
2079 
2080 	skb_put(skb, 1);
2081 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2082 
2083 	if (err) {
2084 		kfree_skb(skb);
2085 		return err;
2086 	}
2087 
2088 	unix_state_lock(other);
2089 
2090 	if (sock_flag(other, SOCK_DEAD) ||
2091 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2092 		unix_state_unlock(other);
2093 		kfree_skb(skb);
2094 		return -EPIPE;
2095 	}
2096 
2097 	maybe_add_creds(skb, sock, other);
2098 	skb_get(skb);
2099 
2100 	if (ousk->oob_skb)
2101 		consume_skb(ousk->oob_skb);
2102 
2103 	WRITE_ONCE(ousk->oob_skb, skb);
2104 
2105 	scm_stat_add(other, skb);
2106 	skb_queue_tail(&other->sk_receive_queue, skb);
2107 	sk_send_sigurg(other);
2108 	unix_state_unlock(other);
2109 	other->sk_data_ready(other);
2110 
2111 	return err;
2112 }
2113 #endif
2114 
/*
 * unix_stream_sendmsg - send @len bytes on a connected stream socket.
 *
 * The data is cut into skbs, each limited to half the send buffer minus
 * 64 bytes and to SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ, and each skb is
 * appended to the peer's receive queue under the peer's state lock.
 * Passed file descriptors travel with the first skb only.
 *
 * With MSG_OOB and CONFIG_AF_UNIX_OOB enabled, the last byte is held back
 * from the loop and sent through queue_oob(); MSG_OOB with @len == 0, or
 * without that config option, fails with -EOPNOTSUPP.
 *
 * Returns the number of bytes sent if that is non-zero, even when an
 * error stopped the transfer early.  If nothing was sent the error is
 * returned instead: the result of scm_send(), -EOPNOTSUPP, -EISCONN or
 * -EOPNOTSUPP when a destination address is supplied, -ENOTCONN, the
 * error from the skb allocation, unix_scm_to_skb(), the data copy or
 * queue_oob(), or -EPIPE.  On -EPIPE with nothing sent, SIGPIPE is raised
 * unless MSG_NOSIGNAL is set.
 */
2115 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2116 			       size_t len)
2117 {
2118 	struct sock *sk = sock->sk;
2119 	struct sock *other = NULL;
2120 	int err, size;
2121 	struct sk_buff *skb;
2122 	int sent = 0;
2123 	struct scm_cookie scm;
2124 	bool fds_sent = false;
2125 	int data_len;
2126 
2127 	wait_for_unix_gc();
2128 	err = scm_send(sock, msg, &scm, false);
2129 	if (err < 0)
2130 		return err;
2131 
2132 	err = -EOPNOTSUPP;
	/*
	 * With OOB support, the final byte is reserved for queue_oob()
	 * after the loop.  The goto below is the else branch when OOB
	 * support is compiled in, and unconditional otherwise.
	 */
2133 	if (msg->msg_flags & MSG_OOB) {
2134 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2135 		if (len)
2136 			len--;
2137 		else
2138 #endif
2139 			goto out_err;
2140 	}
2141 
2142 	if (msg->msg_namelen) {
2143 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2144 		goto out_err;
2145 	} else {
2146 		err = -ENOTCONN;
2147 		other = unix_peer(sk);
2148 		if (!other)
2149 			goto out_err;
2150 	}
2151 
2152 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2153 		goto pipe_err;
2154 
2155 	while (sent < len) {
2156 		size = len - sent;
2157 
2158 		/* Keep two messages in the pipe so it schedules better */
2159 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2160 
2161 		/* allow fallback to order-0 allocations */
2162 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2163 
		/*
		 * Whatever does not fit in the linear head goes into paged
		 * data, rounded up to a page but never more than size.
		 */
2164 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2165 
2166 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2167 
2168 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2169 					   msg->msg_flags & MSG_DONTWAIT, &err,
2170 					   get_order(UNIX_SKB_FRAGS_SZ));
2171 		if (!skb)
2172 			goto out_err;
2173 
2174 		/* Only send the fds in the first buffer */
2175 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2176 		if (err < 0) {
2177 			kfree_skb(skb);
2178 			goto out_err;
2179 		}
2180 		fds_sent = true;
2181 
2182 		skb_put(skb, size - data_len);
2183 		skb->data_len = data_len;
2184 		skb->len = size;
2185 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2186 		if (err) {
2187 			kfree_skb(skb);
2188 			goto out_err;
2189 		}
2190 
2191 		unix_state_lock(other);
2192 
2193 		if (sock_flag(other, SOCK_DEAD) ||
2194 		    (other->sk_shutdown & RCV_SHUTDOWN))
2195 			goto pipe_err_free;
2196 
2197 		maybe_add_creds(skb, sock, other);
2198 		scm_stat_add(other, skb);
2199 		skb_queue_tail(&other->sk_receive_queue, skb);
2200 		unix_state_unlock(other);
2201 		other->sk_data_ready(other);
2202 		sent += size;
2203 	}
2204 
2205 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2206 	if (msg->msg_flags & MSG_OOB) {
2207 		err = queue_oob(sock, msg, other);
2208 		if (err)
2209 			goto out_err;
2210 		sent++;
2211 	}
2212 #endif
2213 
2214 	scm_destroy(&scm);
2215 
2216 	return sent;
2217 
2218 pipe_err_free:
2219 	unix_state_unlock(other);
2220 	kfree_skb(skb);
2221 pipe_err:
2222 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2223 		send_sig(SIGPIPE, current, 0);
2224 	err = -EPIPE;
2225 out_err:
2226 	scm_destroy(&scm);
	/* A partial transfer is reported as a byte count, not as an error. */
2227 	return sent ? : err;
2228 }
2229 
2230 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2231 				    int offset, size_t size, int flags)
2232 {
2233 	int err;
2234 	bool send_sigpipe = false;
2235 	bool init_scm = true;
2236 	struct scm_cookie scm;
2237 	struct sock *other, *sk = socket->sk;
2238 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2239 
2240 	if (flags & MSG_OOB)
2241 		return -EOPNOTSUPP;
2242 
2243 	other = unix_peer(sk);
2244 	if (!other || sk->sk_state != TCP_ESTABLISHED)
2245 		return -ENOTCONN;
2246 
2247 	if (false) {
2248 alloc_skb:
2249 		unix_state_unlock(other);
2250 		mutex_unlock(&unix_sk(other)->iolock);
2251 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2252 					      &err, 0);
2253 		if (!newskb)
2254 			goto err;
2255 	}
2256 
2257 	/* we must acquire iolock as we modify already present
2258 	 * skbs in the sk_receive_queue and mess with skb->len
2259 	 */
2260 	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2261 	if (err) {
2262 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2263 		goto err;
2264 	}
2265 
2266 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
2267 		err = -EPIPE;
2268 		send_sigpipe = true;
2269 		goto err_unlock;
2270 	}
2271 
2272 	unix_state_lock(other);
2273 
2274 	if (sock_flag(other, SOCK_DEAD) ||
2275 	    other->sk_shutdown & RCV_SHUTDOWN) {
2276 		err = -EPIPE;
2277 		send_sigpipe = true;
2278 		goto err_state_unlock;
2279 	}
2280 
2281 	if (init_scm) {
2282 		err = maybe_init_creds(&scm, socket, other);
2283 		if (err)
2284 			goto err_state_unlock;
2285 		init_scm = false;
2286 	}
2287 
2288 	skb = skb_peek_tail(&other->sk_receive_queue);
2289 	if (tail && tail == skb) {
2290 		skb = newskb;
2291 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2292 		if (newskb) {
2293 			skb = newskb;
2294 		} else {
2295 			tail = skb;
2296 			goto alloc_skb;
2297 		}
2298 	} else if (newskb) {
2299 		/* this is fast path, we don't necessarily need to
2300 		 * call to kfree_skb even though with newskb == NULL
2301 		 * this - does no harm
2302 		 */
2303 		consume_skb(newskb);
2304 		newskb = NULL;
2305 	}
2306 
2307 	if (skb_append_pagefrags(skb, page, offset, size)) {
2308 		tail = skb;
2309 		goto alloc_skb;
2310 	}
2311 
2312 	skb->len += size;
2313 	skb->data_len += size;
2314 	skb->truesize += size;
2315 	refcount_add(size, &sk->sk_wmem_alloc);
2316 
2317 	if (newskb) {
2318 		err = unix_scm_to_skb(&scm, skb, false);
2319 		if (err)
2320 			goto err_state_unlock;
2321 		spin_lock(&other->sk_receive_queue.lock);
2322 		__skb_queue_tail(&other->sk_receive_queue, newskb);
2323 		spin_unlock(&other->sk_receive_queue.lock);
2324 	}
2325 
2326 	unix_state_unlock(other);
2327 	mutex_unlock(&unix_sk(other)->iolock);
2328 
2329 	other->sk_data_ready(other);
2330 	scm_destroy(&scm);
2331 	return size;
2332 
2333 err_state_unlock:
2334 	unix_state_unlock(other);
2335 err_unlock:
2336 	mutex_unlock(&unix_sk(other)->iolock);
2337 err:
2338 	kfree_skb(newskb);
2339 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2340 		send_sig(SIGPIPE, current, 0);
2341 	if (!init_scm)
2342 		scm_destroy(&scm);
2343 	return err;
2344 }
2345 
2346 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2347 				  size_t len)
2348 {
2349 	int err;
2350 	struct sock *sk = sock->sk;
2351 
2352 	err = sock_error(sk);
2353 	if (err)
2354 		return err;
2355 
2356 	if (sk->sk_state != TCP_ESTABLISHED)
2357 		return -ENOTCONN;
2358 
2359 	if (msg->msg_namelen)
2360 		msg->msg_namelen = 0;
2361 
2362 	return unix_dgram_sendmsg(sock, msg, len);
2363 }
2364 
2365 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2366 				  size_t size, int flags)
2367 {
2368 	struct sock *sk = sock->sk;
2369 
2370 	if (sk->sk_state != TCP_ESTABLISHED)
2371 		return -ENOTCONN;
2372 
2373 	return unix_dgram_recvmsg(sock, msg, size, flags);
2374 }
2375 
2376 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2377 {
2378 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2379 
2380 	if (addr) {
2381 		msg->msg_namelen = addr->len;
2382 		memcpy(msg->msg_name, addr->name, addr->len);
2383 	}
2384 }
2385 
/*
 * Receive one datagram from @sk into @msg.
 *
 * Waits (subject to the receive timeout and MSG_DONTWAIT) for an skb,
 * copies up to @size bytes of it starting at the current peek offset,
 * fills in the sender address when msg->msg_name is set, and passes
 * credentials and file descriptors on through scm_recv().
 *
 * Returns the number of bytes copied, or the remaining length of the
 * datagram when MSG_TRUNC is set in @flags, or a negative errno.
 * MSG_OOB is not supported here.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			/* Leave the loop with iolock still held. */
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	/* Receive-queue space was freed: wake senders waiting on peer_wait. */
	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to what is left of the datagram, or flag the truncation. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	/* With MSG_TRUNC the caller gets the real remaining length. */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
2492 
2493 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2494 			      int flags)
2495 {
2496 	struct sock *sk = sock->sk;
2497 
2498 #ifdef CONFIG_BPF_SYSCALL
2499 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2500 
2501 	if (prot != &unix_dgram_proto)
2502 		return prot->recvmsg(sk, msg, size, flags, NULL);
2503 #endif
2504 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2505 }
2506 
2507 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2508 {
2509 	int copied = 0;
2510 
2511 	while (1) {
2512 		struct unix_sock *u = unix_sk(sk);
2513 		struct sk_buff *skb;
2514 		int used, err;
2515 
2516 		mutex_lock(&u->iolock);
2517 		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2518 		mutex_unlock(&u->iolock);
2519 		if (!skb)
2520 			return err;
2521 
2522 		used = recv_actor(sk, skb);
2523 		if (used <= 0) {
2524 			if (!copied)
2525 				copied = used;
2526 			kfree_skb(skb);
2527 			break;
2528 		} else if (used <= skb->len) {
2529 			copied += used;
2530 		}
2531 
2532 		kfree_skb(skb);
2533 		break;
2534 	}
2535 
2536 	return copied;
2537 }
2538 
/*
 *	Sleep until more data has arrived. But check for races..
 *
 *	@last and @last_len describe the queue tail as the caller saw it;
 *	the wait ends as soon as the tail skb or its length differs, an
 *	error or receive shutdown is flagged, a signal is pending, the
 *	timeout has run out, or the socket is found dead after waking.
 *	With @freezable the sleep uses freezable_schedule_timeout().
 *
 *	Returns the remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		/* Register on the wait queue before testing the conditions. */
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		/* The state lock is dropped for the duration of the sleep. */
		unix_state_unlock(sk);
		if (freezable)
			timeo = freezable_schedule_timeout(timeo);
		else
			timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
2581 
2582 static unsigned int unix_skb_len(const struct sk_buff *skb)
2583 {
2584 	return skb->len - UNIXCB(skb).consumed;
2585 }
2586 
/*
 * Parameters for one unix_stream_read_generic() call, shared by the
 * recvmsg and splice read paths.
 */
struct unix_stream_read_state {
	/* Moves up to 'chunk' bytes of the skb, starting 'skip' bytes past
	 * the already consumed part; returns bytes moved or a negative value.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;		/* socket being read */
	struct msghdr *msg;		/* destination for recvmsg; unset for splice */
	struct pipe_inode_info *pipe;	/* destination for splice; unset for recvmsg */
	size_t size;			/* maximum number of bytes to read */
	int flags;			/* MSG_* flags */
	unsigned int splice_flags;	/* SPLICE_F_* flags, used by the splice actor */
};
2597 
2598 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2599 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2600 {
2601 	struct socket *sock = state->socket;
2602 	struct sock *sk = sock->sk;
2603 	struct unix_sock *u = unix_sk(sk);
2604 	int chunk = 1;
2605 	struct sk_buff *oob_skb;
2606 
2607 	mutex_lock(&u->iolock);
2608 	unix_state_lock(sk);
2609 
2610 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2611 		unix_state_unlock(sk);
2612 		mutex_unlock(&u->iolock);
2613 		return -EINVAL;
2614 	}
2615 
2616 	oob_skb = u->oob_skb;
2617 
2618 	if (!(state->flags & MSG_PEEK))
2619 		WRITE_ONCE(u->oob_skb, NULL);
2620 
2621 	unix_state_unlock(sk);
2622 
2623 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2624 
2625 	if (!(state->flags & MSG_PEEK)) {
2626 		UNIXCB(oob_skb).consumed += 1;
2627 		kfree_skb(oob_skb);
2628 	}
2629 
2630 	mutex_unlock(&u->iolock);
2631 
2632 	if (chunk < 0)
2633 		return -EFAULT;
2634 
2635 	state->msg->msg_flags |= MSG_OOB;
2636 	return 1;
2637 }
2638 
/*
 * Decide what the stream reader should do with @skb, the head of the
 * receive queue, with respect to out-of-band data.
 *
 * @copied is the number of bytes the current read has already gathered.
 *
 * Returns the skb to read from next, or NULL when the caller should stop
 * (if it already copied data) or look at the queue again.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		/* Fully consumed skb still on the queue: remove it. */
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else {
		if (skb == u->oob_skb) {
			if (copied) {
				/* Do not read across the OOB mark in one call. */
				skb = NULL;
			} else if (sock_flag(sk, SOCK_URGINLINE)) {
				/* Inline mode: the OOB skb is read as normal
				 * data; drop the OOB marker and its reference.
				 */
				if (!(flags & MSG_PEEK)) {
					WRITE_ONCE(u->oob_skb, NULL);
					consume_skb(skb);
				}
			} else if (!(flags & MSG_PEEK)) {
				/* Not inline: skip over the OOB skb and continue
				 * with the new head of the queue.
				 */
				skb_unlink(skb, &sk->sk_receive_queue);
				consume_skb(skb);
				skb = skb_peek(&sk->sk_receive_queue);
			}
		}
	}
	return skb;
}
2666 #endif
2667 
2668 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2669 {
2670 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2671 		return -ENOTCONN;
2672 
2673 	return unix_read_skb(sk, recv_actor);
2674 }
2675 
/*
 * Common implementation of stream reads (recvmsg and splice).
 *
 * Walks the receive queue under u->iolock and hands each skb to
 * state->recv_actor until state->size bytes were moved, the queue is
 * exhausted (and the low-water target is met or waiting is not possible),
 * or a condition that ends the read early is hit (credential change,
 * passed file descriptors, OOB mark, error).
 *
 * @freezable is passed through to unix_stream_data_wait().
 *
 * Returns the number of bytes moved if non-zero, otherwise the error code
 * (0 meaning end of stream).
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		/* OOB reads are handled separately, if supported at all. */
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		/* Remember the head and its length so the wait below can
		 * tell whether anything changed.
		 */
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* iolock is released while sleeping for more data. */
			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Skip over skbs that lie entirely before the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		/* NOTE(review): clearing the local 'sunaddr' does not change
		 * state->msg->msg_name, so the copy is repeated for each skb
		 * processed in this call - confirm whether that is intended.
		 */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Hold a reference across the actor call. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			/* Partially read skb stays queued; the read is done. */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop after an skb that carried file descriptors. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			/* Peeking leaves skbs queued: move on to the next one. */
			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	/* Deliver collected control data, or drop it when there is no msghdr. */
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2889 
2890 static int unix_stream_read_actor(struct sk_buff *skb,
2891 				  int skip, int chunk,
2892 				  struct unix_stream_read_state *state)
2893 {
2894 	int ret;
2895 
2896 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2897 				    state->msg, chunk);
2898 	return ret ?: chunk;
2899 }
2900 
2901 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2902 			  size_t size, int flags)
2903 {
2904 	struct unix_stream_read_state state = {
2905 		.recv_actor = unix_stream_read_actor,
2906 		.socket = sk->sk_socket,
2907 		.msg = msg,
2908 		.size = size,
2909 		.flags = flags
2910 	};
2911 
2912 	return unix_stream_read_generic(&state, true);
2913 }
2914 
2915 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2916 			       size_t size, int flags)
2917 {
2918 	struct unix_stream_read_state state = {
2919 		.recv_actor = unix_stream_read_actor,
2920 		.socket = sock,
2921 		.msg = msg,
2922 		.size = size,
2923 		.flags = flags
2924 	};
2925 
2926 #ifdef CONFIG_BPF_SYSCALL
2927 	struct sock *sk = sock->sk;
2928 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2929 
2930 	if (prot != &unix_stream_proto)
2931 		return prot->recvmsg(sk, msg, size, flags, NULL);
2932 #endif
2933 	return unix_stream_read_generic(&state, true);
2934 }
2935 
2936 static int unix_stream_splice_actor(struct sk_buff *skb,
2937 				    int skip, int chunk,
2938 				    struct unix_stream_read_state *state)
2939 {
2940 	return skb_splice_bits(skb, state->socket->sk,
2941 			       UNIXCB(skb).consumed + skip,
2942 			       state->pipe, chunk, state->splice_flags);
2943 }
2944 
2945 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2946 				       struct pipe_inode_info *pipe,
2947 				       size_t size, unsigned int flags)
2948 {
2949 	struct unix_stream_read_state state = {
2950 		.recv_actor = unix_stream_splice_actor,
2951 		.socket = sock,
2952 		.pipe = pipe,
2953 		.size = size,
2954 		.splice_flags = flags,
2955 	};
2956 
2957 	if (unlikely(*ppos))
2958 		return -ESPIPE;
2959 
2960 	if (sock->file->f_flags & O_NONBLOCK ||
2961 	    flags & SPLICE_F_NONBLOCK)
2962 		state.flags = MSG_DONTWAIT;
2963 
2964 	return unix_stream_read_generic(&state, false);
2965 }
2966 
2967 static int unix_shutdown(struct socket *sock, int mode)
2968 {
2969 	struct sock *sk = sock->sk;
2970 	struct sock *other;
2971 
2972 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2973 		return -EINVAL;
2974 	/* This maps:
2975 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2976 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2977 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2978 	 */
2979 	++mode;
2980 
2981 	unix_state_lock(sk);
2982 	sk->sk_shutdown |= mode;
2983 	other = unix_peer(sk);
2984 	if (other)
2985 		sock_hold(other);
2986 	unix_state_unlock(sk);
2987 	sk->sk_state_change(sk);
2988 
2989 	if (other &&
2990 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2991 
2992 		int peer_mode = 0;
2993 		const struct proto *prot = READ_ONCE(other->sk_prot);
2994 
2995 		if (prot->unhash)
2996 			prot->unhash(other);
2997 		if (mode&RCV_SHUTDOWN)
2998 			peer_mode |= SEND_SHUTDOWN;
2999 		if (mode&SEND_SHUTDOWN)
3000 			peer_mode |= RCV_SHUTDOWN;
3001 		unix_state_lock(other);
3002 		other->sk_shutdown |= peer_mode;
3003 		unix_state_unlock(other);
3004 		other->sk_state_change(other);
3005 		if (peer_mode == SHUTDOWN_MASK)
3006 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3007 		else if (peer_mode & RCV_SHUTDOWN)
3008 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3009 	}
3010 	if (other)
3011 		sock_put(other);
3012 
3013 	return 0;
3014 }
3015 
3016 long unix_inq_len(struct sock *sk)
3017 {
3018 	struct sk_buff *skb;
3019 	long amount = 0;
3020 
3021 	if (sk->sk_state == TCP_LISTEN)
3022 		return -EINVAL;
3023 
3024 	spin_lock(&sk->sk_receive_queue.lock);
3025 	if (sk->sk_type == SOCK_STREAM ||
3026 	    sk->sk_type == SOCK_SEQPACKET) {
3027 		skb_queue_walk(&sk->sk_receive_queue, skb)
3028 			amount += unix_skb_len(skb);
3029 	} else {
3030 		skb = skb_peek(&sk->sk_receive_queue);
3031 		if (skb)
3032 			amount = skb->len;
3033 	}
3034 	spin_unlock(&sk->sk_receive_queue.lock);
3035 
3036 	return amount;
3037 }
3038 EXPORT_SYMBOL_GPL(unix_inq_len);
3039 
/* Send-side memory currently charged to @sk, as reported by SIOCOUTQ. */
long unix_outq_len(struct sock *sk)
{
	long charged = sk_wmem_alloc_get(sk);

	return charged;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3045 
3046 static int unix_open_file(struct sock *sk)
3047 {
3048 	struct path path;
3049 	struct file *f;
3050 	int fd;
3051 
3052 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3053 		return -EPERM;
3054 
3055 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3056 		return -ENOENT;
3057 
3058 	path = unix_sk(sk)->path;
3059 	if (!path.dentry)
3060 		return -ENOENT;
3061 
3062 	path_get(&path);
3063 
3064 	fd = get_unused_fd_flags(O_CLOEXEC);
3065 	if (fd < 0)
3066 		goto out;
3067 
3068 	f = dentry_open(&path, O_PATH, current_cred());
3069 	if (IS_ERR(f)) {
3070 		put_unused_fd(fd);
3071 		fd = PTR_ERR(f);
3072 		goto out;
3073 	}
3074 
3075 	fd_install(fd, f);
3076 out:
3077 	path_put(&path);
3078 
3079 	return fd;
3080 }
3081 
3082 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3083 {
3084 	struct sock *sk = sock->sk;
3085 	long amount = 0;
3086 	int err;
3087 
3088 	switch (cmd) {
3089 	case SIOCOUTQ:
3090 		amount = unix_outq_len(sk);
3091 		err = put_user(amount, (int __user *)arg);
3092 		break;
3093 	case SIOCINQ:
3094 		amount = unix_inq_len(sk);
3095 		if (amount < 0)
3096 			err = amount;
3097 		else
3098 			err = put_user(amount, (int __user *)arg);
3099 		break;
3100 	case SIOCUNIXFILE:
3101 		err = unix_open_file(sk);
3102 		break;
3103 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3104 	case SIOCATMARK:
3105 		{
3106 			struct sk_buff *skb;
3107 			int answ = 0;
3108 
3109 			skb = skb_peek(&sk->sk_receive_queue);
3110 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3111 				answ = 1;
3112 			err = put_user(answ, (int __user *)arg);
3113 		}
3114 		break;
3115 #endif
3116 	default:
3117 		err = -ENOIOCTLCMD;
3118 		break;
3119 	}
3120 	return err;
3121 }
3122 
3123 #ifdef CONFIG_COMPAT
/* Compat ioctl(): convert the user pointer and reuse the native handler. */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3128 #endif
3129 
3130 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3131 {
3132 	struct sock *sk = sock->sk;
3133 	__poll_t mask;
3134 
3135 	sock_poll_wait(file, sock, wait);
3136 	mask = 0;
3137 
3138 	/* exceptional events? */
3139 	if (sk->sk_err)
3140 		mask |= EPOLLERR;
3141 	if (sk->sk_shutdown == SHUTDOWN_MASK)
3142 		mask |= EPOLLHUP;
3143 	if (sk->sk_shutdown & RCV_SHUTDOWN)
3144 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3145 
3146 	/* readable? */
3147 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3148 		mask |= EPOLLIN | EPOLLRDNORM;
3149 	if (sk_is_readable(sk))
3150 		mask |= EPOLLIN | EPOLLRDNORM;
3151 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3152 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3153 		mask |= EPOLLPRI;
3154 #endif
3155 
3156 	/* Connection-based need to check for termination and startup */
3157 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3158 	    sk->sk_state == TCP_CLOSE)
3159 		mask |= EPOLLHUP;
3160 
3161 	/*
3162 	 * we set writable also when the other side has shut down the
3163 	 * connection. This prevents stuck sockets.
3164 	 */
3165 	if (unix_writable(sk))
3166 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3167 
3168 	return mask;
3169 }
3170 
3171 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3172 				    poll_table *wait)
3173 {
3174 	struct sock *sk = sock->sk, *other;
3175 	unsigned int writable;
3176 	__poll_t mask;
3177 
3178 	sock_poll_wait(file, sock, wait);
3179 	mask = 0;
3180 
3181 	/* exceptional events? */
3182 	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3183 		mask |= EPOLLERR |
3184 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3185 
3186 	if (sk->sk_shutdown & RCV_SHUTDOWN)
3187 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3188 	if (sk->sk_shutdown == SHUTDOWN_MASK)
3189 		mask |= EPOLLHUP;
3190 
3191 	/* readable? */
3192 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3193 		mask |= EPOLLIN | EPOLLRDNORM;
3194 	if (sk_is_readable(sk))
3195 		mask |= EPOLLIN | EPOLLRDNORM;
3196 
3197 	/* Connection-based need to check for termination and startup */
3198 	if (sk->sk_type == SOCK_SEQPACKET) {
3199 		if (sk->sk_state == TCP_CLOSE)
3200 			mask |= EPOLLHUP;
3201 		/* connection hasn't started yet? */
3202 		if (sk->sk_state == TCP_SYN_SENT)
3203 			return mask;
3204 	}
3205 
3206 	/* No write status requested, avoid expensive OUT tests. */
3207 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3208 		return mask;
3209 
3210 	writable = unix_writable(sk);
3211 	if (writable) {
3212 		unix_state_lock(sk);
3213 
3214 		other = unix_peer(sk);
3215 		if (other && unix_peer(other) != sk &&
3216 		    unix_recvq_full_lockless(other) &&
3217 		    unix_dgram_peer_wake_me(sk, other))
3218 			writable = 0;
3219 
3220 		unix_state_unlock(sk);
3221 	}
3222 
3223 	if (writable)
3224 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3225 	else
3226 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3227 
3228 	return mask;
3229 }
3230 
3231 #ifdef CONFIG_PROC_FS
3232 
/*
 * The seq_file position packs a hash bucket number into the high bits and
 * an offset within that bucket into the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Bucket number stored in position x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket stored in position x. */
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* Build a position from bucket b and offset o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3238 
3239 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3240 {
3241 	unsigned long offset = get_offset(*pos);
3242 	unsigned long bucket = get_bucket(*pos);
3243 	unsigned long count = 0;
3244 	struct sock *sk;
3245 
3246 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3247 	     sk; sk = sk_next(sk)) {
3248 		if (++count == offset)
3249 			break;
3250 	}
3251 
3252 	return sk;
3253 }
3254 
3255 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3256 {
3257 	unsigned long bucket = get_bucket(*pos);
3258 	struct net *net = seq_file_net(seq);
3259 	struct sock *sk;
3260 
3261 	while (bucket < UNIX_HASH_SIZE) {
3262 		spin_lock(&net->unx.table.locks[bucket]);
3263 
3264 		sk = unix_from_bucket(seq, pos);
3265 		if (sk)
3266 			return sk;
3267 
3268 		spin_unlock(&net->unx.table.locks[bucket]);
3269 
3270 		*pos = set_bucket_offset(++bucket, 1);
3271 	}
3272 
3273 	return NULL;
3274 }
3275 
3276 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3277 				  loff_t *pos)
3278 {
3279 	unsigned long bucket = get_bucket(*pos);
3280 
3281 	sk = sk_next(sk);
3282 	if (sk)
3283 		return sk;
3284 
3285 
3286 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3287 
3288 	*pos = set_bucket_offset(++bucket, 1);
3289 
3290 	return unix_get_first(seq, pos);
3291 }
3292 
3293 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3294 {
3295 	if (!*pos)
3296 		return SEQ_START_TOKEN;
3297 
3298 	return unix_get_first(seq, pos);
3299 }
3300 
3301 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3302 {
3303 	++*pos;
3304 
3305 	if (v == SEQ_START_TOKEN)
3306 		return unix_get_first(seq, pos);
3307 
3308 	return unix_get_next(seq, v, pos);
3309 }
3310 
3311 static void unix_seq_stop(struct seq_file *seq, void *v)
3312 {
3313 	struct sock *sk = v;
3314 
3315 	if (sk)
3316 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3317 }
3318 
3319 static int unix_seq_show(struct seq_file *seq, void *v)
3320 {
3321 
3322 	if (v == SEQ_START_TOKEN)
3323 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3324 			 "Inode Path\n");
3325 	else {
3326 		struct sock *s = v;
3327 		struct unix_sock *u = unix_sk(s);
3328 		unix_state_lock(s);
3329 
3330 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3331 			s,
3332 			refcount_read(&s->sk_refcnt),
3333 			0,
3334 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3335 			s->sk_type,
3336 			s->sk_socket ?
3337 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3338 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3339 			sock_i_ino(s));
3340 
3341 		if (u->addr) {	// under a hash table lock here
3342 			int i, len;
3343 			seq_putc(seq, ' ');
3344 
3345 			i = 0;
3346 			len = u->addr->len -
3347 				offsetof(struct sockaddr_un, sun_path);
3348 			if (u->addr->name->sun_path[0]) {
3349 				len--;
3350 			} else {
3351 				seq_putc(seq, '@');
3352 				i++;
3353 			}
3354 			for ( ; i < len; i++)
3355 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3356 					 '@');
3357 		}
3358 		unix_state_unlock(s);
3359 		seq_putc(seq, '\n');
3360 	}
3361 
3362 	return 0;
3363 }
3364 
/* seq_file iterator callbacks that walk the AF_UNIX socket hash table. */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
3371 
3372 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Private seq_file state of the BPF "unix" iterator. */
struct bpf_unix_iter_state {
	/* NOTE(review): presumably must stay first, since the same private
	 * pointer is handed to bpf_iter_init_seq_net() - confirm.
	 */
	struct seq_net_private p;
	unsigned int cur_sk;	/* index in batch[] of the socket being shown */
	unsigned int end_sk;	/* number of sockets currently held in batch[] */
	unsigned int max_sk;	/* capacity of batch[], in entries */
	struct sock **batch;	/* sockets pinned with sock_hold() */
	/* Set once a whole bucket fit into batch[]; the next batch then
	 * starts at the following bucket.
	 */
	bool st_bucket_done;
};
3381 
/* Context passed to a BPF program attached to the "unix" iterator. */
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);	/* iterator metadata */
	__bpf_md_ptr(struct unix_sock *, unix_sk);	/* socket being visited; NULL on the final call from ->stop() */
	uid_t uid __aligned(8);				/* uid reported for the socket */
};
3387 
3388 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3389 			      struct unix_sock *unix_sk, uid_t uid)
3390 {
3391 	struct bpf_iter__unix ctx;
3392 
3393 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3394 	ctx.meta = meta;
3395 	ctx.unix_sk = unix_sk;
3396 	ctx.uid = uid;
3397 	return bpf_iter_run_prog(prog, &ctx);
3398 }
3399 
3400 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3401 
3402 {
3403 	struct bpf_unix_iter_state *iter = seq->private;
3404 	unsigned int expected = 1;
3405 	struct sock *sk;
3406 
3407 	sock_hold(start_sk);
3408 	iter->batch[iter->end_sk++] = start_sk;
3409 
3410 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3411 		if (iter->end_sk < iter->max_sk) {
3412 			sock_hold(sk);
3413 			iter->batch[iter->end_sk++] = sk;
3414 		}
3415 
3416 		expected++;
3417 	}
3418 
3419 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3420 
3421 	return expected;
3422 }
3423 
3424 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3425 {
3426 	while (iter->cur_sk < iter->end_sk)
3427 		sock_put(iter->batch[iter->cur_sk++]);
3428 }
3429 
3430 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3431 				       unsigned int new_batch_sz)
3432 {
3433 	struct sock **new_batch;
3434 
3435 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3436 			     GFP_USER | __GFP_NOWARN);
3437 	if (!new_batch)
3438 		return -ENOMEM;
3439 
3440 	bpf_iter_unix_put_batch(iter);
3441 	kvfree(iter->batch);
3442 	iter->batch = new_batch;
3443 	iter->max_sk = new_batch_sz;
3444 
3445 	return 0;
3446 }
3447 
3448 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3449 					loff_t *pos)
3450 {
3451 	struct bpf_unix_iter_state *iter = seq->private;
3452 	unsigned int expected;
3453 	bool resized = false;
3454 	struct sock *sk;
3455 
3456 	if (iter->st_bucket_done)
3457 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3458 
3459 again:
3460 	/* Get a new batch */
3461 	iter->cur_sk = 0;
3462 	iter->end_sk = 0;
3463 
3464 	sk = unix_get_first(seq, pos);
3465 	if (!sk)
3466 		return NULL; /* Done */
3467 
3468 	expected = bpf_iter_unix_hold_batch(seq, sk);
3469 
3470 	if (iter->end_sk == expected) {
3471 		iter->st_bucket_done = true;
3472 		return sk;
3473 	}
3474 
3475 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3476 		resized = true;
3477 		goto again;
3478 	}
3479 
3480 	return sk;
3481 }
3482 
3483 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3484 {
3485 	if (!*pos)
3486 		return SEQ_START_TOKEN;
3487 
3488 	/* bpf iter does not support lseek, so it always
3489 	 * continue from where it was stop()-ped.
3490 	 */
3491 	return bpf_iter_unix_batch(seq, pos);
3492 }
3493 
3494 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3495 {
3496 	struct bpf_unix_iter_state *iter = seq->private;
3497 	struct sock *sk;
3498 
3499 	/* Whenever seq_next() is called, the iter->cur_sk is
3500 	 * done with seq_show(), so advance to the next sk in
3501 	 * the batch.
3502 	 */
3503 	if (iter->cur_sk < iter->end_sk)
3504 		sock_put(iter->batch[iter->cur_sk++]);
3505 
3506 	++*pos;
3507 
3508 	if (iter->cur_sk < iter->end_sk)
3509 		sk = iter->batch[iter->cur_sk];
3510 	else
3511 		sk = bpf_iter_unix_batch(seq, pos);
3512 
3513 	return sk;
3514 }
3515 
3516 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3517 {
3518 	struct bpf_iter_meta meta;
3519 	struct bpf_prog *prog;
3520 	struct sock *sk = v;
3521 	uid_t uid;
3522 	bool slow;
3523 	int ret;
3524 
3525 	if (v == SEQ_START_TOKEN)
3526 		return 0;
3527 
3528 	slow = lock_sock_fast(sk);
3529 
3530 	if (unlikely(sk_unhashed(sk))) {
3531 		ret = SEQ_SKIP;
3532 		goto unlock;
3533 	}
3534 
3535 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3536 	meta.seq = seq;
3537 	prog = bpf_iter_get_info(&meta, false);
3538 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3539 unlock:
3540 	unlock_sock_fast(sk, slow);
3541 	return ret;
3542 }
3543 
3544 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3545 {
3546 	struct bpf_unix_iter_state *iter = seq->private;
3547 	struct bpf_iter_meta meta;
3548 	struct bpf_prog *prog;
3549 
3550 	if (!v) {
3551 		meta.seq = seq;
3552 		prog = bpf_iter_get_info(&meta, true);
3553 		if (prog)
3554 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3555 	}
3556 
3557 	if (iter->cur_sk < iter->end_sk)
3558 		bpf_iter_unix_put_batch(iter);
3559 }
3560 
3561 static const struct seq_operations bpf_iter_unix_seq_ops = {
3562 	.start	= bpf_iter_unix_seq_start,
3563 	.next	= bpf_iter_unix_seq_next,
3564 	.stop	= bpf_iter_unix_seq_stop,
3565 	.show	= bpf_iter_unix_seq_show,
3566 };
3567 #endif
3568 #endif
3569 
3570 static const struct net_proto_family unix_family_ops = {
3571 	.family = PF_UNIX,
3572 	.create = unix_create,
3573 	.owner	= THIS_MODULE,
3574 };
3575 
3576 
/*
 * Per-network-namespace initialisation.
 *
 * Sets the default datagram queue length, registers the sysctl table,
 * creates the "unix" proc entry (CONFIG_PROC_FS only) and allocates and
 * initialises the UNIX_HASH_SIZE-entry lock and bucket arrays.
 *
 * Returns 0 on success.  Every failure path undoes the steps already
 * completed and returns -ENOMEM.
 */
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	/* One spinlock per hash bucket. */
	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

	/* Unwind in reverse order of setup. */
free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
3620 
/*
 * Per-network-namespace teardown: frees the bucket and lock arrays,
 * unregisters the sysctl table and removes the "unix" proc entry.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	/* NOTE(review): called without a CONFIG_PROC_FS guard, unlike the
	 * matching proc_create_net() in unix_net_init() - presumably a
	 * no-op stub without procfs; confirm.
	 */
	remove_proc_entry("unix", net->proc_net);
}
3628 
3629 static struct pernet_operations unix_net_ops = {
3630 	.init = unix_net_init,
3631 	.exit = unix_net_exit,
3632 };
3633 
3634 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the BPF iterator entry point for the "unix" target; the
 * arguments mirror the fields of struct bpf_iter__unix.
 */
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

/* Initial capacity, in sockets, of the iterator batch array. */
#define INIT_BATCH_SZ 16
3639 
3640 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3641 {
3642 	struct bpf_unix_iter_state *iter = priv_data;
3643 	int err;
3644 
3645 	err = bpf_iter_init_seq_net(priv_data, aux);
3646 	if (err)
3647 		return err;
3648 
3649 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3650 	if (err) {
3651 		bpf_iter_fini_seq_net(priv_data);
3652 		return err;
3653 	}
3654 
3655 	return 0;
3656 }
3657 
3658 static void bpf_iter_fini_unix(void *priv_data)
3659 {
3660 	struct bpf_unix_iter_state *iter = priv_data;
3661 
3662 	bpf_iter_fini_seq_net(priv_data);
3663 	kvfree(iter->batch);
3664 }
3665 
3666 static const struct bpf_iter_seq_info unix_seq_info = {
3667 	.seq_ops		= &bpf_iter_unix_seq_ops,
3668 	.init_seq_private	= bpf_iter_init_unix,
3669 	.fini_seq_private	= bpf_iter_fini_unix,
3670 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3671 };
3672 
3673 static const struct bpf_func_proto *
3674 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3675 			     const struct bpf_prog *prog)
3676 {
3677 	switch (func_id) {
3678 	case BPF_FUNC_setsockopt:
3679 		return &bpf_sk_setsockopt_proto;
3680 	case BPF_FUNC_getsockopt:
3681 		return &bpf_sk_getsockopt_proto;
3682 	default:
3683 		return NULL;
3684 	}
3685 }
3686 
3687 static struct bpf_iter_reg unix_reg_info = {
3688 	.target			= "unix",
3689 	.ctx_arg_info_size	= 1,
3690 	.ctx_arg_info		= {
3691 		{ offsetof(struct bpf_iter__unix, unix_sk),
3692 		  PTR_TO_BTF_ID_OR_NULL },
3693 	},
3694 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3695 	.seq_info		= &unix_seq_info,
3696 };
3697 
3698 static void __init bpf_iter_register(void)
3699 {
3700 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3701 	if (bpf_iter_reg_target(&unix_reg_info))
3702 		pr_warn("Warning: could not register bpf iterator unix\n");
3703 }
3704 #endif
3705 
3706 static int __init af_unix_init(void)
3707 {
3708 	int i, rc = -1;
3709 
3710 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3711 
3712 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3713 		spin_lock_init(&bsd_socket_locks[i]);
3714 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3715 	}
3716 
3717 	rc = proto_register(&unix_dgram_proto, 1);
3718 	if (rc != 0) {
3719 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3720 		goto out;
3721 	}
3722 
3723 	rc = proto_register(&unix_stream_proto, 1);
3724 	if (rc != 0) {
3725 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3726 		goto out;
3727 	}
3728 
3729 	sock_register(&unix_family_ops);
3730 	register_pernet_subsys(&unix_net_ops);
3731 	unix_bpf_build_proto();
3732 
3733 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3734 	bpf_iter_register();
3735 #endif
3736 
3737 out:
3738 	return rc;
3739 }
3740 
/*
 * Module exit: unregisters the PF_UNIX socket family, both protos and
 * the per-netns operations.
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
3748 
/*
 * Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
3758