xref: /linux/net/unix/af_unix.c (revision bd765cc910127ee8ed6cd83dae0f0bfbca69d71e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it avoids a huge amount
34  *					of socks being hashed (this is for
35  *					unix_gc() performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
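
/* Illustrative userspace sketch of the two name spaces described above
 * (one bind() or the other per socket; path and name are examples only):
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	Filesystem name - a NUL-terminated path in sun_path:
 *		strcpy(sun.sun_path, "/tmp/example.sock");
 *		bind(fd, (struct sockaddr *)&sun, sizeof(sun));
 *
 *	Abstract name - sun_path starts with a zero byte and the address
 *	length, not a terminator, delimits the name:
 *		memcpy(sun.sun_path + 1, "example", 7);
 *		bind(fd, (struct sockaddr *)&sun,
 *		     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */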
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 
130 static unsigned int unix_unbound_hash(struct sock *sk)
131 {
132 	unsigned long hash = (unsigned long)sk;
133 
134 	hash ^= hash >> 16;
135 	hash ^= hash >> 8;
136 	hash ^= sk->sk_type;
137 
138 	return hash & UNIX_HASH_MOD;
139 }
140 
141 static unsigned int unix_bsd_hash(struct inode *i)
142 {
143 	return i->i_ino & UNIX_HASH_MOD;
144 }
145 
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 				       int addr_len, int type)
148 {
149 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150 	unsigned int hash;
151 
152 	hash = (__force unsigned int)csum_fold(csum);
153 	hash ^= hash >> 8;
154 	hash ^= type;
155 
156 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157 }
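
/* A sketch of how the three hashes above partition net->unx.table.buckets[]
 * (assuming UNIX_HASH_SIZE == 2 * (UNIX_HASH_MOD + 1), which the abstract
 * hash relies on to stay inside the table):
 *
 *	[0 .. UNIX_HASH_MOD]				unbound and BSD (path) sockets
 *	[UNIX_HASH_MOD + 1 .. 2 * UNIX_HASH_MOD + 1]	abstract sockets
 *
 * so an abstract address never shares a bucket with an unbound or
 * path-bound socket.  Path-bound sockets are additionally linked into
 * bsd_socket_buckets[] above, keyed by inode number, for lookup by path.
 */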
158 
159 static void unix_table_double_lock(struct net *net,
160 				   unsigned int hash1, unsigned int hash2)
161 {
162 	if (hash1 == hash2) {
163 		spin_lock(&net->unx.table.locks[hash1]);
164 		return;
165 	}
166 
167 	if (hash1 > hash2)
168 		swap(hash1, hash2);
169 
170 	spin_lock(&net->unx.table.locks[hash1]);
171 	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172 }
173 
174 static void unix_table_double_unlock(struct net *net,
175 				     unsigned int hash1, unsigned int hash2)
176 {
177 	if (hash1 == hash2) {
178 		spin_unlock(&net->unx.table.locks[hash1]);
179 		return;
180 	}
181 
182 	spin_unlock(&net->unx.table.locks[hash1]);
183 	spin_unlock(&net->unx.table.locks[hash2]);
184 }
185 
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188 {
189 	UNIXCB(skb).secid = scm->secid;
190 }
191 
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193 {
194 	scm->secid = UNIXCB(skb).secid;
195 }
196 
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198 {
199 	return (scm->secid == UNIXCB(skb).secid);
200 }
201 #else
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203 { }
204 
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206 { }
207 
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209 {
210 	return true;
211 }
212 #endif /* CONFIG_SECURITY_NETWORK */
213 
214 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
215 {
216 	return unix_peer(osk) == sk;
217 }
218 
219 static inline int unix_may_send(struct sock *sk, struct sock *osk)
220 {
221 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
222 }
223 
224 static inline int unix_recvq_full(const struct sock *sk)
225 {
226 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
227 }
228 
229 static inline int unix_recvq_full_lockless(const struct sock *sk)
230 {
231 	return skb_queue_len_lockless(&sk->sk_receive_queue) >
232 		READ_ONCE(sk->sk_max_ack_backlog);
233 }
234 
235 struct sock *unix_peer_get(struct sock *s)
236 {
237 	struct sock *peer;
238 
239 	unix_state_lock(s);
240 	peer = unix_peer(s);
241 	if (peer)
242 		sock_hold(peer);
243 	unix_state_unlock(s);
244 	return peer;
245 }
246 EXPORT_SYMBOL_GPL(unix_peer_get);
247 
248 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
249 					     int addr_len)
250 {
251 	struct unix_address *addr;
252 
253 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
254 	if (!addr)
255 		return NULL;
256 
257 	refcount_set(&addr->refcnt, 1);
258 	addr->len = addr_len;
259 	memcpy(addr->name, sunaddr, addr_len);
260 
261 	return addr;
262 }
263 
264 static inline void unix_release_addr(struct unix_address *addr)
265 {
266 	if (refcount_dec_and_test(&addr->refcnt))
267 		kfree(addr);
268 }
269 
270 /*
271  *	Check unix socket name:
272  *		- it should not be zero length.
273  *		- if it does not start with a zero byte, it should be NUL terminated (FS object)
274  *		- if it starts with a zero byte, it is an abstract name.
275  */
276 
277 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
278 {
279 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
280 	    addr_len > sizeof(*sunaddr))
281 		return -EINVAL;
282 
283 	if (sunaddr->sun_family != AF_UNIX)
284 		return -EINVAL;
285 
286 	return 0;
287 }
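
/* Note: an addr_len of exactly offsetof(struct sockaddr_un, sun_path)
 * fails the first check above; for bind() that case never reaches this
 * helper because unix_bind() treats it as an autobind request.
 */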
288 
289 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
290 {
291 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
292 	short offset = offsetof(struct sockaddr_storage, __data);
293 
294 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
295 
296 	/* This may look like an off by one error but it is a bit more
297 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
298 	 * sun_path[108] doesn't as such exist.  However in kernel space
299 	 * we are guaranteed that it is a valid memory location in our
300 	 * kernel address buffer because syscall functions always pass
301 	 * a pointer to struct sockaddr_storage, which has a bigger buffer
302 	 * than 108.  Also, we must terminate sun_path for strlen() in
303 	 * getname_kernel().
304 	 */
305 	addr->__data[addr_len - offset] = 0;
306 
307 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, a 108-byte path
308 	 * will cause a panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
309 	 * know the actual buffer.
310 	 */
311 	return strlen(addr->__data) + offset + 1;
312 }
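
/* A worked example of the fixup above: if the caller passed
 * sun_path = "/tmp/x" with its trailing NUL and
 * addr_len = offsetof(struct sockaddr_un, sun_path) + 7, then __data[7]
 * (one byte past the caller's terminator) is set to 0, strlen() sees
 * 6 bytes, and the function returns offsetof(...) + 6 + 1 == addr_len.
 * If the path had filled sun_path without a terminator, the store above
 * supplies one inside sockaddr_storage, so strlen() cannot overrun.
 */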
313 
314 static void __unix_remove_socket(struct sock *sk)
315 {
316 	sk_del_node_init(sk);
317 }
318 
319 static void __unix_insert_socket(struct net *net, struct sock *sk)
320 {
321 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
322 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
323 }
324 
325 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
326 				 struct unix_address *addr, unsigned int hash)
327 {
328 	__unix_remove_socket(sk);
329 	smp_store_release(&unix_sk(sk)->addr, addr);
330 
331 	sk->sk_hash = hash;
332 	__unix_insert_socket(net, sk);
333 }
334 
335 static void unix_remove_socket(struct net *net, struct sock *sk)
336 {
337 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
338 	__unix_remove_socket(sk);
339 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
340 }
341 
342 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
343 {
344 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
345 	__unix_insert_socket(net, sk);
346 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
347 }
348 
349 static void unix_insert_bsd_socket(struct sock *sk)
350 {
351 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
352 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
353 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
354 }
355 
356 static void unix_remove_bsd_socket(struct sock *sk)
357 {
358 	if (!hlist_unhashed(&sk->sk_bind_node)) {
359 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
360 		__sk_del_bind_node(sk);
361 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
362 
363 		sk_node_init(&sk->sk_bind_node);
364 	}
365 }
366 
367 static struct sock *__unix_find_socket_byname(struct net *net,
368 					      struct sockaddr_un *sunname,
369 					      int len, unsigned int hash)
370 {
371 	struct sock *s;
372 
373 	sk_for_each(s, &net->unx.table.buckets[hash]) {
374 		struct unix_sock *u = unix_sk(s);
375 
376 		if (u->addr->len == len &&
377 		    !memcmp(u->addr->name, sunname, len))
378 			return s;
379 	}
380 	return NULL;
381 }
382 
383 static inline struct sock *unix_find_socket_byname(struct net *net,
384 						   struct sockaddr_un *sunname,
385 						   int len, unsigned int hash)
386 {
387 	struct sock *s;
388 
389 	spin_lock(&net->unx.table.locks[hash]);
390 	s = __unix_find_socket_byname(net, sunname, len, hash);
391 	if (s)
392 		sock_hold(s);
393 	spin_unlock(&net->unx.table.locks[hash]);
394 	return s;
395 }
396 
397 static struct sock *unix_find_socket_byinode(struct inode *i)
398 {
399 	unsigned int hash = unix_bsd_hash(i);
400 	struct sock *s;
401 
402 	spin_lock(&bsd_socket_locks[hash]);
403 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
404 		struct dentry *dentry = unix_sk(s)->path.dentry;
405 
406 		if (dentry && d_backing_inode(dentry) == i) {
407 			sock_hold(s);
408 			spin_unlock(&bsd_socket_locks[hash]);
409 			return s;
410 		}
411 	}
412 	spin_unlock(&bsd_socket_locks[hash]);
413 	return NULL;
414 }
415 
416 /* Support code for asymmetrically connected dgram sockets
417  *
418  * If a datagram socket is connected to a socket not itself connected
419  * to the first socket (e.g., /dev/log), clients may only enqueue more
420  * messages if the present receive queue of the server socket is not
421  * "too large". This means there's a second writeability condition
422  * poll and sendmsg need to test. The dgram recv code will do a wake
423  * up on the peer_wait wait queue of a socket upon reception of a
424  * datagram which needs to be propagated to sleeping would-be writers
425  * since these might not have sent anything so far. This can't be
426  * accomplished via poll_wait because the lifetime of the server
427  * socket might be less than that of its clients if these break their
428  * association with it or if the server socket is closed while clients
429  * are still connected to it, and there's no way to inform "a polling
430  * implementation" that it should let go of a certain wait queue.
431  *
432  * In order to propagate a wake up, a wait_queue_entry_t of the client
433  * socket is enqueued on the peer_wait queue of the server socket
434  * whose wake function does a wake_up on the ordinary client socket
435  * wait queue. This connection is established whenever a write (or
436  * poll for write) hits the flow control condition and is broken when the
437  * association to the server socket is dissolved or after a wake up
438  * was relayed.
439  */
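
/* A rough sequence sketch of the relay described above (the helpers are
 * defined below):
 *
 *	client poll()/sendmsg()
 *	  -> unix_dgram_peer_wake_me(sk, other)
 *	       peer queue full: unix_dgram_peer_wake_connect() hooks sk's
 *	       peer_wake entry onto other's peer_wait and reports "wait"
 *	server recvmsg()
 *	  -> wake up on unix_sk(server)->peer_wait
 *	       -> unix_dgram_peer_wake_relay()
 *	            unhooks the entry and wakes the client's own sk_sleep()
 *	            queue, so a blocked poll()/write() re-evaluates.
 */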
440 
441 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
442 				      void *key)
443 {
444 	struct unix_sock *u;
445 	wait_queue_head_t *u_sleep;
446 
447 	u = container_of(q, struct unix_sock, peer_wake);
448 
449 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
450 			    q);
451 	u->peer_wake.private = NULL;
452 
453 	/* relaying can only happen while the wq still exists */
454 	u_sleep = sk_sleep(&u->sk);
455 	if (u_sleep)
456 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
457 
458 	return 0;
459 }
460 
461 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
462 {
463 	struct unix_sock *u, *u_other;
464 	int rc;
465 
466 	u = unix_sk(sk);
467 	u_other = unix_sk(other);
468 	rc = 0;
469 	spin_lock(&u_other->peer_wait.lock);
470 
471 	if (!u->peer_wake.private) {
472 		u->peer_wake.private = other;
473 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
474 
475 		rc = 1;
476 	}
477 
478 	spin_unlock(&u_other->peer_wait.lock);
479 	return rc;
480 }
481 
482 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
483 					    struct sock *other)
484 {
485 	struct unix_sock *u, *u_other;
486 
487 	u = unix_sk(sk);
488 	u_other = unix_sk(other);
489 	spin_lock(&u_other->peer_wait.lock);
490 
491 	if (u->peer_wake.private == other) {
492 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
493 		u->peer_wake.private = NULL;
494 	}
495 
496 	spin_unlock(&u_other->peer_wait.lock);
497 }
498 
499 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
500 						   struct sock *other)
501 {
502 	unix_dgram_peer_wake_disconnect(sk, other);
503 	wake_up_interruptible_poll(sk_sleep(sk),
504 				   EPOLLOUT |
505 				   EPOLLWRNORM |
506 				   EPOLLWRBAND);
507 }
508 
509 /* preconditions:
510  *	- unix_peer(sk) == other
511  *	- association is stable
512  */
513 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
514 {
515 	int connected;
516 
517 	connected = unix_dgram_peer_wake_connect(sk, other);
518 
519 	/* If other is SOCK_DEAD, we want to make sure we signal
520 	 * POLLOUT, such that a subsequent write() can get a
521 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
522 	 * to other and it's full, we will hang waiting for POLLOUT.
523 	 */
524 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
525 		return 1;
526 
527 	if (connected)
528 		unix_dgram_peer_wake_disconnect(sk, other);
529 
530 	return 0;
531 }
532 
533 static int unix_writable(const struct sock *sk)
534 {
535 	return sk->sk_state != TCP_LISTEN &&
536 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
537 }
538 
539 static void unix_write_space(struct sock *sk)
540 {
541 	struct socket_wq *wq;
542 
543 	rcu_read_lock();
544 	if (unix_writable(sk)) {
545 		wq = rcu_dereference(sk->sk_wq);
546 		if (skwq_has_sleeper(wq))
547 			wake_up_interruptible_sync_poll(&wq->wait,
548 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
549 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
550 	}
551 	rcu_read_unlock();
552 }
553 
554 /* When a dgram socket disconnects (or changes its peer), we clear its receive
555  * queue of packets that arrived from the previous peer. First, this allows
556  * flow control based only on wmem_alloc; second, an sk connected to a peer
557  * may receive messages only from that peer. */
558 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
559 {
560 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
561 		skb_queue_purge(&sk->sk_receive_queue);
562 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
563 
564 		/* If one link of a bidirectional dgram pipe is disconnected,
565 		 * we signal an error. Messages are lost. Do not do this
566 		 * when the peer was not connected to us.
567 		 */
568 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
569 			WRITE_ONCE(other->sk_err, ECONNRESET);
570 			sk_error_report(other);
571 		}
572 	}
573 	other->sk_state = TCP_CLOSE;
574 }
575 
576 static void unix_sock_destructor(struct sock *sk)
577 {
578 	struct unix_sock *u = unix_sk(sk);
579 
580 	skb_queue_purge(&sk->sk_receive_queue);
581 
582 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
583 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
584 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
585 	if (!sock_flag(sk, SOCK_DEAD)) {
586 		pr_info("Attempt to release alive unix socket: %p\n", sk);
587 		return;
588 	}
589 
590 	if (u->addr)
591 		unix_release_addr(u->addr);
592 
593 	atomic_long_dec(&unix_nr_socks);
594 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
595 #ifdef UNIX_REFCNT_DEBUG
596 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
597 		atomic_long_read(&unix_nr_socks));
598 #endif
599 }
600 
601 static void unix_release_sock(struct sock *sk, int embrion)
602 {
603 	struct unix_sock *u = unix_sk(sk);
604 	struct sock *skpair;
605 	struct sk_buff *skb;
606 	struct path path;
607 	int state;
608 
609 	unix_remove_socket(sock_net(sk), sk);
610 	unix_remove_bsd_socket(sk);
611 
612 	/* Clear state */
613 	unix_state_lock(sk);
614 	sock_orphan(sk);
615 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
616 	path	     = u->path;
617 	u->path.dentry = NULL;
618 	u->path.mnt = NULL;
619 	state = sk->sk_state;
620 	sk->sk_state = TCP_CLOSE;
621 
622 	skpair = unix_peer(sk);
623 	unix_peer(sk) = NULL;
624 
625 	unix_state_unlock(sk);
626 
627 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
628 	if (u->oob_skb) {
629 		kfree_skb(u->oob_skb);
630 		u->oob_skb = NULL;
631 	}
632 #endif
633 
634 	wake_up_interruptible_all(&u->peer_wait);
635 
636 	if (skpair != NULL) {
637 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
638 			unix_state_lock(skpair);
639 			/* No more writes */
640 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
641 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
642 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
643 			unix_state_unlock(skpair);
644 			skpair->sk_state_change(skpair);
645 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
646 		}
647 
648 		unix_dgram_peer_wake_disconnect(sk, skpair);
649 		sock_put(skpair); /* It may now die */
650 	}
651 
652 	/* Try to flush out this socket. Throw out buffers at least */
653 
654 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
655 		if (state == TCP_LISTEN)
656 			unix_release_sock(skb->sk, 1);
657 		/* passed fds are erased in the kfree_skb hook	      */
658 		UNIXCB(skb).consumed = skb->len;
659 		kfree_skb(skb);
660 	}
661 
662 	if (path.dentry)
663 		path_put(&path);
664 
665 	sock_put(sk);
666 
667 	/* ---- Socket is dead now and most probably destroyed ---- */
668 
669 	/*
670 	 * Fixme: BSD difference: In BSD all sockets connected to us get
671 	 *	  ECONNRESET and we die on the spot. In Linux we behave
672 	 *	  like files and pipes do and wait for the last
673 	 *	  dereference.
674 	 *
675 	 * Can't we simply set sock->err?
676 	 *
677 	 *	  What does the above comment talk about? --ANK(980817)
678 	 */
679 
680 	if (READ_ONCE(unix_tot_inflight))
681 		unix_gc();		/* Garbage collect fds */
682 }
683 
684 static void init_peercred(struct sock *sk)
685 {
686 	const struct cred *old_cred;
687 	struct pid *old_pid;
688 
689 	spin_lock(&sk->sk_peer_lock);
690 	old_pid = sk->sk_peer_pid;
691 	old_cred = sk->sk_peer_cred;
692 	sk->sk_peer_pid  = get_pid(task_tgid(current));
693 	sk->sk_peer_cred = get_current_cred();
694 	spin_unlock(&sk->sk_peer_lock);
695 
696 	put_pid(old_pid);
697 	put_cred(old_cred);
698 }
699 
700 static void copy_peercred(struct sock *sk, struct sock *peersk)
701 {
702 	const struct cred *old_cred;
703 	struct pid *old_pid;
704 
705 	if (sk < peersk) {
706 		spin_lock(&sk->sk_peer_lock);
707 		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
708 	} else {
709 		spin_lock(&peersk->sk_peer_lock);
710 		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
711 	}
712 	old_pid = sk->sk_peer_pid;
713 	old_cred = sk->sk_peer_cred;
714 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
715 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
716 
717 	spin_unlock(&sk->sk_peer_lock);
718 	spin_unlock(&peersk->sk_peer_lock);
719 
720 	put_pid(old_pid);
721 	put_cred(old_cred);
722 }
723 
724 static int unix_listen(struct socket *sock, int backlog)
725 {
726 	int err;
727 	struct sock *sk = sock->sk;
728 	struct unix_sock *u = unix_sk(sk);
729 
730 	err = -EOPNOTSUPP;
731 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
732 		goto out;	/* Only stream/seqpacket sockets accept */
733 	err = -EINVAL;
734 	if (!u->addr)
735 		goto out;	/* No listens on an unbound socket */
736 	unix_state_lock(sk);
737 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
738 		goto out_unlock;
739 	if (backlog > sk->sk_max_ack_backlog)
740 		wake_up_interruptible_all(&u->peer_wait);
741 	sk->sk_max_ack_backlog	= backlog;
742 	sk->sk_state		= TCP_LISTEN;
743 	/* set credentials so connect can copy them */
744 	init_peercred(sk);
745 	err = 0;
746 
747 out_unlock:
748 	unix_state_unlock(sk);
749 out:
750 	return err;
751 }
752 
753 static int unix_release(struct socket *);
754 static int unix_bind(struct socket *, struct sockaddr *, int);
755 static int unix_stream_connect(struct socket *, struct sockaddr *,
756 			       int addr_len, int flags);
757 static int unix_socketpair(struct socket *, struct socket *);
758 static int unix_accept(struct socket *, struct socket *, int, bool);
759 static int unix_getname(struct socket *, struct sockaddr *, int);
760 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
761 static __poll_t unix_dgram_poll(struct file *, struct socket *,
762 				    poll_table *);
763 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
764 #ifdef CONFIG_COMPAT
765 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
766 #endif
767 static int unix_shutdown(struct socket *, int);
768 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
769 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
770 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
771 				       struct pipe_inode_info *, size_t size,
772 				       unsigned int flags);
773 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
774 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
775 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
776 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
777 static int unix_dgram_connect(struct socket *, struct sockaddr *,
778 			      int, int);
779 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
780 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
781 				  int);
782 
783 static int unix_set_peek_off(struct sock *sk, int val)
784 {
785 	struct unix_sock *u = unix_sk(sk);
786 
787 	if (mutex_lock_interruptible(&u->iolock))
788 		return -EINTR;
789 
790 	WRITE_ONCE(sk->sk_peek_off, val);
791 	mutex_unlock(&u->iolock);
792 
793 	return 0;
794 }
795 
796 #ifdef CONFIG_PROC_FS
797 static int unix_count_nr_fds(struct sock *sk)
798 {
799 	struct sk_buff *skb;
800 	struct unix_sock *u;
801 	int nr_fds = 0;
802 
803 	spin_lock(&sk->sk_receive_queue.lock);
804 	skb = skb_peek(&sk->sk_receive_queue);
805 	while (skb) {
806 		u = unix_sk(skb->sk);
807 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
808 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
809 	}
810 	spin_unlock(&sk->sk_receive_queue.lock);
811 
812 	return nr_fds;
813 }
814 
815 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
816 {
817 	struct sock *sk = sock->sk;
818 	unsigned char s_state;
819 	struct unix_sock *u;
820 	int nr_fds = 0;
821 
822 	if (sk) {
823 		s_state = READ_ONCE(sk->sk_state);
824 		u = unix_sk(sk);
825 
826 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
827 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
828 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
829 		 */
830 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
831 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
832 		else if (s_state == TCP_LISTEN)
833 			nr_fds = unix_count_nr_fds(sk);
834 
835 		seq_printf(m, "scm_fds: %u\n", nr_fds);
836 	}
837 }
838 #else
839 #define unix_show_fdinfo NULL
840 #endif
841 
842 static const struct proto_ops unix_stream_ops = {
843 	.family =	PF_UNIX,
844 	.owner =	THIS_MODULE,
845 	.release =	unix_release,
846 	.bind =		unix_bind,
847 	.connect =	unix_stream_connect,
848 	.socketpair =	unix_socketpair,
849 	.accept =	unix_accept,
850 	.getname =	unix_getname,
851 	.poll =		unix_poll,
852 	.ioctl =	unix_ioctl,
853 #ifdef CONFIG_COMPAT
854 	.compat_ioctl =	unix_compat_ioctl,
855 #endif
856 	.listen =	unix_listen,
857 	.shutdown =	unix_shutdown,
858 	.sendmsg =	unix_stream_sendmsg,
859 	.recvmsg =	unix_stream_recvmsg,
860 	.read_skb =	unix_stream_read_skb,
861 	.mmap =		sock_no_mmap,
862 	.splice_read =	unix_stream_splice_read,
863 	.set_peek_off =	unix_set_peek_off,
864 	.show_fdinfo =	unix_show_fdinfo,
865 };
866 
867 static const struct proto_ops unix_dgram_ops = {
868 	.family =	PF_UNIX,
869 	.owner =	THIS_MODULE,
870 	.release =	unix_release,
871 	.bind =		unix_bind,
872 	.connect =	unix_dgram_connect,
873 	.socketpair =	unix_socketpair,
874 	.accept =	sock_no_accept,
875 	.getname =	unix_getname,
876 	.poll =		unix_dgram_poll,
877 	.ioctl =	unix_ioctl,
878 #ifdef CONFIG_COMPAT
879 	.compat_ioctl =	unix_compat_ioctl,
880 #endif
881 	.listen =	sock_no_listen,
882 	.shutdown =	unix_shutdown,
883 	.sendmsg =	unix_dgram_sendmsg,
884 	.read_skb =	unix_read_skb,
885 	.recvmsg =	unix_dgram_recvmsg,
886 	.mmap =		sock_no_mmap,
887 	.set_peek_off =	unix_set_peek_off,
888 	.show_fdinfo =	unix_show_fdinfo,
889 };
890 
891 static const struct proto_ops unix_seqpacket_ops = {
892 	.family =	PF_UNIX,
893 	.owner =	THIS_MODULE,
894 	.release =	unix_release,
895 	.bind =		unix_bind,
896 	.connect =	unix_stream_connect,
897 	.socketpair =	unix_socketpair,
898 	.accept =	unix_accept,
899 	.getname =	unix_getname,
900 	.poll =		unix_dgram_poll,
901 	.ioctl =	unix_ioctl,
902 #ifdef CONFIG_COMPAT
903 	.compat_ioctl =	unix_compat_ioctl,
904 #endif
905 	.listen =	unix_listen,
906 	.shutdown =	unix_shutdown,
907 	.sendmsg =	unix_seqpacket_sendmsg,
908 	.recvmsg =	unix_seqpacket_recvmsg,
909 	.mmap =		sock_no_mmap,
910 	.set_peek_off =	unix_set_peek_off,
911 	.show_fdinfo =	unix_show_fdinfo,
912 };
913 
914 static void unix_close(struct sock *sk, long timeout)
915 {
916 	/* Nothing to do here, unix socket does not need a ->close().
917 	 * This is merely for sockmap.
918 	 */
919 }
920 
921 static void unix_unhash(struct sock *sk)
922 {
923 	/* Nothing to do here, unix socket does not need a ->unhash().
924 	 * This is merely for sockmap.
925 	 */
926 }
927 
928 static bool unix_bpf_bypass_getsockopt(int level, int optname)
929 {
930 	if (level == SOL_SOCKET) {
931 		switch (optname) {
932 		case SO_PEERPIDFD:
933 			return true;
934 		default:
935 			return false;
936 		}
937 	}
938 
939 	return false;
940 }
941 
942 struct proto unix_dgram_proto = {
943 	.name			= "UNIX",
944 	.owner			= THIS_MODULE,
945 	.obj_size		= sizeof(struct unix_sock),
946 	.close			= unix_close,
947 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
948 #ifdef CONFIG_BPF_SYSCALL
949 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
950 #endif
951 };
952 
953 struct proto unix_stream_proto = {
954 	.name			= "UNIX-STREAM",
955 	.owner			= THIS_MODULE,
956 	.obj_size		= sizeof(struct unix_sock),
957 	.close			= unix_close,
958 	.unhash			= unix_unhash,
959 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
960 #ifdef CONFIG_BPF_SYSCALL
961 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
962 #endif
963 };
964 
965 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
966 {
967 	struct unix_sock *u;
968 	struct sock *sk;
969 	int err;
970 
971 	atomic_long_inc(&unix_nr_socks);
972 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
973 		err = -ENFILE;
974 		goto err;
975 	}
976 
977 	if (type == SOCK_STREAM)
978 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
979 	else /* dgram and seqpacket */
980 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
981 
982 	if (!sk) {
983 		err = -ENOMEM;
984 		goto err;
985 	}
986 
987 	sock_init_data(sock, sk);
988 
989 	sk->sk_hash		= unix_unbound_hash(sk);
990 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
991 	sk->sk_write_space	= unix_write_space;
992 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
993 	sk->sk_destruct		= unix_sock_destructor;
994 	u = unix_sk(sk);
995 	u->inflight = 0;
996 	u->path.dentry = NULL;
997 	u->path.mnt = NULL;
998 	spin_lock_init(&u->lock);
999 	INIT_LIST_HEAD(&u->link);
1000 	mutex_init(&u->iolock); /* single task reading lock */
1001 	mutex_init(&u->bindlock); /* single task binding lock */
1002 	init_waitqueue_head(&u->peer_wait);
1003 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1004 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1005 	unix_insert_unbound_socket(net, sk);
1006 
1007 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1008 
1009 	return sk;
1010 
1011 err:
1012 	atomic_long_dec(&unix_nr_socks);
1013 	return ERR_PTR(err);
1014 }
1015 
1016 static int unix_create(struct net *net, struct socket *sock, int protocol,
1017 		       int kern)
1018 {
1019 	struct sock *sk;
1020 
1021 	if (protocol && protocol != PF_UNIX)
1022 		return -EPROTONOSUPPORT;
1023 
1024 	sock->state = SS_UNCONNECTED;
1025 
1026 	switch (sock->type) {
1027 	case SOCK_STREAM:
1028 		sock->ops = &unix_stream_ops;
1029 		break;
1030 		/*
1031 		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW, though
1032 		 *	nothing uses it.
1033 		 */
1034 	case SOCK_RAW:
1035 		sock->type = SOCK_DGRAM;
1036 		fallthrough;
1037 	case SOCK_DGRAM:
1038 		sock->ops = &unix_dgram_ops;
1039 		break;
1040 	case SOCK_SEQPACKET:
1041 		sock->ops = &unix_seqpacket_ops;
1042 		break;
1043 	default:
1044 		return -ESOCKTNOSUPPORT;
1045 	}
1046 
1047 	sk = unix_create1(net, sock, kern, sock->type);
1048 	if (IS_ERR(sk))
1049 		return PTR_ERR(sk);
1050 
1051 	return 0;
1052 }
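
/* Usage note on the SOCK_RAW case above: socket(AF_UNIX, SOCK_RAW, 0)
 * succeeds, and the resulting descriptor behaves exactly like SOCK_DGRAM,
 * since the type is silently rewritten before allocation.
 */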
1053 
1054 static int unix_release(struct socket *sock)
1055 {
1056 	struct sock *sk = sock->sk;
1057 
1058 	if (!sk)
1059 		return 0;
1060 
1061 	sk->sk_prot->close(sk, 0);
1062 	unix_release_sock(sk, 0);
1063 	sock->sk = NULL;
1064 
1065 	return 0;
1066 }
1067 
1068 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1069 				  int type)
1070 {
1071 	struct inode *inode;
1072 	struct path path;
1073 	struct sock *sk;
1074 	int err;
1075 
1076 	unix_mkname_bsd(sunaddr, addr_len);
1077 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1078 	if (err)
1079 		goto fail;
1080 
1081 	err = path_permission(&path, MAY_WRITE);
1082 	if (err)
1083 		goto path_put;
1084 
1085 	err = -ECONNREFUSED;
1086 	inode = d_backing_inode(path.dentry);
1087 	if (!S_ISSOCK(inode->i_mode))
1088 		goto path_put;
1089 
1090 	sk = unix_find_socket_byinode(inode);
1091 	if (!sk)
1092 		goto path_put;
1093 
1094 	err = -EPROTOTYPE;
1095 	if (sk->sk_type == type)
1096 		touch_atime(&path);
1097 	else
1098 		goto sock_put;
1099 
1100 	path_put(&path);
1101 
1102 	return sk;
1103 
1104 sock_put:
1105 	sock_put(sk);
1106 path_put:
1107 	path_put(&path);
1108 fail:
1109 	return ERR_PTR(err);
1110 }
1111 
1112 static struct sock *unix_find_abstract(struct net *net,
1113 				       struct sockaddr_un *sunaddr,
1114 				       int addr_len, int type)
1115 {
1116 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1117 	struct dentry *dentry;
1118 	struct sock *sk;
1119 
1120 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1121 	if (!sk)
1122 		return ERR_PTR(-ECONNREFUSED);
1123 
1124 	dentry = unix_sk(sk)->path.dentry;
1125 	if (dentry)
1126 		touch_atime(&unix_sk(sk)->path);
1127 
1128 	return sk;
1129 }
1130 
1131 static struct sock *unix_find_other(struct net *net,
1132 				    struct sockaddr_un *sunaddr,
1133 				    int addr_len, int type)
1134 {
1135 	struct sock *sk;
1136 
1137 	if (sunaddr->sun_path[0])
1138 		sk = unix_find_bsd(sunaddr, addr_len, type);
1139 	else
1140 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1141 
1142 	return sk;
1143 }
1144 
1145 static int unix_autobind(struct sock *sk)
1146 {
1147 	unsigned int new_hash, old_hash = sk->sk_hash;
1148 	struct unix_sock *u = unix_sk(sk);
1149 	struct net *net = sock_net(sk);
1150 	struct unix_address *addr;
1151 	u32 lastnum, ordernum;
1152 	int err;
1153 
1154 	err = mutex_lock_interruptible(&u->bindlock);
1155 	if (err)
1156 		return err;
1157 
1158 	if (u->addr)
1159 		goto out;
1160 
1161 	err = -ENOMEM;
1162 	addr = kzalloc(sizeof(*addr) +
1163 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1164 	if (!addr)
1165 		goto out;
1166 
1167 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1168 	addr->name->sun_family = AF_UNIX;
1169 	refcount_set(&addr->refcnt, 1);
1170 
1171 	ordernum = get_random_u32();
1172 	lastnum = ordernum & 0xFFFFF;
1173 retry:
1174 	ordernum = (ordernum + 1) & 0xFFFFF;
1175 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1176 
1177 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1178 	unix_table_double_lock(net, old_hash, new_hash);
1179 
1180 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1181 		unix_table_double_unlock(net, old_hash, new_hash);
1182 
1183 		/* __unix_find_socket_byname() may take a long time if many names
1184 		 * are already in use.
1185 		 */
1186 		cond_resched();
1187 
1188 		if (ordernum == lastnum) {
1189 			/* Give up if all names seem to be in use. */
1190 			err = -ENOSPC;
1191 			unix_release_addr(addr);
1192 			goto out;
1193 		}
1194 
1195 		goto retry;
1196 	}
1197 
1198 	__unix_set_addr_hash(net, sk, addr, new_hash);
1199 	unix_table_double_unlock(net, old_hash, new_hash);
1200 	err = 0;
1201 
1202 out:	mutex_unlock(&u->bindlock);
1203 	return err;
1204 }
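
/* A worked example of the generated name: with ordernum == 0xabcde the
 * address becomes the 6-byte abstract name "\0abcde" and addr->len is
 * offsetof(struct sockaddr_un, sun_path) + 6; userspace tools such as
 * ss(8) conventionally display it with a leading '@', e.g. "@abcde".
 */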
1205 
1206 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1207 			 int addr_len)
1208 {
1209 	umode_t mode = S_IFSOCK |
1210 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1211 	unsigned int new_hash, old_hash = sk->sk_hash;
1212 	struct unix_sock *u = unix_sk(sk);
1213 	struct net *net = sock_net(sk);
1214 	struct mnt_idmap *idmap;
1215 	struct unix_address *addr;
1216 	struct dentry *dentry;
1217 	struct path parent;
1218 	int err;
1219 
1220 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1221 	addr = unix_create_addr(sunaddr, addr_len);
1222 	if (!addr)
1223 		return -ENOMEM;
1224 
1225 	/*
1226 	 * Get the parent directory, calculate the hash for last
1227 	 * component.
1228 	 */
1229 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1230 	if (IS_ERR(dentry)) {
1231 		err = PTR_ERR(dentry);
1232 		goto out;
1233 	}
1234 
1235 	/*
1236 	 * All right, let's create it.
1237 	 */
1238 	idmap = mnt_idmap(parent.mnt);
1239 	err = security_path_mknod(&parent, dentry, mode, 0);
1240 	if (!err)
1241 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1242 	if (err)
1243 		goto out_path;
1244 	err = mutex_lock_interruptible(&u->bindlock);
1245 	if (err)
1246 		goto out_unlink;
1247 	if (u->addr)
1248 		goto out_unlock;
1249 
1250 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1251 	unix_table_double_lock(net, old_hash, new_hash);
1252 	u->path.mnt = mntget(parent.mnt);
1253 	u->path.dentry = dget(dentry);
1254 	__unix_set_addr_hash(net, sk, addr, new_hash);
1255 	unix_table_double_unlock(net, old_hash, new_hash);
1256 	unix_insert_bsd_socket(sk);
1257 	mutex_unlock(&u->bindlock);
1258 	done_path_create(&parent, dentry);
1259 	return 0;
1260 
1261 out_unlock:
1262 	mutex_unlock(&u->bindlock);
1263 	err = -EINVAL;
1264 out_unlink:
1265 	/* failed after successful mknod?  unlink what we'd created... */
1266 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1267 out_path:
1268 	done_path_create(&parent, dentry);
1269 out:
1270 	unix_release_addr(addr);
1271 	return err == -EEXIST ? -EADDRINUSE : err;
1272 }
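
/* Usage note for the path-bind flow above: the socket inode is created
 * with mknod semantics, so a second bind() to the same path fails with
 * EADDRINUSE (mapped from EEXIST above) even after the first socket is
 * closed.  Userspace that wants to reuse a stale path typically does,
 * for example:
 *
 *	unlink("/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&sun, len);
 */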
1273 
1274 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1275 			      int addr_len)
1276 {
1277 	unsigned int new_hash, old_hash = sk->sk_hash;
1278 	struct unix_sock *u = unix_sk(sk);
1279 	struct net *net = sock_net(sk);
1280 	struct unix_address *addr;
1281 	int err;
1282 
1283 	addr = unix_create_addr(sunaddr, addr_len);
1284 	if (!addr)
1285 		return -ENOMEM;
1286 
1287 	err = mutex_lock_interruptible(&u->bindlock);
1288 	if (err)
1289 		goto out;
1290 
1291 	if (u->addr) {
1292 		err = -EINVAL;
1293 		goto out_mutex;
1294 	}
1295 
1296 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1297 	unix_table_double_lock(net, old_hash, new_hash);
1298 
1299 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1300 		goto out_spin;
1301 
1302 	__unix_set_addr_hash(net, sk, addr, new_hash);
1303 	unix_table_double_unlock(net, old_hash, new_hash);
1304 	mutex_unlock(&u->bindlock);
1305 	return 0;
1306 
1307 out_spin:
1308 	unix_table_double_unlock(net, old_hash, new_hash);
1309 	err = -EADDRINUSE;
1310 out_mutex:
1311 	mutex_unlock(&u->bindlock);
1312 out:
1313 	unix_release_addr(addr);
1314 	return err;
1315 }
1316 
1317 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1318 {
1319 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1320 	struct sock *sk = sock->sk;
1321 	int err;
1322 
1323 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1324 	    sunaddr->sun_family == AF_UNIX)
1325 		return unix_autobind(sk);
1326 
1327 	err = unix_validate_addr(sunaddr, addr_len);
1328 	if (err)
1329 		return err;
1330 
1331 	if (sunaddr->sun_path[0])
1332 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1333 	else
1334 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1335 
1336 	return err;
1337 }
1338 
1339 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1340 {
1341 	if (unlikely(sk1 == sk2) || !sk2) {
1342 		unix_state_lock(sk1);
1343 		return;
1344 	}
1345 	if (sk1 < sk2) {
1346 		unix_state_lock(sk1);
1347 		unix_state_lock_nested(sk2);
1348 	} else {
1349 		unix_state_lock(sk2);
1350 		unix_state_lock_nested(sk1);
1351 	}
1352 }
1353 
1354 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1355 {
1356 	if (unlikely(sk1 == sk2) || !sk2) {
1357 		unix_state_unlock(sk1);
1358 		return;
1359 	}
1360 	unix_state_unlock(sk1);
1361 	unix_state_unlock(sk2);
1362 }
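
/* Note on the pattern above (also used by copy_peercred()): the two state
 * locks are always taken in a fixed order, here ascending by socket
 * pointer, which prevents an AB/BA deadlock when two sockets operate on
 * each other concurrently; the _nested annotation only informs lockdep.
 */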
1363 
1364 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1365 			      int alen, int flags)
1366 {
1367 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1368 	struct sock *sk = sock->sk;
1369 	struct sock *other;
1370 	int err;
1371 
1372 	err = -EINVAL;
1373 	if (alen < offsetofend(struct sockaddr, sa_family))
1374 		goto out;
1375 
1376 	if (addr->sa_family != AF_UNSPEC) {
1377 		err = unix_validate_addr(sunaddr, alen);
1378 		if (err)
1379 			goto out;
1380 
1381 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1382 		if (err)
1383 			goto out;
1384 
1385 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1386 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1387 		    !unix_sk(sk)->addr) {
1388 			err = unix_autobind(sk);
1389 			if (err)
1390 				goto out;
1391 		}
1392 
1393 restart:
1394 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1395 		if (IS_ERR(other)) {
1396 			err = PTR_ERR(other);
1397 			goto out;
1398 		}
1399 
1400 		unix_state_double_lock(sk, other);
1401 
1402 		/* Apparently VFS overslept socket death. Retry. */
1403 		if (sock_flag(other, SOCK_DEAD)) {
1404 			unix_state_double_unlock(sk, other);
1405 			sock_put(other);
1406 			goto restart;
1407 		}
1408 
1409 		err = -EPERM;
1410 		if (!unix_may_send(sk, other))
1411 			goto out_unlock;
1412 
1413 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1414 		if (err)
1415 			goto out_unlock;
1416 
1417 		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1418 	} else {
1419 		/*
1420 		 *	1003.1g breaking connected state with AF_UNSPEC
1421 		 */
1422 		other = NULL;
1423 		unix_state_double_lock(sk, other);
1424 	}
1425 
1426 	/*
1427 	 * If it was connected, reconnect.
1428 	 */
1429 	if (unix_peer(sk)) {
1430 		struct sock *old_peer = unix_peer(sk);
1431 
1432 		unix_peer(sk) = other;
1433 		if (!other)
1434 			sk->sk_state = TCP_CLOSE;
1435 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1436 
1437 		unix_state_double_unlock(sk, other);
1438 
1439 		if (other != old_peer)
1440 			unix_dgram_disconnected(sk, old_peer);
1441 		sock_put(old_peer);
1442 	} else {
1443 		unix_peer(sk) = other;
1444 		unix_state_double_unlock(sk, other);
1445 	}
1446 
1447 	return 0;
1448 
1449 out_unlock:
1450 	unix_state_double_unlock(sk, other);
1451 	sock_put(other);
1452 out:
1453 	return err;
1454 }
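
/* Userspace sketch of the AF_UNSPEC branch above: a connected datagram
 * socket can be disconnected (per 1003.1g) with
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *	connect(fd, &sa, sizeof(sa));
 *
 * after which unix_peer(sk) is cleared and sk_state drops back to TCP_CLOSE.
 */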
1455 
1456 static long unix_wait_for_peer(struct sock *other, long timeo)
1457 	__releases(&unix_sk(other)->lock)
1458 {
1459 	struct unix_sock *u = unix_sk(other);
1460 	int sched;
1461 	DEFINE_WAIT(wait);
1462 
1463 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1464 
1465 	sched = !sock_flag(other, SOCK_DEAD) &&
1466 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1467 		unix_recvq_full_lockless(other);
1468 
1469 	unix_state_unlock(other);
1470 
1471 	if (sched)
1472 		timeo = schedule_timeout(timeo);
1473 
1474 	finish_wait(&u->peer_wait, &wait);
1475 	return timeo;
1476 }
1477 
1478 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1479 			       int addr_len, int flags)
1480 {
1481 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1482 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1483 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1484 	struct net *net = sock_net(sk);
1485 	struct sk_buff *skb = NULL;
1486 	long timeo;
1487 	int err;
1488 	int st;
1489 
1490 	err = unix_validate_addr(sunaddr, addr_len);
1491 	if (err)
1492 		goto out;
1493 
1494 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1495 	if (err)
1496 		goto out;
1497 
1498 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1499 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1500 		err = unix_autobind(sk);
1501 		if (err)
1502 			goto out;
1503 	}
1504 
1505 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1506 
1507 	/* First of all, allocate resources.
1508 	   If we do it after the state is locked,
1509 	   we will have to recheck everything again in any case.
1510 	 */
1511 
1512 	/* create new sock for complete connection */
1513 	newsk = unix_create1(net, NULL, 0, sock->type);
1514 	if (IS_ERR(newsk)) {
1515 		err = PTR_ERR(newsk);
1516 		newsk = NULL;
1517 		goto out;
1518 	}
1519 
1520 	err = -ENOMEM;
1521 
1522 	/* Allocate skb for sending to listening sock */
1523 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1524 	if (skb == NULL)
1525 		goto out;
1526 
1527 restart:
1528 	/*  Find listening sock. */
1529 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1530 	if (IS_ERR(other)) {
1531 		err = PTR_ERR(other);
1532 		other = NULL;
1533 		goto out;
1534 	}
1535 
1536 	/* Latch state of peer */
1537 	unix_state_lock(other);
1538 
1539 	/* Apparently VFS overslept socket death. Retry. */
1540 	if (sock_flag(other, SOCK_DEAD)) {
1541 		unix_state_unlock(other);
1542 		sock_put(other);
1543 		goto restart;
1544 	}
1545 
1546 	err = -ECONNREFUSED;
1547 	if (other->sk_state != TCP_LISTEN)
1548 		goto out_unlock;
1549 	if (other->sk_shutdown & RCV_SHUTDOWN)
1550 		goto out_unlock;
1551 
1552 	if (unix_recvq_full(other)) {
1553 		err = -EAGAIN;
1554 		if (!timeo)
1555 			goto out_unlock;
1556 
1557 		timeo = unix_wait_for_peer(other, timeo);
1558 
1559 		err = sock_intr_errno(timeo);
1560 		if (signal_pending(current))
1561 			goto out;
1562 		sock_put(other);
1563 		goto restart;
1564 	}
1565 
1566 	/* Latch our state.
1567 
1568 	   It is a tricky place. We need to grab our state lock and cannot
1569 	   drop the lock on the peer. It is dangerous because a deadlock is
1570 	   possible. The connect-to-self case and simultaneous
1571 	   attempts to connect are eliminated by checking the socket
1572 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
1573 	   check this before attempting to grab the lock.
1574 
1575 	   Well, and we have to recheck the state after the socket is locked.
1576 	 */
1577 	st = sk->sk_state;
1578 
1579 	switch (st) {
1580 	case TCP_CLOSE:
1581 		/* This is ok... continue with connect */
1582 		break;
1583 	case TCP_ESTABLISHED:
1584 		/* Socket is already connected */
1585 		err = -EISCONN;
1586 		goto out_unlock;
1587 	default:
1588 		err = -EINVAL;
1589 		goto out_unlock;
1590 	}
1591 
1592 	unix_state_lock_nested(sk);
1593 
1594 	if (sk->sk_state != st) {
1595 		unix_state_unlock(sk);
1596 		unix_state_unlock(other);
1597 		sock_put(other);
1598 		goto restart;
1599 	}
1600 
1601 	err = security_unix_stream_connect(sk, other, newsk);
1602 	if (err) {
1603 		unix_state_unlock(sk);
1604 		goto out_unlock;
1605 	}
1606 
1607 	/* The way is open! Quickly set all the necessary fields... */
1608 
1609 	sock_hold(sk);
1610 	unix_peer(newsk)	= sk;
1611 	newsk->sk_state		= TCP_ESTABLISHED;
1612 	newsk->sk_type		= sk->sk_type;
1613 	init_peercred(newsk);
1614 	newu = unix_sk(newsk);
1615 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1616 	otheru = unix_sk(other);
1617 
1618 	/* copy address information from listening to new sock
1619 	 *
1620 	 * The contents of *(otheru->addr) and otheru->path
1621 	 * are seen fully set up here, since we have found
1622 	 * otheru in hash under its lock.  Insertion into the
1623 	 * hash chain we'd found it in had been done in an
1624 	 * earlier critical area protected by the chain's lock,
1625 	 * the same one where we'd set *(otheru->addr) contents,
1626 	 * as well as otheru->path and otheru->addr itself.
1627 	 *
1628 	 * Using smp_store_release() here to set newu->addr
1629 	 * is enough to make those stores, as well as stores
1630 	 * to newu->path visible to anyone who gets newu->addr
1631 	 * by smp_load_acquire().  IOW, the same guarantees
1632 	 * as for unix_sock instances bound in unix_bind() or
1633 	 * in unix_autobind().
1634 	 */
1635 	if (otheru->path.dentry) {
1636 		path_get(&otheru->path);
1637 		newu->path = otheru->path;
1638 	}
1639 	refcount_inc(&otheru->addr->refcnt);
1640 	smp_store_release(&newu->addr, otheru->addr);
1641 
1642 	/* Set credentials */
1643 	copy_peercred(sk, other);
1644 
1645 	sock->state	= SS_CONNECTED;
1646 	sk->sk_state	= TCP_ESTABLISHED;
1647 	sock_hold(newsk);
1648 
1649 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1650 	unix_peer(sk)	= newsk;
1651 
1652 	unix_state_unlock(sk);
1653 
1654 	/* take ten and send info to listening sock */
1655 	spin_lock(&other->sk_receive_queue.lock);
1656 	__skb_queue_tail(&other->sk_receive_queue, skb);
1657 	spin_unlock(&other->sk_receive_queue.lock);
1658 	unix_state_unlock(other);
1659 	other->sk_data_ready(other);
1660 	sock_put(other);
1661 	return 0;
1662 
1663 out_unlock:
1664 	if (other)
1665 		unix_state_unlock(other);
1666 
1667 out:
1668 	kfree_skb(skb);
1669 	if (newsk)
1670 		unix_release_sock(newsk, 0);
1671 	if (other)
1672 		sock_put(other);
1673 	return err;
1674 }
1675 
1676 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1677 {
1678 	struct sock *ska = socka->sk, *skb = sockb->sk;
1679 
1680 	/* Join our sockets back to back */
1681 	sock_hold(ska);
1682 	sock_hold(skb);
1683 	unix_peer(ska) = skb;
1684 	unix_peer(skb) = ska;
1685 	init_peercred(ska);
1686 	init_peercred(skb);
1687 
1688 	ska->sk_state = TCP_ESTABLISHED;
1689 	skb->sk_state = TCP_ESTABLISHED;
1690 	socka->state  = SS_CONNECTED;
1691 	sockb->state  = SS_CONNECTED;
1692 	return 0;
1693 }
1694 
1695 static void unix_sock_inherit_flags(const struct socket *old,
1696 				    struct socket *new)
1697 {
1698 	if (test_bit(SOCK_PASSCRED, &old->flags))
1699 		set_bit(SOCK_PASSCRED, &new->flags);
1700 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1701 		set_bit(SOCK_PASSPIDFD, &new->flags);
1702 	if (test_bit(SOCK_PASSSEC, &old->flags))
1703 		set_bit(SOCK_PASSSEC, &new->flags);
1704 }
1705 
1706 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1707 		       bool kern)
1708 {
1709 	struct sock *sk = sock->sk;
1710 	struct sock *tsk;
1711 	struct sk_buff *skb;
1712 	int err;
1713 
1714 	err = -EOPNOTSUPP;
1715 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1716 		goto out;
1717 
1718 	err = -EINVAL;
1719 	if (sk->sk_state != TCP_LISTEN)
1720 		goto out;
1721 
1722 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1723 	 * so no locks are necessary.
1724 	 */
1725 
1726 	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1727 				&err);
1728 	if (!skb) {
1729 		/* This means receive shutdown. */
1730 		if (err == 0)
1731 			err = -EINVAL;
1732 		goto out;
1733 	}
1734 
1735 	tsk = skb->sk;
1736 	skb_free_datagram(sk, skb);
1737 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1738 
1739 	/* attach accepted sock to socket */
1740 	unix_state_lock(tsk);
1741 	newsock->state = SS_CONNECTED;
1742 	unix_sock_inherit_flags(sock, newsock);
1743 	sock_graft(tsk, newsock);
1744 	unix_state_unlock(tsk);
1745 	return 0;
1746 
1747 out:
1748 	return err;
1749 }
1750 
1751 
1752 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1753 {
1754 	struct sock *sk = sock->sk;
1755 	struct unix_address *addr;
1756 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1757 	int err = 0;
1758 
1759 	if (peer) {
1760 		sk = unix_peer_get(sk);
1761 
1762 		err = -ENOTCONN;
1763 		if (!sk)
1764 			goto out;
1765 		err = 0;
1766 	} else {
1767 		sock_hold(sk);
1768 	}
1769 
1770 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1771 	if (!addr) {
1772 		sunaddr->sun_family = AF_UNIX;
1773 		sunaddr->sun_path[0] = 0;
1774 		err = offsetof(struct sockaddr_un, sun_path);
1775 	} else {
1776 		err = addr->len;
1777 		memcpy(sunaddr, addr->name, addr->len);
1778 
1779 		if (peer)
1780 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1781 					       CGROUP_UNIX_GETPEERNAME);
1782 		else
1783 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1784 					       CGROUP_UNIX_GETSOCKNAME);
1785 	}
1786 	sock_put(sk);
1787 out:
1788 	return err;
1789 }
1790 
1791 /* The "user->unix_inflight" variable is protected by the garbage
1792  * collection lock, and we just read it locklessly here. If you go
1793  * over the limit, there might be a tiny race in actually noticing
1794  * it across threads. Tough.
1795  */
1796 static inline bool too_many_unix_fds(struct task_struct *p)
1797 {
1798 	struct user_struct *user = current_user();
1799 
1800 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1801 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1802 	return false;
1803 }
1804 
1805 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1806 {
1807 	int i;
1808 
1809 	if (too_many_unix_fds(current))
1810 		return -ETOOMANYREFS;
1811 
1812 	/* Need to duplicate file references for the sake of garbage
1813 	 * collection.  Otherwise a socket in the fps might become a
1814 	 * candidate for GC while the skb is not yet queued.
1815 	 */
1816 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1817 	if (!UNIXCB(skb).fp)
1818 		return -ENOMEM;
1819 
1820 	for (i = scm->fp->count - 1; i >= 0; i--)
1821 		unix_inflight(scm->fp->user, scm->fp->fp[i]);
1822 
1823 	return 0;
1824 }
1825 
1826 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1827 {
1828 	int i;
1829 
1830 	scm->fp = UNIXCB(skb).fp;
1831 	UNIXCB(skb).fp = NULL;
1832 
1833 	for (i = scm->fp->count - 1; i >= 0; i--)
1834 		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
1835 }
1836 
1837 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1838 {
1839 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1840 
1841 	/*
1842 	 * Garbage collection of unix sockets starts by selecting a set of
1843 	 * candidate sockets which have reference only from being in flight
1844 	 * (total_refs == inflight_refs).  This condition is checked once during
1845 	 * the candidate collection phase, and candidates are marked as such, so
1846 	 * that non-candidates can later be ignored.  While inflight_refs is
1847 	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1848 	 * is an instantaneous decision.
1849 	 *
1850 	 * Once a candidate, however, the socket must not be reinstalled into a
1851 	 * file descriptor while the garbage collection is in progress.
1852 	 *
1853 	 * If the above conditions are met, then the directed graph of
1854 	 * candidates (*) does not change while unix_gc_lock is held.
1855 	 *
1856 	 * Any operation that changes the file count through file descriptors
1857 	 * (dup, close, sendmsg) does not change the graph since candidates are
1858 	 * not installed in fds.
1859 	 *
1860 	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1861 	 * that takes unix_gc_lock to decrement the inflight count, so it's
1862 	 * serialized with garbage collection.
1863 	 *
1864 	 * MSG_PEEK is special in that it does not change the inflight count,
1865 	 * yet does install the socket into an fd.  The following lock/unlock
1866 	 * pair is to ensure serialization with garbage collection.  It must be
1867 	 * done between incrementing the file count and installing the file into
1868 	 * an fd.
1869 	 *
1870 	 * If garbage collection starts after the barrier provided by the
1871 	 * lock/unlock, then it will see the elevated refcount and not mark this
1872 	 * as a candidate.  If a garbage collection is already in progress
1873 	 * before the file count was incremented, then the lock/unlock pair will
1874 	 * ensure that garbage collection is finished before progressing to
1875 	 * installing the fd.
1876 	 *
1877 	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1878 	 * which is on the queue of listening socket A.
1879 	 */
1880 	spin_lock(&unix_gc_lock);
1881 	spin_unlock(&unix_gc_lock);
1882 }
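
/*
 * Illustrative only (not kernel code): a minimal userspace sketch of the
 * MSG_PEEK + SCM_RIGHTS case that the lock/unlock pair above serializes
 * against garbage collection.  The peeked descriptor is installed into the
 * receiver's fd table while the skb (and its in-flight reference) stays on
 * the queue.  Assumes a connected AF_UNIX socket "fd" with an SCM_RIGHTS
 * message already queued, plus <sys/socket.h> and <string.h>.
 *
 *	char data[1], ctl[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctl, .msg_controllen = sizeof(ctl),
 *	};
 *	struct cmsghdr *cmsg;
 *	int passed_fd = -1;
 *
 *	if (recvmsg(fd, &msg, MSG_PEEK) >= 0) {
 *		cmsg = CMSG_FIRSTHDR(&msg);
 *		if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_RIGHTS)
 *			memcpy(&passed_fd, CMSG_DATA(cmsg), sizeof(int));
 *	}
 *	// passed_fd now refers to the peeked socket, while the original
 *	// message is still queued and may be peeked or received again.
 */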
1883 
1884 static void unix_destruct_scm(struct sk_buff *skb)
1885 {
1886 	struct scm_cookie scm;
1887 
1888 	memset(&scm, 0, sizeof(scm));
1889 	scm.pid  = UNIXCB(skb).pid;
1890 	if (UNIXCB(skb).fp)
1891 		unix_detach_fds(&scm, skb);
1892 
1893 	/* Alas, it calls VFS */
1894 	/* So fscking what? fput() has been SMP-safe since last summer */
1895 	scm_destroy(&scm);
1896 	sock_wfree(skb);
1897 }
1898 
1899 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1900 {
1901 	int err = 0;
1902 
1903 	UNIXCB(skb).pid  = get_pid(scm->pid);
1904 	UNIXCB(skb).uid = scm->creds.uid;
1905 	UNIXCB(skb).gid = scm->creds.gid;
1906 	UNIXCB(skb).fp = NULL;
1907 	unix_get_secdata(scm, skb);
1908 	if (scm->fp && send_fds)
1909 		err = unix_attach_fds(scm, skb);
1910 
1911 	skb->destructor = unix_destruct_scm;
1912 	return err;
1913 }
1914 
1915 static bool unix_passcred_enabled(const struct socket *sock,
1916 				  const struct sock *other)
1917 {
1918 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1919 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1920 	       !other->sk_socket ||
1921 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1922 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1923 }
1924 
1925 /*
1926  * Some apps rely on write() giving SCM_CREDENTIALS.
1927  * We include credentials if the source or destination socket
1928  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1929  */
1930 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1931 			    const struct sock *other)
1932 {
1933 	if (UNIXCB(skb).pid)
1934 		return;
1935 	if (unix_passcred_enabled(sock, other)) {
1936 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1937 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1938 	}
1939 }
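
/*
 * Illustrative only (not kernel code): a hedged userspace sketch of the
 * behaviour maybe_add_creds() provides.  If either end enabled SO_PASSCRED,
 * a plain write() on the sender still results in SCM_CREDENTIALS being
 * delivered to the receiver via recvmsg().  Assumes connected AF_UNIX
 * sockets "snd" and "rcv", and <sys/socket.h> with _GNU_SOURCE for
 * struct ucred.
 *
 *	int one = 1;
 *	setsockopt(rcv, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	write(snd, "x", 1);			// no explicit cmsg sent
 *
 *	char buf[1], ctl[CMSG_SPACE(sizeof(struct ucred))];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctl, .msg_controllen = sizeof(ctl),
 *	};
 *	recvmsg(rcv, &msg, 0);
 *	// CMSG_FIRSTHDR(&msg) is an SCM_CREDENTIALS cmsg carrying the
 *	// sender's pid/uid/gid, filled in by maybe_add_creds().
 */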
1940 
1941 static bool unix_skb_scm_eq(struct sk_buff *skb,
1942 			    struct scm_cookie *scm)
1943 {
1944 	return UNIXCB(skb).pid == scm->pid &&
1945 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1946 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1947 	       unix_secdata_eq(scm, skb);
1948 }
1949 
1950 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1951 {
1952 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1953 	struct unix_sock *u = unix_sk(sk);
1954 
1955 	if (unlikely(fp && fp->count))
1956 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1957 }
1958 
1959 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1960 {
1961 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1962 	struct unix_sock *u = unix_sk(sk);
1963 
1964 	if (unlikely(fp && fp->count))
1965 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1966 }
1967 
1968 /*
1969  *	Send AF_UNIX data.
1970  */
1971 
1972 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1973 			      size_t len)
1974 {
1975 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1976 	struct sock *sk = sock->sk, *other = NULL;
1977 	struct unix_sock *u = unix_sk(sk);
1978 	struct scm_cookie scm;
1979 	struct sk_buff *skb;
1980 	int data_len = 0;
1981 	int sk_locked;
1982 	long timeo;
1983 	int err;
1984 
1985 	err = scm_send(sock, msg, &scm, false);
1986 	if (err < 0)
1987 		return err;
1988 
1989 	wait_for_unix_gc(scm.fp);
1990 
1991 	err = -EOPNOTSUPP;
1992 	if (msg->msg_flags&MSG_OOB)
1993 		goto out;
1994 
1995 	if (msg->msg_namelen) {
1996 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1997 		if (err)
1998 			goto out;
1999 
2000 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
2001 							    msg->msg_name,
2002 							    &msg->msg_namelen,
2003 							    NULL);
2004 		if (err)
2005 			goto out;
2006 	} else {
2007 		sunaddr = NULL;
2008 		err = -ENOTCONN;
2009 		other = unix_peer_get(sk);
2010 		if (!other)
2011 			goto out;
2012 	}
2013 
2014 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2015 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
2016 		err = unix_autobind(sk);
2017 		if (err)
2018 			goto out;
2019 	}
2020 
2021 	err = -EMSGSIZE;
2022 	if (len > sk->sk_sndbuf - 32)
2023 		goto out;
2024 
2025 	if (len > SKB_MAX_ALLOC) {
2026 		data_len = min_t(size_t,
2027 				 len - SKB_MAX_ALLOC,
2028 				 MAX_SKB_FRAGS * PAGE_SIZE);
2029 		data_len = PAGE_ALIGN(data_len);
2030 
2031 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2032 	}
2033 
2034 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2035 				   msg->msg_flags & MSG_DONTWAIT, &err,
2036 				   PAGE_ALLOC_COSTLY_ORDER);
2037 	if (skb == NULL)
2038 		goto out;
2039 
2040 	err = unix_scm_to_skb(&scm, skb, true);
2041 	if (err < 0)
2042 		goto out_free;
2043 
2044 	skb_put(skb, len - data_len);
2045 	skb->data_len = data_len;
2046 	skb->len = len;
2047 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2048 	if (err)
2049 		goto out_free;
2050 
2051 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2052 
2053 restart:
2054 	if (!other) {
2055 		err = -ECONNRESET;
2056 		if (sunaddr == NULL)
2057 			goto out_free;
2058 
2059 		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2060 					sk->sk_type);
2061 		if (IS_ERR(other)) {
2062 			err = PTR_ERR(other);
2063 			other = NULL;
2064 			goto out_free;
2065 		}
2066 	}
2067 
2068 	if (sk_filter(other, skb) < 0) {
2069 		/* Toss the packet but do not return any error to the sender */
2070 		err = len;
2071 		goto out_free;
2072 	}
2073 
2074 	sk_locked = 0;
2075 	unix_state_lock(other);
2076 restart_locked:
2077 	err = -EPERM;
2078 	if (!unix_may_send(sk, other))
2079 		goto out_unlock;
2080 
2081 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2082 		/*
2083 		 *	Check with 1003.1g - what error should a
2084 		 *	datagram send to a dead peer return?
2085 		 */
2086 		unix_state_unlock(other);
2087 		sock_put(other);
2088 
2089 		if (!sk_locked)
2090 			unix_state_lock(sk);
2091 
2092 		err = 0;
2093 		if (sk->sk_type == SOCK_SEQPACKET) {
2094 			/* We are here only when racing with unix_release_sock()
2095 			 * as it clears @other. Unlike SOCK_DGRAM, never change
2096 			 * the state to TCP_CLOSE here.
2097 			 */
2098 			unix_state_unlock(sk);
2099 			err = -EPIPE;
2100 		} else if (unix_peer(sk) == other) {
2101 			unix_peer(sk) = NULL;
2102 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2103 
2104 			sk->sk_state = TCP_CLOSE;
2105 			unix_state_unlock(sk);
2106 
2107 			unix_dgram_disconnected(sk, other);
2108 			sock_put(other);
2109 			err = -ECONNREFUSED;
2110 		} else {
2111 			unix_state_unlock(sk);
2112 		}
2113 
2114 		other = NULL;
2115 		if (err)
2116 			goto out_free;
2117 		goto restart;
2118 	}
2119 
2120 	err = -EPIPE;
2121 	if (other->sk_shutdown & RCV_SHUTDOWN)
2122 		goto out_unlock;
2123 
2124 	if (sk->sk_type != SOCK_SEQPACKET) {
2125 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2126 		if (err)
2127 			goto out_unlock;
2128 	}
2129 
2130 	/* other == sk && unix_peer(other) != sk if
2131 	 * - unix_peer(sk) == NULL, destination address bound to sk
2132 	 * - unix_peer(sk) == sk at the time of the get, but disconnected before the lock
2133 	 */
2134 	if (other != sk &&
2135 	    unlikely(unix_peer(other) != sk &&
2136 	    unix_recvq_full_lockless(other))) {
2137 		if (timeo) {
2138 			timeo = unix_wait_for_peer(other, timeo);
2139 
2140 			err = sock_intr_errno(timeo);
2141 			if (signal_pending(current))
2142 				goto out_free;
2143 
2144 			goto restart;
2145 		}
2146 
2147 		if (!sk_locked) {
2148 			unix_state_unlock(other);
2149 			unix_state_double_lock(sk, other);
2150 		}
2151 
2152 		if (unix_peer(sk) != other ||
2153 		    unix_dgram_peer_wake_me(sk, other)) {
2154 			err = -EAGAIN;
2155 			sk_locked = 1;
2156 			goto out_unlock;
2157 		}
2158 
2159 		if (!sk_locked) {
2160 			sk_locked = 1;
2161 			goto restart_locked;
2162 		}
2163 	}
2164 
2165 	if (unlikely(sk_locked))
2166 		unix_state_unlock(sk);
2167 
2168 	if (sock_flag(other, SOCK_RCVTSTAMP))
2169 		__net_timestamp(skb);
2170 	maybe_add_creds(skb, sock, other);
2171 	scm_stat_add(other, skb);
2172 	skb_queue_tail(&other->sk_receive_queue, skb);
2173 	unix_state_unlock(other);
2174 	other->sk_data_ready(other);
2175 	sock_put(other);
2176 	scm_destroy(&scm);
2177 	return len;
2178 
2179 out_unlock:
2180 	if (sk_locked)
2181 		unix_state_unlock(sk);
2182 	unix_state_unlock(other);
2183 out_free:
2184 	kfree_skb(skb);
2185 out:
2186 	if (other)
2187 		sock_put(other);
2188 	scm_destroy(&scm);
2189 	return err;
2190 }
2191 
2192 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2193  * bytes, with a minimum of a full page.
2194  */
2195 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
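
/*
 * Worked example (assuming 4 KiB pages): get_order(32768) == 3, so
 * UNIX_SKB_FRAGS_SZ == 4096 << 3 == 32768 bytes.  With 64 KiB pages the
 * expression degenerates to a single page (get_order(32768) == 0), which
 * is the "minimum of a full page" mentioned above.
 */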
2196 
2197 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2198 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2199 		     struct scm_cookie *scm, bool fds_sent)
2200 {
2201 	struct unix_sock *ousk = unix_sk(other);
2202 	struct sk_buff *skb;
2203 	int err = 0;
2204 
2205 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2206 
2207 	if (!skb)
2208 		return err;
2209 
2210 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2211 	if (err < 0) {
2212 		kfree_skb(skb);
2213 		return err;
2214 	}
2215 	skb_put(skb, 1);
2216 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2217 
2218 	if (err) {
2219 		kfree_skb(skb);
2220 		return err;
2221 	}
2222 
2223 	unix_state_lock(other);
2224 
2225 	if (sock_flag(other, SOCK_DEAD) ||
2226 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2227 		unix_state_unlock(other);
2228 		kfree_skb(skb);
2229 		return -EPIPE;
2230 	}
2231 
2232 	maybe_add_creds(skb, sock, other);
2233 	skb_get(skb);
2234 
2235 	if (ousk->oob_skb)
2236 		consume_skb(ousk->oob_skb);
2237 
2238 	WRITE_ONCE(ousk->oob_skb, skb);
2239 
2240 	scm_stat_add(other, skb);
2241 	skb_queue_tail(&other->sk_receive_queue, skb);
2242 	sk_send_sigurg(other);
2243 	unix_state_unlock(other);
2244 	other->sk_data_ready(other);
2245 
2246 	return err;
2247 }
2248 #endif
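
/*
 * Illustrative only (not kernel code): a minimal userspace sketch of the
 * AF_UNIX out-of-band byte handled by queue_oob() above (SOCK_STREAM with
 * CONFIG_AF_UNIX_OOB).  Assumes a connected stream pair "snd"/"rcv" and
 * <sys/socket.h>.
 *
 *	send(snd, "ab", 2, 0);
 *	send(snd, "c", 1, MSG_OOB);		// the last byte becomes OOB
 *
 *	char buf[2], oob;
 *	recv(rcv, buf, sizeof(buf), 0);		// returns "ab", stops at the mark
 *	recv(rcv, &oob, 1, MSG_OOB);		// returns the OOB byte 'c'
 */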
2249 
2250 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2251 			       size_t len)
2252 {
2253 	struct sock *sk = sock->sk;
2254 	struct sock *other = NULL;
2255 	int err, size;
2256 	struct sk_buff *skb;
2257 	int sent = 0;
2258 	struct scm_cookie scm;
2259 	bool fds_sent = false;
2260 	int data_len;
2261 
2262 	err = scm_send(sock, msg, &scm, false);
2263 	if (err < 0)
2264 		return err;
2265 
2266 	wait_for_unix_gc(scm.fp);
2267 
2268 	err = -EOPNOTSUPP;
2269 	if (msg->msg_flags & MSG_OOB) {
2270 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2271 		if (len)
2272 			len--;
2273 		else
2274 #endif
2275 			goto out_err;
2276 	}
2277 
2278 	if (msg->msg_namelen) {
2279 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2280 		goto out_err;
2281 	} else {
2282 		err = -ENOTCONN;
2283 		other = unix_peer(sk);
2284 		if (!other)
2285 			goto out_err;
2286 	}
2287 
2288 	if (sk->sk_shutdown & SEND_SHUTDOWN)
2289 		goto pipe_err;
2290 
2291 	while (sent < len) {
2292 		size = len - sent;
2293 
2294 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2295 			skb = sock_alloc_send_pskb(sk, 0, 0,
2296 						   msg->msg_flags & MSG_DONTWAIT,
2297 						   &err, 0);
2298 		} else {
2299 			/* Keep two messages in the pipe so it schedules better */
2300 			size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2301 
2302 			/* allow fallback to order-0 allocations */
2303 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2304 
2305 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2306 
2307 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2308 
2309 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2310 						   msg->msg_flags & MSG_DONTWAIT, &err,
2311 						   get_order(UNIX_SKB_FRAGS_SZ));
2312 		}
2313 		if (!skb)
2314 			goto out_err;
2315 
2316 		/* Only send the fds in the first buffer */
2317 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2318 		if (err < 0) {
2319 			kfree_skb(skb);
2320 			goto out_err;
2321 		}
2322 		fds_sent = true;
2323 
2324 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2325 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2326 						   sk->sk_allocation);
2327 			if (err < 0) {
2328 				kfree_skb(skb);
2329 				goto out_err;
2330 			}
2331 			size = err;
2332 			refcount_add(size, &sk->sk_wmem_alloc);
2333 		} else {
2334 			skb_put(skb, size - data_len);
2335 			skb->data_len = data_len;
2336 			skb->len = size;
2337 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2338 			if (err) {
2339 				kfree_skb(skb);
2340 				goto out_err;
2341 			}
2342 		}
2343 
2344 		unix_state_lock(other);
2345 
2346 		if (sock_flag(other, SOCK_DEAD) ||
2347 		    (other->sk_shutdown & RCV_SHUTDOWN))
2348 			goto pipe_err_free;
2349 
2350 		maybe_add_creds(skb, sock, other);
2351 		scm_stat_add(other, skb);
2352 		skb_queue_tail(&other->sk_receive_queue, skb);
2353 		unix_state_unlock(other);
2354 		other->sk_data_ready(other);
2355 		sent += size;
2356 	}
2357 
2358 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2359 	if (msg->msg_flags & MSG_OOB) {
2360 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2361 		if (err)
2362 			goto out_err;
2363 		sent++;
2364 	}
2365 #endif
2366 
2367 	scm_destroy(&scm);
2368 
2369 	return sent;
2370 
2371 pipe_err_free:
2372 	unix_state_unlock(other);
2373 	kfree_skb(skb);
2374 pipe_err:
2375 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2376 		send_sig(SIGPIPE, current, 0);
2377 	err = -EPIPE;
2378 out_err:
2379 	scm_destroy(&scm);
2380 	return sent ? : err;
2381 }
2382 
2383 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2384 				  size_t len)
2385 {
2386 	int err;
2387 	struct sock *sk = sock->sk;
2388 
2389 	err = sock_error(sk);
2390 	if (err)
2391 		return err;
2392 
2393 	if (sk->sk_state != TCP_ESTABLISHED)
2394 		return -ENOTCONN;
2395 
2396 	if (msg->msg_namelen)
2397 		msg->msg_namelen = 0;
2398 
2399 	return unix_dgram_sendmsg(sock, msg, len);
2400 }
2401 
2402 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2403 				  size_t size, int flags)
2404 {
2405 	struct sock *sk = sock->sk;
2406 
2407 	if (sk->sk_state != TCP_ESTABLISHED)
2408 		return -ENOTCONN;
2409 
2410 	return unix_dgram_recvmsg(sock, msg, size, flags);
2411 }
2412 
2413 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2414 {
2415 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2416 
2417 	if (addr) {
2418 		msg->msg_namelen = addr->len;
2419 		memcpy(msg->msg_name, addr->name, addr->len);
2420 	}
2421 }
2422 
2423 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2424 			 int flags)
2425 {
2426 	struct scm_cookie scm;
2427 	struct socket *sock = sk->sk_socket;
2428 	struct unix_sock *u = unix_sk(sk);
2429 	struct sk_buff *skb, *last;
2430 	long timeo;
2431 	int skip;
2432 	int err;
2433 
2434 	err = -EOPNOTSUPP;
2435 	if (flags&MSG_OOB)
2436 		goto out;
2437 
2438 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2439 
2440 	do {
2441 		mutex_lock(&u->iolock);
2442 
2443 		skip = sk_peek_offset(sk, flags);
2444 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2445 					      &skip, &err, &last);
2446 		if (skb) {
2447 			if (!(flags & MSG_PEEK))
2448 				scm_stat_del(sk, skb);
2449 			break;
2450 		}
2451 
2452 		mutex_unlock(&u->iolock);
2453 
2454 		if (err != -EAGAIN)
2455 			break;
2456 	} while (timeo &&
2457 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2458 					      &err, &timeo, last));
2459 
2460 	if (!skb) { /* implies iolock unlocked */
2461 		unix_state_lock(sk);
2462 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2463 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2464 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2465 			err = 0;
2466 		unix_state_unlock(sk);
2467 		goto out;
2468 	}
2469 
2470 	if (wq_has_sleeper(&u->peer_wait))
2471 		wake_up_interruptible_sync_poll(&u->peer_wait,
2472 						EPOLLOUT | EPOLLWRNORM |
2473 						EPOLLWRBAND);
2474 
2475 	if (msg->msg_name) {
2476 		unix_copy_addr(msg, skb->sk);
2477 
2478 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2479 						      msg->msg_name,
2480 						      &msg->msg_namelen);
2481 	}
2482 
2483 	if (size > skb->len - skip)
2484 		size = skb->len - skip;
2485 	else if (size < skb->len - skip)
2486 		msg->msg_flags |= MSG_TRUNC;
2487 
2488 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2489 	if (err)
2490 		goto out_free;
2491 
2492 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2493 		__sock_recv_timestamp(msg, sk, skb);
2494 
2495 	memset(&scm, 0, sizeof(scm));
2496 
2497 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2498 	unix_set_secdata(&scm, skb);
2499 
2500 	if (!(flags & MSG_PEEK)) {
2501 		if (UNIXCB(skb).fp)
2502 			unix_detach_fds(&scm, skb);
2503 
2504 		sk_peek_offset_bwd(sk, skb->len);
2505 	} else {
2506 		/* It is questionable: on PEEK we could:
2507 		   - not return fds - good, but too simple 8)
2508 		   - return fds, and then not return them on read (old strategy,
2509 		     apparently wrong)
2510 		   - clone fds (I chose it for now, it is the most universal
2511 		     solution)
2512 
2513 		   POSIX 1003.1g does not actually define this clearly
2514 		   at all. POSIX 1003.1g doesn't define a lot of things
2515 		   clearly however!
2516 
2517 		*/
2518 
2519 		sk_peek_offset_fwd(sk, size);
2520 
2521 		if (UNIXCB(skb).fp)
2522 			unix_peek_fds(&scm, skb);
2523 	}
2524 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2525 
2526 	scm_recv_unix(sock, msg, &scm, flags);
2527 
2528 out_free:
2529 	skb_free_datagram(sk, skb);
2530 	mutex_unlock(&u->iolock);
2531 out:
2532 	return err;
2533 }
2534 
2535 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2536 			      int flags)
2537 {
2538 	struct sock *sk = sock->sk;
2539 
2540 #ifdef CONFIG_BPF_SYSCALL
2541 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2542 
2543 	if (prot != &unix_dgram_proto)
2544 		return prot->recvmsg(sk, msg, size, flags, NULL);
2545 #endif
2546 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2547 }
2548 
2549 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2550 {
2551 	struct unix_sock *u = unix_sk(sk);
2552 	struct sk_buff *skb;
2553 	int err;
2554 
2555 	mutex_lock(&u->iolock);
2556 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2557 	mutex_unlock(&u->iolock);
2558 	if (!skb)
2559 		return err;
2560 
2561 	return recv_actor(sk, skb);
2562 }
2563 
2564 /*
2565  *	Sleep until more data has arrived. But check for races..
2566  */
2567 static long unix_stream_data_wait(struct sock *sk, long timeo,
2568 				  struct sk_buff *last, unsigned int last_len,
2569 				  bool freezable)
2570 {
2571 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2572 	struct sk_buff *tail;
2573 	DEFINE_WAIT(wait);
2574 
2575 	unix_state_lock(sk);
2576 
2577 	for (;;) {
2578 		prepare_to_wait(sk_sleep(sk), &wait, state);
2579 
2580 		tail = skb_peek_tail(&sk->sk_receive_queue);
2581 		if (tail != last ||
2582 		    (tail && tail->len != last_len) ||
2583 		    sk->sk_err ||
2584 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2585 		    signal_pending(current) ||
2586 		    !timeo)
2587 			break;
2588 
2589 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2590 		unix_state_unlock(sk);
2591 		timeo = schedule_timeout(timeo);
2592 		unix_state_lock(sk);
2593 
2594 		if (sock_flag(sk, SOCK_DEAD))
2595 			break;
2596 
2597 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2598 	}
2599 
2600 	finish_wait(sk_sleep(sk), &wait);
2601 	unix_state_unlock(sk);
2602 	return timeo;
2603 }
2604 
2605 static unsigned int unix_skb_len(const struct sk_buff *skb)
2606 {
2607 	return skb->len - UNIXCB(skb).consumed;
2608 }
2609 
2610 struct unix_stream_read_state {
2611 	int (*recv_actor)(struct sk_buff *, int, int,
2612 			  struct unix_stream_read_state *);
2613 	struct socket *socket;
2614 	struct msghdr *msg;
2615 	struct pipe_inode_info *pipe;
2616 	size_t size;
2617 	int flags;
2618 	unsigned int splice_flags;
2619 };
2620 
2621 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2622 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2623 {
2624 	struct socket *sock = state->socket;
2625 	struct sock *sk = sock->sk;
2626 	struct unix_sock *u = unix_sk(sk);
2627 	int chunk = 1;
2628 	struct sk_buff *oob_skb;
2629 
2630 	mutex_lock(&u->iolock);
2631 	unix_state_lock(sk);
2632 
2633 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2634 		unix_state_unlock(sk);
2635 		mutex_unlock(&u->iolock);
2636 		return -EINVAL;
2637 	}
2638 
2639 	oob_skb = u->oob_skb;
2640 
2641 	if (!(state->flags & MSG_PEEK))
2642 		WRITE_ONCE(u->oob_skb, NULL);
2643 	else
2644 		skb_get(oob_skb);
2645 	unix_state_unlock(sk);
2646 
2647 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2648 
2649 	if (!(state->flags & MSG_PEEK))
2650 		UNIXCB(oob_skb).consumed += 1;
2651 
2652 	consume_skb(oob_skb);
2653 
2654 	mutex_unlock(&u->iolock);
2655 
2656 	if (chunk < 0)
2657 		return -EFAULT;
2658 
2659 	state->msg->msg_flags |= MSG_OOB;
2660 	return 1;
2661 }
2662 
2663 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2664 				  int flags, int copied)
2665 {
2666 	struct unix_sock *u = unix_sk(sk);
2667 
2668 	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2669 		skb_unlink(skb, &sk->sk_receive_queue);
2670 		consume_skb(skb);
2671 		skb = NULL;
2672 	} else {
2673 		if (skb == u->oob_skb) {
2674 			if (copied) {
2675 				skb = NULL;
2676 			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2677 				if (!(flags & MSG_PEEK)) {
2678 					WRITE_ONCE(u->oob_skb, NULL);
2679 					consume_skb(skb);
2680 				}
2681 			} else if (!(flags & MSG_PEEK)) {
2682 				skb_unlink(skb, &sk->sk_receive_queue);
2683 				consume_skb(skb);
2684 				skb = skb_peek(&sk->sk_receive_queue);
2685 			}
2686 		}
2687 	}
2688 	return skb;
2689 }
2690 #endif
2691 
2692 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2693 {
2694 	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2695 		return -ENOTCONN;
2696 
2697 	return unix_read_skb(sk, recv_actor);
2698 }
2699 
2700 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2701 				    bool freezable)
2702 {
2703 	struct scm_cookie scm;
2704 	struct socket *sock = state->socket;
2705 	struct sock *sk = sock->sk;
2706 	struct unix_sock *u = unix_sk(sk);
2707 	int copied = 0;
2708 	int flags = state->flags;
2709 	int noblock = flags & MSG_DONTWAIT;
2710 	bool check_creds = false;
2711 	int target;
2712 	int err = 0;
2713 	long timeo;
2714 	int skip;
2715 	size_t size = state->size;
2716 	unsigned int last_len;
2717 
2718 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2719 		err = -EINVAL;
2720 		goto out;
2721 	}
2722 
2723 	if (unlikely(flags & MSG_OOB)) {
2724 		err = -EOPNOTSUPP;
2725 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2726 		err = unix_stream_recv_urg(state);
2727 #endif
2728 		goto out;
2729 	}
2730 
2731 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2732 	timeo = sock_rcvtimeo(sk, noblock);
2733 
2734 	memset(&scm, 0, sizeof(scm));
2735 
2736 	/* Lock the socket to prevent queue disordering
2737 	 * while we sleep copying data to the message
2738 	 */
2739 	mutex_lock(&u->iolock);
2740 
2741 	skip = max(sk_peek_offset(sk, flags), 0);
2742 
2743 	do {
2744 		int chunk;
2745 		bool drop_skb;
2746 		struct sk_buff *skb, *last;
2747 
2748 redo:
2749 		unix_state_lock(sk);
2750 		if (sock_flag(sk, SOCK_DEAD)) {
2751 			err = -ECONNRESET;
2752 			goto unlock;
2753 		}
2754 		last = skb = skb_peek(&sk->sk_receive_queue);
2755 		last_len = last ? last->len : 0;
2756 
2757 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2758 		if (skb) {
2759 			skb = manage_oob(skb, sk, flags, copied);
2760 			if (!skb) {
2761 				unix_state_unlock(sk);
2762 				if (copied)
2763 					break;
2764 				goto redo;
2765 			}
2766 		}
2767 #endif
2768 again:
2769 		if (skb == NULL) {
2770 			if (copied >= target)
2771 				goto unlock;
2772 
2773 			/*
2774 			 *	POSIX 1003.1g mandates this order.
2775 			 */
2776 
2777 			err = sock_error(sk);
2778 			if (err)
2779 				goto unlock;
2780 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2781 				goto unlock;
2782 
2783 			unix_state_unlock(sk);
2784 			if (!timeo) {
2785 				err = -EAGAIN;
2786 				break;
2787 			}
2788 
2789 			mutex_unlock(&u->iolock);
2790 
2791 			timeo = unix_stream_data_wait(sk, timeo, last,
2792 						      last_len, freezable);
2793 
2794 			if (signal_pending(current)) {
2795 				err = sock_intr_errno(timeo);
2796 				scm_destroy(&scm);
2797 				goto out;
2798 			}
2799 
2800 			mutex_lock(&u->iolock);
2801 			goto redo;
2802 unlock:
2803 			unix_state_unlock(sk);
2804 			break;
2805 		}
2806 
2807 		while (skip >= unix_skb_len(skb)) {
2808 			skip -= unix_skb_len(skb);
2809 			last = skb;
2810 			last_len = skb->len;
2811 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2812 			if (!skb)
2813 				goto again;
2814 		}
2815 
2816 		unix_state_unlock(sk);
2817 
2818 		if (check_creds) {
2819 			/* Never glue messages from different writers */
2820 			if (!unix_skb_scm_eq(skb, &scm))
2821 				break;
2822 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2823 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2824 			/* Copy credentials */
2825 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2826 			unix_set_secdata(&scm, skb);
2827 			check_creds = true;
2828 		}
2829 
2830 		/* Copy address just once */
2831 		if (state->msg && state->msg->msg_name) {
2832 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2833 					 state->msg->msg_name);
2834 			unix_copy_addr(state->msg, skb->sk);
2835 
2836 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2837 							      state->msg->msg_name,
2838 							      &state->msg->msg_namelen);
2839 
2840 			sunaddr = NULL;
2841 		}
2842 
2843 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2844 		skb_get(skb);
2845 		chunk = state->recv_actor(skb, skip, chunk, state);
2846 		drop_skb = !unix_skb_len(skb);
2847 		/* skb is only safe to use if !drop_skb */
2848 		consume_skb(skb);
2849 		if (chunk < 0) {
2850 			if (copied == 0)
2851 				copied = -EFAULT;
2852 			break;
2853 		}
2854 		copied += chunk;
2855 		size -= chunk;
2856 
2857 		if (drop_skb) {
2858 			/* the skb was touched by a concurrent reader;
2859 			 * we should not expect anything from this skb
2860 			 * anymore and must assume it is invalid - we can be
2861 			 * sure it was dropped from the socket queue
2862 			 *
2863 			 * let's report a short read
2864 			 */
2865 			err = 0;
2866 			break;
2867 		}
2868 
2869 		/* Mark read part of skb as used */
2870 		if (!(flags & MSG_PEEK)) {
2871 			UNIXCB(skb).consumed += chunk;
2872 
2873 			sk_peek_offset_bwd(sk, chunk);
2874 
2875 			if (UNIXCB(skb).fp) {
2876 				scm_stat_del(sk, skb);
2877 				unix_detach_fds(&scm, skb);
2878 			}
2879 
2880 			if (unix_skb_len(skb))
2881 				break;
2882 
2883 			skb_unlink(skb, &sk->sk_receive_queue);
2884 			consume_skb(skb);
2885 
2886 			if (scm.fp)
2887 				break;
2888 		} else {
2889 			/* It is questionable, see note in unix_dgram_recvmsg.
2890 			 */
2891 			if (UNIXCB(skb).fp)
2892 				unix_peek_fds(&scm, skb);
2893 
2894 			sk_peek_offset_fwd(sk, chunk);
2895 
2896 			if (UNIXCB(skb).fp)
2897 				break;
2898 
2899 			skip = 0;
2900 			last = skb;
2901 			last_len = skb->len;
2902 			unix_state_lock(sk);
2903 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2904 			if (skb)
2905 				goto again;
2906 			unix_state_unlock(sk);
2907 			break;
2908 		}
2909 	} while (size);
2910 
2911 	mutex_unlock(&u->iolock);
2912 	if (state->msg)
2913 		scm_recv_unix(sock, state->msg, &scm, flags);
2914 	else
2915 		scm_destroy(&scm);
2916 out:
2917 	return copied ? : err;
2918 }
2919 
2920 static int unix_stream_read_actor(struct sk_buff *skb,
2921 				  int skip, int chunk,
2922 				  struct unix_stream_read_state *state)
2923 {
2924 	int ret;
2925 
2926 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2927 				    state->msg, chunk);
2928 	return ret ?: chunk;
2929 }
2930 
2931 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2932 			  size_t size, int flags)
2933 {
2934 	struct unix_stream_read_state state = {
2935 		.recv_actor = unix_stream_read_actor,
2936 		.socket = sk->sk_socket,
2937 		.msg = msg,
2938 		.size = size,
2939 		.flags = flags
2940 	};
2941 
2942 	return unix_stream_read_generic(&state, true);
2943 }
2944 
2945 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2946 			       size_t size, int flags)
2947 {
2948 	struct unix_stream_read_state state = {
2949 		.recv_actor = unix_stream_read_actor,
2950 		.socket = sock,
2951 		.msg = msg,
2952 		.size = size,
2953 		.flags = flags
2954 	};
2955 
2956 #ifdef CONFIG_BPF_SYSCALL
2957 	struct sock *sk = sock->sk;
2958 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2959 
2960 	if (prot != &unix_stream_proto)
2961 		return prot->recvmsg(sk, msg, size, flags, NULL);
2962 #endif
2963 	return unix_stream_read_generic(&state, true);
2964 }
2965 
2966 static int unix_stream_splice_actor(struct sk_buff *skb,
2967 				    int skip, int chunk,
2968 				    struct unix_stream_read_state *state)
2969 {
2970 	return skb_splice_bits(skb, state->socket->sk,
2971 			       UNIXCB(skb).consumed + skip,
2972 			       state->pipe, chunk, state->splice_flags);
2973 }
2974 
2975 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2976 				       struct pipe_inode_info *pipe,
2977 				       size_t size, unsigned int flags)
2978 {
2979 	struct unix_stream_read_state state = {
2980 		.recv_actor = unix_stream_splice_actor,
2981 		.socket = sock,
2982 		.pipe = pipe,
2983 		.size = size,
2984 		.splice_flags = flags,
2985 	};
2986 
2987 	if (unlikely(*ppos))
2988 		return -ESPIPE;
2989 
2990 	if (sock->file->f_flags & O_NONBLOCK ||
2991 	    flags & SPLICE_F_NONBLOCK)
2992 		state.flags = MSG_DONTWAIT;
2993 
2994 	return unix_stream_read_generic(&state, false);
2995 }
2996 
2997 static int unix_shutdown(struct socket *sock, int mode)
2998 {
2999 	struct sock *sk = sock->sk;
3000 	struct sock *other;
3001 
3002 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3003 		return -EINVAL;
3004 	/* This maps:
3005 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3006 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3007 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3008 	 */
3009 	++mode;
3010 
3011 	unix_state_lock(sk);
3012 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3013 	other = unix_peer(sk);
3014 	if (other)
3015 		sock_hold(other);
3016 	unix_state_unlock(sk);
3017 	sk->sk_state_change(sk);
3018 
3019 	if (other &&
3020 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3021 
3022 		int peer_mode = 0;
3023 		const struct proto *prot = READ_ONCE(other->sk_prot);
3024 
3025 		if (prot->unhash)
3026 			prot->unhash(other);
3027 		if (mode&RCV_SHUTDOWN)
3028 			peer_mode |= SEND_SHUTDOWN;
3029 		if (mode&SEND_SHUTDOWN)
3030 			peer_mode |= RCV_SHUTDOWN;
3031 		unix_state_lock(other);
3032 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3033 		unix_state_unlock(other);
3034 		other->sk_state_change(other);
3035 		if (peer_mode == SHUTDOWN_MASK)
3036 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3037 		else if (peer_mode & RCV_SHUTDOWN)
3038 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3039 	}
3040 	if (other)
3041 		sock_put(other);
3042 
3043 	return 0;
3044 }
3045 
3046 long unix_inq_len(struct sock *sk)
3047 {
3048 	struct sk_buff *skb;
3049 	long amount = 0;
3050 
3051 	if (sk->sk_state == TCP_LISTEN)
3052 		return -EINVAL;
3053 
3054 	spin_lock(&sk->sk_receive_queue.lock);
3055 	if (sk->sk_type == SOCK_STREAM ||
3056 	    sk->sk_type == SOCK_SEQPACKET) {
3057 		skb_queue_walk(&sk->sk_receive_queue, skb)
3058 			amount += unix_skb_len(skb);
3059 	} else {
3060 		skb = skb_peek(&sk->sk_receive_queue);
3061 		if (skb)
3062 			amount = skb->len;
3063 	}
3064 	spin_unlock(&sk->sk_receive_queue.lock);
3065 
3066 	return amount;
3067 }
3068 EXPORT_SYMBOL_GPL(unix_inq_len);
3069 
3070 long unix_outq_len(struct sock *sk)
3071 {
3072 	return sk_wmem_alloc_get(sk);
3073 }
3074 EXPORT_SYMBOL_GPL(unix_outq_len);
3075 
3076 static int unix_open_file(struct sock *sk)
3077 {
3078 	struct path path;
3079 	struct file *f;
3080 	int fd;
3081 
3082 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3083 		return -EPERM;
3084 
3085 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3086 		return -ENOENT;
3087 
3088 	path = unix_sk(sk)->path;
3089 	if (!path.dentry)
3090 		return -ENOENT;
3091 
3092 	path_get(&path);
3093 
3094 	fd = get_unused_fd_flags(O_CLOEXEC);
3095 	if (fd < 0)
3096 		goto out;
3097 
3098 	f = dentry_open(&path, O_PATH, current_cred());
3099 	if (IS_ERR(f)) {
3100 		put_unused_fd(fd);
3101 		fd = PTR_ERR(f);
3102 		goto out;
3103 	}
3104 
3105 	fd_install(fd, f);
3106 out:
3107 	path_put(&path);
3108 
3109 	return fd;
3110 }
3111 
3112 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3113 {
3114 	struct sock *sk = sock->sk;
3115 	long amount = 0;
3116 	int err;
3117 
3118 	switch (cmd) {
3119 	case SIOCOUTQ:
3120 		amount = unix_outq_len(sk);
3121 		err = put_user(amount, (int __user *)arg);
3122 		break;
3123 	case SIOCINQ:
3124 		amount = unix_inq_len(sk);
3125 		if (amount < 0)
3126 			err = amount;
3127 		else
3128 			err = put_user(amount, (int __user *)arg);
3129 		break;
3130 	case SIOCUNIXFILE:
3131 		err = unix_open_file(sk);
3132 		break;
3133 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3134 	case SIOCATMARK:
3135 		{
3136 			struct sk_buff *skb;
3137 			int answ = 0;
3138 
3139 			skb = skb_peek(&sk->sk_receive_queue);
3140 			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3141 				answ = 1;
3142 			err = put_user(answ, (int __user *)arg);
3143 		}
3144 		break;
3145 #endif
3146 	default:
3147 		err = -ENOIOCTLCMD;
3148 		break;
3149 	}
3150 	return err;
3151 }
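
/*
 * Illustrative only (not kernel code): how the ioctls above look from
 * userspace.  Assumes an AF_UNIX socket "fd", <sys/ioctl.h> and
 * <linux/sockios.h>.
 *
 *	int inq, outq, atmark;
 *
 *	ioctl(fd, SIOCINQ, &inq);	// unread bytes (stream) or next
 *					// datagram length, see unix_inq_len()
 *	ioctl(fd, SIOCOUTQ, &outq);	// bytes sent but not yet consumed
 *					// by the peer, see unix_outq_len()
 *	ioctl(fd, SIOCATMARK, &atmark);	// 1 if the next unread byte is the
 *					// OOB byte (CONFIG_AF_UNIX_OOB only)
 */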
3152 
3153 #ifdef CONFIG_COMPAT
3154 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3155 {
3156 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3157 }
3158 #endif
3159 
3160 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3161 {
3162 	struct sock *sk = sock->sk;
3163 	__poll_t mask;
3164 	u8 shutdown;
3165 
3166 	sock_poll_wait(file, sock, wait);
3167 	mask = 0;
3168 	shutdown = READ_ONCE(sk->sk_shutdown);
3169 
3170 	/* exceptional events? */
3171 	if (READ_ONCE(sk->sk_err))
3172 		mask |= EPOLLERR;
3173 	if (shutdown == SHUTDOWN_MASK)
3174 		mask |= EPOLLHUP;
3175 	if (shutdown & RCV_SHUTDOWN)
3176 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3177 
3178 	/* readable? */
3179 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3180 		mask |= EPOLLIN | EPOLLRDNORM;
3181 	if (sk_is_readable(sk))
3182 		mask |= EPOLLIN | EPOLLRDNORM;
3183 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3184 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3185 		mask |= EPOLLPRI;
3186 #endif
3187 
3188 	/* Connection-based need to check for termination and startup */
3189 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3190 	    sk->sk_state == TCP_CLOSE)
3191 		mask |= EPOLLHUP;
3192 
3193 	/*
3194 	 * we set writable also when the other side has shut down the
3195 	 * connection. This prevents stuck sockets.
3196 	 */
3197 	if (unix_writable(sk))
3198 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3199 
3200 	return mask;
3201 }
3202 
3203 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3204 				    poll_table *wait)
3205 {
3206 	struct sock *sk = sock->sk, *other;
3207 	unsigned int writable;
3208 	__poll_t mask;
3209 	u8 shutdown;
3210 
3211 	sock_poll_wait(file, sock, wait);
3212 	mask = 0;
3213 	shutdown = READ_ONCE(sk->sk_shutdown);
3214 
3215 	/* exceptional events? */
3216 	if (READ_ONCE(sk->sk_err) ||
3217 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3218 		mask |= EPOLLERR |
3219 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3220 
3221 	if (shutdown & RCV_SHUTDOWN)
3222 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3223 	if (shutdown == SHUTDOWN_MASK)
3224 		mask |= EPOLLHUP;
3225 
3226 	/* readable? */
3227 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3228 		mask |= EPOLLIN | EPOLLRDNORM;
3229 	if (sk_is_readable(sk))
3230 		mask |= EPOLLIN | EPOLLRDNORM;
3231 
3232 	/* Connection-based need to check for termination and startup */
3233 	if (sk->sk_type == SOCK_SEQPACKET) {
3234 		if (sk->sk_state == TCP_CLOSE)
3235 			mask |= EPOLLHUP;
3236 		/* connection hasn't started yet? */
3237 		if (sk->sk_state == TCP_SYN_SENT)
3238 			return mask;
3239 	}
3240 
3241 	/* No write status requested, avoid expensive OUT tests. */
3242 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3243 		return mask;
3244 
3245 	writable = unix_writable(sk);
3246 	if (writable) {
3247 		unix_state_lock(sk);
3248 
3249 		other = unix_peer(sk);
3250 		if (other && unix_peer(other) != sk &&
3251 		    unix_recvq_full_lockless(other) &&
3252 		    unix_dgram_peer_wake_me(sk, other))
3253 			writable = 0;
3254 
3255 		unix_state_unlock(sk);
3256 	}
3257 
3258 	if (writable)
3259 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3260 	else
3261 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3262 
3263 	return mask;
3264 }
3265 
3266 #ifdef CONFIG_PROC_FS
3267 
3268 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3269 
3270 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3271 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3272 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3273 
3274 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3275 {
3276 	unsigned long offset = get_offset(*pos);
3277 	unsigned long bucket = get_bucket(*pos);
3278 	unsigned long count = 0;
3279 	struct sock *sk;
3280 
3281 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3282 	     sk; sk = sk_next(sk)) {
3283 		if (++count == offset)
3284 			break;
3285 	}
3286 
3287 	return sk;
3288 }
3289 
3290 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3291 {
3292 	unsigned long bucket = get_bucket(*pos);
3293 	struct net *net = seq_file_net(seq);
3294 	struct sock *sk;
3295 
3296 	while (bucket < UNIX_HASH_SIZE) {
3297 		spin_lock(&net->unx.table.locks[bucket]);
3298 
3299 		sk = unix_from_bucket(seq, pos);
3300 		if (sk)
3301 			return sk;
3302 
3303 		spin_unlock(&net->unx.table.locks[bucket]);
3304 
3305 		*pos = set_bucket_offset(++bucket, 1);
3306 	}
3307 
3308 	return NULL;
3309 }
3310 
3311 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3312 				  loff_t *pos)
3313 {
3314 	unsigned long bucket = get_bucket(*pos);
3315 
3316 	sk = sk_next(sk);
3317 	if (sk)
3318 		return sk;
3319 
3320 
3321 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3322 
3323 	*pos = set_bucket_offset(++bucket, 1);
3324 
3325 	return unix_get_first(seq, pos);
3326 }
3327 
3328 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3329 {
3330 	if (!*pos)
3331 		return SEQ_START_TOKEN;
3332 
3333 	return unix_get_first(seq, pos);
3334 }
3335 
3336 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3337 {
3338 	++*pos;
3339 
3340 	if (v == SEQ_START_TOKEN)
3341 		return unix_get_first(seq, pos);
3342 
3343 	return unix_get_next(seq, v, pos);
3344 }
3345 
3346 static void unix_seq_stop(struct seq_file *seq, void *v)
3347 {
3348 	struct sock *sk = v;
3349 
3350 	if (sk)
3351 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3352 }
3353 
3354 static int unix_seq_show(struct seq_file *seq, void *v)
3355 {
3356 
3357 	if (v == SEQ_START_TOKEN)
3358 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3359 			 "Inode Path\n");
3360 	else {
3361 		struct sock *s = v;
3362 		struct unix_sock *u = unix_sk(s);
3363 		unix_state_lock(s);
3364 
3365 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3366 			s,
3367 			refcount_read(&s->sk_refcnt),
3368 			0,
3369 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3370 			s->sk_type,
3371 			s->sk_socket ?
3372 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3373 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3374 			sock_i_ino(s));
3375 
3376 		if (u->addr) {	/* under a hash table lock here */
3377 			int i, len;
3378 			seq_putc(seq, ' ');
3379 
3380 			i = 0;
3381 			len = u->addr->len -
3382 				offsetof(struct sockaddr_un, sun_path);
3383 			if (u->addr->name->sun_path[0]) {
3384 				len--;
3385 			} else {
3386 				seq_putc(seq, '@');
3387 				i++;
3388 			}
3389 			for ( ; i < len; i++)
3390 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3391 					 '@');
3392 		}
3393 		unix_state_unlock(s);
3394 		seq_putc(seq, '\n');
3395 	}
3396 
3397 	return 0;
3398 }
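
/*
 * Example of the resulting /proc/net/unix line (illustrative values only;
 * the first column is a %pK-hashed kernel pointer, typically zeroed for
 * unprivileged readers):
 *
 *	0000000000000000: 00000002 00000000 00010000 0001 01 23456 @/tmp/.X11-unix/X0
 *
 * i.e. address, refcount, protocol (always 0), flags (__SO_ACCEPTCON when
 * listening), type, state, inode, and the bound path ('@' marks an
 * abstract address).
 */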
3399 
3400 static const struct seq_operations unix_seq_ops = {
3401 	.start  = unix_seq_start,
3402 	.next   = unix_seq_next,
3403 	.stop   = unix_seq_stop,
3404 	.show   = unix_seq_show,
3405 };
3406 
3407 #ifdef CONFIG_BPF_SYSCALL
3408 struct bpf_unix_iter_state {
3409 	struct seq_net_private p;
3410 	unsigned int cur_sk;
3411 	unsigned int end_sk;
3412 	unsigned int max_sk;
3413 	struct sock **batch;
3414 	bool st_bucket_done;
3415 };
3416 
3417 struct bpf_iter__unix {
3418 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3419 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3420 	uid_t uid __aligned(8);
3421 };
3422 
3423 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3424 			      struct unix_sock *unix_sk, uid_t uid)
3425 {
3426 	struct bpf_iter__unix ctx;
3427 
3428 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3429 	ctx.meta = meta;
3430 	ctx.unix_sk = unix_sk;
3431 	ctx.uid = uid;
3432 	return bpf_iter_run_prog(prog, &ctx);
3433 }
3434 
3435 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3436 
3437 {
3438 	struct bpf_unix_iter_state *iter = seq->private;
3439 	unsigned int expected = 1;
3440 	struct sock *sk;
3441 
3442 	sock_hold(start_sk);
3443 	iter->batch[iter->end_sk++] = start_sk;
3444 
3445 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3446 		if (iter->end_sk < iter->max_sk) {
3447 			sock_hold(sk);
3448 			iter->batch[iter->end_sk++] = sk;
3449 		}
3450 
3451 		expected++;
3452 	}
3453 
3454 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3455 
3456 	return expected;
3457 }
3458 
3459 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3460 {
3461 	while (iter->cur_sk < iter->end_sk)
3462 		sock_put(iter->batch[iter->cur_sk++]);
3463 }
3464 
3465 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3466 				       unsigned int new_batch_sz)
3467 {
3468 	struct sock **new_batch;
3469 
3470 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3471 			     GFP_USER | __GFP_NOWARN);
3472 	if (!new_batch)
3473 		return -ENOMEM;
3474 
3475 	bpf_iter_unix_put_batch(iter);
3476 	kvfree(iter->batch);
3477 	iter->batch = new_batch;
3478 	iter->max_sk = new_batch_sz;
3479 
3480 	return 0;
3481 }
3482 
3483 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3484 					loff_t *pos)
3485 {
3486 	struct bpf_unix_iter_state *iter = seq->private;
3487 	unsigned int expected;
3488 	bool resized = false;
3489 	struct sock *sk;
3490 
3491 	if (iter->st_bucket_done)
3492 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3493 
3494 again:
3495 	/* Get a new batch */
3496 	iter->cur_sk = 0;
3497 	iter->end_sk = 0;
3498 
3499 	sk = unix_get_first(seq, pos);
3500 	if (!sk)
3501 		return NULL; /* Done */
3502 
3503 	expected = bpf_iter_unix_hold_batch(seq, sk);
3504 
3505 	if (iter->end_sk == expected) {
3506 		iter->st_bucket_done = true;
3507 		return sk;
3508 	}
3509 
3510 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3511 		resized = true;
3512 		goto again;
3513 	}
3514 
3515 	return sk;
3516 }
3517 
3518 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3519 {
3520 	if (!*pos)
3521 		return SEQ_START_TOKEN;
3522 
3523 	/* bpf iter does not support lseek, so it always
3524 	 * continues from where it was stop()-ped.
3525 	 */
3526 	return bpf_iter_unix_batch(seq, pos);
3527 }
3528 
3529 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3530 {
3531 	struct bpf_unix_iter_state *iter = seq->private;
3532 	struct sock *sk;
3533 
3534 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
3535 	 * already been through seq_show(), so advance to the next sk
3536 	 * in the batch.
3537 	 */
3538 	if (iter->cur_sk < iter->end_sk)
3539 		sock_put(iter->batch[iter->cur_sk++]);
3540 
3541 	++*pos;
3542 
3543 	if (iter->cur_sk < iter->end_sk)
3544 		sk = iter->batch[iter->cur_sk];
3545 	else
3546 		sk = bpf_iter_unix_batch(seq, pos);
3547 
3548 	return sk;
3549 }
3550 
3551 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3552 {
3553 	struct bpf_iter_meta meta;
3554 	struct bpf_prog *prog;
3555 	struct sock *sk = v;
3556 	uid_t uid;
3557 	bool slow;
3558 	int ret;
3559 
3560 	if (v == SEQ_START_TOKEN)
3561 		return 0;
3562 
3563 	slow = lock_sock_fast(sk);
3564 
3565 	if (unlikely(sk_unhashed(sk))) {
3566 		ret = SEQ_SKIP;
3567 		goto unlock;
3568 	}
3569 
3570 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3571 	meta.seq = seq;
3572 	prog = bpf_iter_get_info(&meta, false);
3573 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3574 unlock:
3575 	unlock_sock_fast(sk, slow);
3576 	return ret;
3577 }
3578 
3579 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3580 {
3581 	struct bpf_unix_iter_state *iter = seq->private;
3582 	struct bpf_iter_meta meta;
3583 	struct bpf_prog *prog;
3584 
3585 	if (!v) {
3586 		meta.seq = seq;
3587 		prog = bpf_iter_get_info(&meta, true);
3588 		if (prog)
3589 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3590 	}
3591 
3592 	if (iter->cur_sk < iter->end_sk)
3593 		bpf_iter_unix_put_batch(iter);
3594 }
3595 
3596 static const struct seq_operations bpf_iter_unix_seq_ops = {
3597 	.start	= bpf_iter_unix_seq_start,
3598 	.next	= bpf_iter_unix_seq_next,
3599 	.stop	= bpf_iter_unix_seq_stop,
3600 	.show	= bpf_iter_unix_seq_show,
3601 };
3602 #endif
3603 #endif
3604 
3605 static const struct net_proto_family unix_family_ops = {
3606 	.family = PF_UNIX,
3607 	.create = unix_create,
3608 	.owner	= THIS_MODULE,
3609 };
3610 
3611 
3612 static int __net_init unix_net_init(struct net *net)
3613 {
3614 	int i;
3615 
3616 	net->unx.sysctl_max_dgram_qlen = 10;
3617 	if (unix_sysctl_register(net))
3618 		goto out;
3619 
3620 #ifdef CONFIG_PROC_FS
3621 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3622 			     sizeof(struct seq_net_private)))
3623 		goto err_sysctl;
3624 #endif
3625 
3626 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3627 					      sizeof(spinlock_t), GFP_KERNEL);
3628 	if (!net->unx.table.locks)
3629 		goto err_proc;
3630 
3631 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3632 						sizeof(struct hlist_head),
3633 						GFP_KERNEL);
3634 	if (!net->unx.table.buckets)
3635 		goto free_locks;
3636 
3637 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3638 		spin_lock_init(&net->unx.table.locks[i]);
3639 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3640 	}
3641 
3642 	return 0;
3643 
3644 free_locks:
3645 	kvfree(net->unx.table.locks);
3646 err_proc:
3647 #ifdef CONFIG_PROC_FS
3648 	remove_proc_entry("unix", net->proc_net);
3649 err_sysctl:
3650 #endif
3651 	unix_sysctl_unregister(net);
3652 out:
3653 	return -ENOMEM;
3654 }
3655 
3656 static void __net_exit unix_net_exit(struct net *net)
3657 {
3658 	kvfree(net->unx.table.buckets);
3659 	kvfree(net->unx.table.locks);
3660 	unix_sysctl_unregister(net);
3661 	remove_proc_entry("unix", net->proc_net);
3662 }
3663 
3664 static struct pernet_operations unix_net_ops = {
3665 	.init = unix_net_init,
3666 	.exit = unix_net_exit,
3667 };
3668 
3669 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3670 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3671 		     struct unix_sock *unix_sk, uid_t uid)
3672 
3673 #define INIT_BATCH_SZ 16
3674 
3675 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3676 {
3677 	struct bpf_unix_iter_state *iter = priv_data;
3678 	int err;
3679 
3680 	err = bpf_iter_init_seq_net(priv_data, aux);
3681 	if (err)
3682 		return err;
3683 
3684 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3685 	if (err) {
3686 		bpf_iter_fini_seq_net(priv_data);
3687 		return err;
3688 	}
3689 
3690 	return 0;
3691 }
3692 
3693 static void bpf_iter_fini_unix(void *priv_data)
3694 {
3695 	struct bpf_unix_iter_state *iter = priv_data;
3696 
3697 	bpf_iter_fini_seq_net(priv_data);
3698 	kvfree(iter->batch);
3699 }
3700 
3701 static const struct bpf_iter_seq_info unix_seq_info = {
3702 	.seq_ops		= &bpf_iter_unix_seq_ops,
3703 	.init_seq_private	= bpf_iter_init_unix,
3704 	.fini_seq_private	= bpf_iter_fini_unix,
3705 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3706 };
3707 
3708 static const struct bpf_func_proto *
3709 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3710 			     const struct bpf_prog *prog)
3711 {
3712 	switch (func_id) {
3713 	case BPF_FUNC_setsockopt:
3714 		return &bpf_sk_setsockopt_proto;
3715 	case BPF_FUNC_getsockopt:
3716 		return &bpf_sk_getsockopt_proto;
3717 	default:
3718 		return NULL;
3719 	}
3720 }
3721 
3722 static struct bpf_iter_reg unix_reg_info = {
3723 	.target			= "unix",
3724 	.ctx_arg_info_size	= 1,
3725 	.ctx_arg_info		= {
3726 		{ offsetof(struct bpf_iter__unix, unix_sk),
3727 		  PTR_TO_BTF_ID_OR_NULL },
3728 	},
3729 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3730 	.seq_info		= &unix_seq_info,
3731 };
3732 
3733 static void __init bpf_iter_register(void)
3734 {
3735 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3736 	if (bpf_iter_reg_target(&unix_reg_info))
3737 		pr_warn("Warning: could not register bpf iterator unix\n");
3738 }
3739 #endif
3740 
3741 static int __init af_unix_init(void)
3742 {
3743 	int i, rc = -1;
3744 
3745 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3746 
3747 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3748 		spin_lock_init(&bsd_socket_locks[i]);
3749 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3750 	}
3751 
3752 	rc = proto_register(&unix_dgram_proto, 1);
3753 	if (rc != 0) {
3754 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3755 		goto out;
3756 	}
3757 
3758 	rc = proto_register(&unix_stream_proto, 1);
3759 	if (rc != 0) {
3760 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3761 		proto_unregister(&unix_dgram_proto);
3762 		goto out;
3763 	}
3764 
3765 	sock_register(&unix_family_ops);
3766 	register_pernet_subsys(&unix_net_ops);
3767 	unix_bpf_build_proto();
3768 
3769 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3770 	bpf_iter_register();
3771 #endif
3772 
3773 out:
3774 	return rc;
3775 }
3776 
3777 /* Later than subsys_initcall() because we depend on stuff initialised there */
3778 fs_initcall(af_unix_init);
3779