xref: /linux/net/unix/af_unix.c (revision 0d5ec7919f3747193f051036b2301734a4b5e1d6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko Eißfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of socks hashed (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/bpf-cgroup.h>
81 #include <linux/btf_ids.h>
82 #include <linux/dcache.h>
83 #include <linux/errno.h>
84 #include <linux/fcntl.h>
85 #include <linux/file.h>
86 #include <linux/filter.h>
87 #include <linux/fs.h>
88 #include <linux/fs_struct.h>
89 #include <linux/init.h>
90 #include <linux/kernel.h>
91 #include <linux/mount.h>
92 #include <linux/namei.h>
93 #include <linux/net.h>
94 #include <linux/pidfs.h>
95 #include <linux/poll.h>
96 #include <linux/proc_fs.h>
97 #include <linux/sched/signal.h>
98 #include <linux/security.h>
99 #include <linux/seq_file.h>
100 #include <linux/skbuff.h>
101 #include <linux/slab.h>
102 #include <linux/socket.h>
103 #include <linux/splice.h>
104 #include <linux/string.h>
105 #include <linux/uaccess.h>
106 #include <net/af_unix.h>
107 #include <net/net_namespace.h>
108 #include <net/scm.h>
109 #include <net/tcp_states.h>
110 #include <uapi/linux/sockios.h>
111 #include <uapi/linux/termios.h>
112 
113 #include "af_unix.h"
114 
115 static atomic_long_t unix_nr_socks;
116 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
117 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
118 
119 /* SMP locking strategy:
120  *    the hash table is protected with a spinlock.
121  *    each socket's state is protected by a separate spinlock.
122  */
123 #ifdef CONFIG_PROVE_LOCKING
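/* Three-way compare of two pointers: returns -1, 0 or 1 without risking
 * pointer-difference overflow.
 */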
124 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
125 
126 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
127 				  const struct lockdep_map *b)
128 {
129 	return cmp_ptr(a, b);
130 }
131 
132 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
133 				  const struct lockdep_map *_b)
134 {
135 	const struct unix_sock *a, *b;
136 
137 	a = container_of(_a, struct unix_sock, lock.dep_map);
138 	b = container_of(_b, struct unix_sock, lock.dep_map);
139 
140 	if (a->sk.sk_state == TCP_LISTEN) {
141 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
142 		 *
143 		 *   1. a is TCP_LISTEN.
144 		 *   2. b is not a.
145 		 *   3. concurrent connect(b -> a) must fail.
146 		 *
147 		 * Except for 2. & 3., the b's state can be any possible
148 		 * value due to concurrent connect() or listen().
149 		 *
150 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
151 		 * be expressed as lock_cmp_fn.
152 		 */
153 		switch (b->sk.sk_state) {
154 		case TCP_CLOSE:
155 		case TCP_ESTABLISHED:
156 		case TCP_LISTEN:
157 			return -1;
158 		default:
159 			/* Invalid case. */
160 			return 0;
161 		}
162 	}
163 
164 	/* Should never happen.  Just to be symmetric. */
165 	if (b->sk.sk_state == TCP_LISTEN) {
166 		switch (a->sk.sk_state) {
167 		case TCP_CLOSE:
168 		case TCP_ESTABLISHED:
169 			return 1;
170 		default:
171 			return 0;
172 		}
173 	}
174 
175 	/* unix_state_double_lock(): ascending address order. */
176 	return cmp_ptr(a, b);
177 }
178 
179 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
180 				  const struct lockdep_map *_b)
181 {
182 	const struct sock *a, *b;
183 
184 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
185 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
186 
187 	/* unix_collect_skb(): listener -> embryo order. */
188 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
189 		return -1;
190 
191 	/* Should never happen.  Just to be symmetric. */
192 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
193 		return 1;
194 
195 	return 0;
196 }
197 #endif
198 
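/* The socket hash space is split in two halves: unbound and pathname (BSD)
 * sockets hash into [0, UNIX_HASH_MOD], while abstract sockets hash into
 * [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1], so the two namespaces never
 * share a bucket.
 */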
199 static unsigned int unix_unbound_hash(struct sock *sk)
200 {
201 	unsigned long hash = (unsigned long)sk;
202 
203 	hash ^= hash >> 16;
204 	hash ^= hash >> 8;
205 	hash ^= sk->sk_type;
206 
207 	return hash & UNIX_HASH_MOD;
208 }
209 
210 static unsigned int unix_bsd_hash(struct inode *i)
211 {
212 	return i->i_ino & UNIX_HASH_MOD;
213 }
214 
215 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
216 				       int addr_len, int type)
217 {
218 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
219 	unsigned int hash;
220 
221 	hash = (__force unsigned int)csum_fold(csum);
222 	hash ^= hash >> 8;
223 	hash ^= type;
224 
225 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
226 }
227 
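/* Take both hash bucket locks in ascending index order (or a single lock
 * if the hashes collide) so two concurrent rehashes cannot deadlock.
 */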
228 static void unix_table_double_lock(struct net *net,
229 				   unsigned int hash1, unsigned int hash2)
230 {
231 	if (hash1 == hash2) {
232 		spin_lock(&net->unx.table.locks[hash1]);
233 		return;
234 	}
235 
236 	if (hash1 > hash2)
237 		swap(hash1, hash2);
238 
239 	spin_lock(&net->unx.table.locks[hash1]);
240 	spin_lock(&net->unx.table.locks[hash2]);
241 }
242 
243 static void unix_table_double_unlock(struct net *net,
244 				     unsigned int hash1, unsigned int hash2)
245 {
246 	if (hash1 == hash2) {
247 		spin_unlock(&net->unx.table.locks[hash1]);
248 		return;
249 	}
250 
251 	spin_unlock(&net->unx.table.locks[hash1]);
252 	spin_unlock(&net->unx.table.locks[hash2]);
253 }
254 
255 #ifdef CONFIG_SECURITY_NETWORK
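/* unix_get_secdata() stores the sender's security ID from the scm cookie
 * into the skb; unix_set_secdata() copies it back into the receiver's scm
 * cookie on delivery.
 */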
256 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
257 {
258 	UNIXCB(skb).secid = scm->secid;
259 }
260 
261 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
262 {
263 	scm->secid = UNIXCB(skb).secid;
264 }
265 
266 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
267 {
268 	return (scm->secid == UNIXCB(skb).secid);
269 }
270 #else
271 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
272 { }
273 
274 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
275 { }
276 
277 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
278 {
279 	return true;
280 }
281 #endif /* CONFIG_SECURITY_NETWORK */
282 
283 static inline int unix_may_send(struct sock *sk, struct sock *osk)
284 {
285 	return !unix_peer(osk) || unix_peer(osk) == sk;
286 }
287 
288 static inline int unix_recvq_full_lockless(const struct sock *sk)
289 {
290 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
291 }
292 
293 struct sock *unix_peer_get(struct sock *s)
294 {
295 	struct sock *peer;
296 
297 	unix_state_lock(s);
298 	peer = unix_peer(s);
299 	if (peer)
300 		sock_hold(peer);
301 	unix_state_unlock(s);
302 	return peer;
303 }
304 EXPORT_SYMBOL_GPL(unix_peer_get);
305 
306 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
307 					     int addr_len)
308 {
309 	struct unix_address *addr;
310 
311 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
312 	if (!addr)
313 		return NULL;
314 
315 	refcount_set(&addr->refcnt, 1);
316 	addr->len = addr_len;
317 	memcpy(addr->name, sunaddr, addr_len);
318 
319 	return addr;
320 }
321 
322 static inline void unix_release_addr(struct unix_address *addr)
323 {
324 	if (refcount_dec_and_test(&addr->refcnt))
325 		kfree(addr);
326 }
327 
328 /*
329  *	Check unix socket name:
330  *		- should not be zero length.
331  *		- if it does not start with a zero byte, it should be NUL terminated (FS object)
332  *		- if it starts with a zero byte, it is an abstract name.
333  */
334 
335 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
336 {
337 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
338 	    addr_len > sizeof(*sunaddr))
339 		return -EINVAL;
340 
341 	if (sunaddr->sun_family != AF_UNIX)
342 		return -EINVAL;
343 
344 	return 0;
345 }
346 
347 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
348 {
349 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
350 	short offset = offsetof(struct sockaddr_storage, __data);
351 
352 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
353 
354 	/* This may look like an off by one error but it is a bit more
355 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
356 	 * sun_path[108] doesn't as such exist.  However in kernel space
357 	 * we are guaranteed that it is a valid memory location in our
358 	 * kernel address buffer because syscall functions always pass
359 	 * a pointer of struct sockaddr_storage which has a bigger buffer
360 	 * than 108.  Also, we must terminate sun_path for strlen() in
361 	 * getname_kernel().
362 	 */
363 	addr->__data[addr_len - offset] = 0;
364 
365 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
366 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
367 	 * know the actual buffer.
368 	 */
369 	return strlen(addr->__data) + offset + 1;
370 }
371 
372 static void __unix_remove_socket(struct sock *sk)
373 {
374 	sk_del_node_init(sk);
375 }
376 
377 static void __unix_insert_socket(struct net *net, struct sock *sk)
378 {
379 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
380 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
381 }
382 
383 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
384 				 struct unix_address *addr, unsigned int hash)
385 {
386 	__unix_remove_socket(sk);
387 	smp_store_release(&unix_sk(sk)->addr, addr);
388 
389 	sk->sk_hash = hash;
390 	__unix_insert_socket(net, sk);
391 }
392 
393 static void unix_remove_socket(struct net *net, struct sock *sk)
394 {
395 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
396 	__unix_remove_socket(sk);
397 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
398 }
399 
400 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
401 {
402 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
403 	__unix_insert_socket(net, sk);
404 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
405 }
406 
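/* Pathname sockets are additionally linked into bsd_socket_buckets, keyed
 * by the same inode-based hash, so unix_find_socket_byinode() can resolve
 * the socket behind an inode found via path lookup.
 */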
407 static void unix_insert_bsd_socket(struct sock *sk)
408 {
409 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
410 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
411 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
412 }
413 
414 static void unix_remove_bsd_socket(struct sock *sk)
415 {
416 	if (!hlist_unhashed(&sk->sk_bind_node)) {
417 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
418 		__sk_del_bind_node(sk);
419 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
420 
421 		sk_node_init(&sk->sk_bind_node);
422 	}
423 }
424 
425 static struct sock *__unix_find_socket_byname(struct net *net,
426 					      struct sockaddr_un *sunname,
427 					      int len, unsigned int hash)
428 {
429 	struct sock *s;
430 
431 	sk_for_each(s, &net->unx.table.buckets[hash]) {
432 		struct unix_sock *u = unix_sk(s);
433 
434 		if (u->addr->len == len &&
435 		    !memcmp(u->addr->name, sunname, len))
436 			return s;
437 	}
438 	return NULL;
439 }
440 
441 static inline struct sock *unix_find_socket_byname(struct net *net,
442 						   struct sockaddr_un *sunname,
443 						   int len, unsigned int hash)
444 {
445 	struct sock *s;
446 
447 	spin_lock(&net->unx.table.locks[hash]);
448 	s = __unix_find_socket_byname(net, sunname, len, hash);
449 	if (s)
450 		sock_hold(s);
451 	spin_unlock(&net->unx.table.locks[hash]);
452 	return s;
453 }
454 
455 static struct sock *unix_find_socket_byinode(struct inode *i)
456 {
457 	unsigned int hash = unix_bsd_hash(i);
458 	struct sock *s;
459 
460 	spin_lock(&bsd_socket_locks[hash]);
461 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
462 		struct dentry *dentry = unix_sk(s)->path.dentry;
463 
464 		if (dentry && d_backing_inode(dentry) == i) {
465 			sock_hold(s);
466 			spin_unlock(&bsd_socket_locks[hash]);
467 			return s;
468 		}
469 	}
470 	spin_unlock(&bsd_socket_locks[hash]);
471 	return NULL;
472 }
473 
474 /* Support code for asymmetrically connected dgram sockets
475  *
476  * If a datagram socket is connected to a socket not itself connected
477  * to the first socket (eg, /dev/log), clients may only enqueue more
478  * messages if the present receive queue of the server socket is not
479  * "too large". This means there's a second writeability condition
480  * poll and sendmsg need to test. The dgram recv code will do a wake
481  * up on the peer_wait wait queue of a socket upon reception of a
482  * datagram which needs to be propagated to sleeping would-be writers
483  * since these might not have sent anything so far. This can't be
484  * accomplished via poll_wait because the lifetime of the server
485  * socket might be less than that of its clients if these break their
486  * association with it or if the server socket is closed while clients
487  * are still connected to it and there's no way to inform "a polling
488  * implementation" that it should let go of a certain wait queue
489  *
490  * In order to propagate a wake up, a wait_queue_entry_t of the client
491  * socket is enqueued on the peer_wait queue of the server socket
492  * whose wake function does a wake_up on the ordinary client socket
493  * wait queue. This connection is established whenever a write (or
494  * poll for write) hits the flow control condition and is broken when the
495  * association to the server socket is dissolved or after a wake up
496  * was relayed.
497  */
498 
499 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
500 				      void *key)
501 {
502 	struct unix_sock *u;
503 	wait_queue_head_t *u_sleep;
504 
505 	u = container_of(q, struct unix_sock, peer_wake);
506 
507 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
508 			    q);
509 	u->peer_wake.private = NULL;
510 
511 	/* relaying can only happen while the wq still exists */
512 	u_sleep = sk_sleep(&u->sk);
513 	if (u_sleep)
514 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
515 
516 	return 0;
517 }
518 
519 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
520 {
521 	struct unix_sock *u, *u_other;
522 	int rc;
523 
524 	u = unix_sk(sk);
525 	u_other = unix_sk(other);
526 	rc = 0;
527 	spin_lock(&u_other->peer_wait.lock);
528 
529 	if (!u->peer_wake.private) {
530 		u->peer_wake.private = other;
531 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
532 
533 		rc = 1;
534 	}
535 
536 	spin_unlock(&u_other->peer_wait.lock);
537 	return rc;
538 }
539 
540 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
541 					    struct sock *other)
542 {
543 	struct unix_sock *u, *u_other;
544 
545 	u = unix_sk(sk);
546 	u_other = unix_sk(other);
547 	spin_lock(&u_other->peer_wait.lock);
548 
549 	if (u->peer_wake.private == other) {
550 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
551 		u->peer_wake.private = NULL;
552 	}
553 
554 	spin_unlock(&u_other->peer_wait.lock);
555 }
556 
557 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
558 						   struct sock *other)
559 {
560 	unix_dgram_peer_wake_disconnect(sk, other);
561 	wake_up_interruptible_poll(sk_sleep(sk),
562 				   EPOLLOUT |
563 				   EPOLLWRNORM |
564 				   EPOLLWRBAND);
565 }
566 
567 /* preconditions:
568  *	- unix_peer(sk) == other
569  *	- association is stable
570  */
571 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
572 {
573 	int connected;
574 
575 	connected = unix_dgram_peer_wake_connect(sk, other);
576 
577 	/* If other is SOCK_DEAD, we want to make sure we signal
578 	 * POLLOUT, such that a subsequent write() can get a
579 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
580 	 * to other and it's full, we will hang waiting for POLLOUT.
581 	 */
582 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
583 		return 1;
584 
585 	if (connected)
586 		unix_dgram_peer_wake_disconnect(sk, other);
587 
588 	return 0;
589 }
590 
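/* A socket is writable unless it is listening or its in-flight write
 * allocations exceed a quarter of sk_sndbuf (wmem_alloc << 2 > sndbuf).
 */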
591 static int unix_writable(const struct sock *sk, unsigned char state)
592 {
593 	return state != TCP_LISTEN &&
594 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
595 }
596 
597 static void unix_write_space(struct sock *sk)
598 {
599 	struct socket_wq *wq;
600 
601 	rcu_read_lock();
602 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
603 		wq = rcu_dereference(sk->sk_wq);
604 		if (skwq_has_sleeper(wq))
605 			wake_up_interruptible_sync_poll(&wq->wait,
606 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
607 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
608 	}
609 	rcu_read_unlock();
610 }
611 
612 /* When a dgram socket disconnects (or changes its peer), we clear its receive
613  * queue of packets that arrived from the previous peer. First, this allows us to
614  * do flow control based only on wmem_alloc; second, a sk connected to a peer
615  * may receive messages only from that peer. */
616 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
617 {
618 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
619 		skb_queue_purge_reason(&sk->sk_receive_queue,
620 				       SKB_DROP_REASON_UNIX_DISCONNECT);
621 
622 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
623 
624 		/* If one link of a bidirectional dgram pipe is disconnected,
625 		 * we signal an error. Messages are lost. Do not do this
626 		 * when the peer was not connected to us.
627 		 */
628 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
629 			WRITE_ONCE(other->sk_err, ECONNRESET);
630 			sk_error_report(other);
631 		}
632 	}
633 }
634 
635 static void unix_sock_destructor(struct sock *sk)
636 {
637 	struct unix_sock *u = unix_sk(sk);
638 
639 	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
640 
641 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
642 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
643 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
644 	if (!sock_flag(sk, SOCK_DEAD)) {
645 		pr_info("Attempt to release alive unix socket: %p\n", sk);
646 		return;
647 	}
648 
649 	if (u->addr)
650 		unix_release_addr(u->addr);
651 
652 	atomic_long_dec(&unix_nr_socks);
653 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
654 #ifdef UNIX_REFCNT_DEBUG
655 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
656 		atomic_long_read(&unix_nr_socks));
657 #endif
658 }
659 
660 static unsigned int unix_skb_len(const struct sk_buff *skb)
661 {
662 	return skb->len - UNIXCB(skb).consumed;
663 }
664 
665 static void unix_release_sock(struct sock *sk, int embrion)
666 {
667 	struct unix_sock *u = unix_sk(sk);
668 	struct sock *skpair;
669 	struct sk_buff *skb;
670 	struct path path;
671 	int state;
672 
673 	unix_remove_socket(sock_net(sk), sk);
674 	unix_remove_bsd_socket(sk);
675 
676 	/* Clear state */
677 	unix_state_lock(sk);
678 	sock_orphan(sk);
679 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
680 	path	     = u->path;
681 	u->path.dentry = NULL;
682 	u->path.mnt = NULL;
683 	state = sk->sk_state;
684 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
685 
686 	skpair = unix_peer(sk);
687 	unix_peer(sk) = NULL;
688 
689 	unix_state_unlock(sk);
690 
691 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
692 	u->oob_skb = NULL;
693 #endif
694 
695 	wake_up_interruptible_all(&u->peer_wait);
696 
697 	if (skpair != NULL) {
698 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
699 			struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
700 
701 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
702 			if (skb && !unix_skb_len(skb))
703 				skb = skb_peek_next(skb, &sk->sk_receive_queue);
704 #endif
705 			unix_state_lock(skpair);
706 			/* No more writes */
707 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
708 			if (skb || embrion)
709 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
710 			unix_state_unlock(skpair);
711 			skpair->sk_state_change(skpair);
712 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
713 		}
714 
715 		unix_dgram_peer_wake_disconnect(sk, skpair);
716 		sock_put(skpair); /* It may now die */
717 	}
718 
719 	/* Try to flush out this socket. Throw out buffers at least */
720 
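	/* A listener's receive queue holds one skb per embryo socket created
	 * in unix_stream_connect(); release those children too.
	 */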
721 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
722 		if (state == TCP_LISTEN)
723 			unix_release_sock(skb->sk, 1);
724 
725 		/* passed fds are erased in the kfree_skb hook */
726 		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
727 	}
728 
729 	if (path.dentry)
730 		path_put(&path);
731 
732 	sock_put(sk);
733 
734 	/* ---- Socket is dead now and most probably destroyed ---- */
735 
736 	/*
737 	 * Fixme: BSD difference: In BSD all sockets connected to us get
738 	 *	  ECONNRESET and we die on the spot. In Linux we behave
739 	 *	  like files and pipes do and wait for the last
740 	 *	  dereference.
741 	 *
742 	 * Can't we simply set sock->err?
743 	 *
744 	 *	  What the above comment does talk about? --ANK(980817)
745 	 */
746 
747 	if (READ_ONCE(unix_tot_inflight))
748 		unix_gc();		/* Garbage collect fds */
749 }
750 
751 struct unix_peercred {
752 	struct pid *peer_pid;
753 	const struct cred *peer_cred;
754 };
755 
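/* Take references on the caller's tgid and credentials and make sure the
 * pid has a pidfs entry; they will be installed as a socket's peer
 * credentials, as reported by SO_PEERCRED / SO_PEERPIDFD.
 */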
756 static inline int prepare_peercred(struct unix_peercred *peercred)
757 {
758 	struct pid *pid;
759 	int err;
760 
761 	pid = task_tgid(current);
762 	err = pidfs_register_pid(pid);
763 	if (likely(!err)) {
764 		peercred->peer_pid = get_pid(pid);
765 		peercred->peer_cred = get_current_cred();
766 	}
767 	return err;
768 }
769 
770 static void drop_peercred(struct unix_peercred *peercred)
771 {
772 	const struct cred *cred = NULL;
773 	struct pid *pid = NULL;
774 
775 	might_sleep();
776 
777 	swap(peercred->peer_pid, pid);
778 	swap(peercred->peer_cred, cred);
779 
780 	put_pid(pid);
781 	put_cred(cred);
782 }
783 
784 static inline void init_peercred(struct sock *sk,
785 				 const struct unix_peercred *peercred)
786 {
787 	sk->sk_peer_pid = peercred->peer_pid;
788 	sk->sk_peer_cred = peercred->peer_cred;
789 }
790 
791 static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
792 {
793 	const struct cred *old_cred;
794 	struct pid *old_pid;
795 
796 	spin_lock(&sk->sk_peer_lock);
797 	old_pid = sk->sk_peer_pid;
798 	old_cred = sk->sk_peer_cred;
799 	init_peercred(sk, peercred);
800 	spin_unlock(&sk->sk_peer_lock);
801 
802 	peercred->peer_pid = old_pid;
803 	peercred->peer_cred = old_cred;
804 }
805 
806 static void copy_peercred(struct sock *sk, struct sock *peersk)
807 {
808 	lockdep_assert_held(&unix_sk(peersk)->lock);
809 
810 	spin_lock(&sk->sk_peer_lock);
811 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
812 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
813 	spin_unlock(&sk->sk_peer_lock);
814 }
815 
816 static bool unix_may_passcred(const struct sock *sk)
817 {
818 	return sk->sk_scm_credentials || sk->sk_scm_pidfd;
819 }
820 
821 static int unix_listen(struct socket *sock, int backlog)
822 {
823 	int err;
824 	struct sock *sk = sock->sk;
825 	struct unix_sock *u = unix_sk(sk);
826 	struct unix_peercred peercred = {};
827 
828 	err = -EOPNOTSUPP;
829 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
830 		goto out;	/* Only stream/seqpacket sockets accept */
831 	err = -EINVAL;
832 	if (!READ_ONCE(u->addr))
833 		goto out;	/* No listens on an unbound socket */
834 	err = prepare_peercred(&peercred);
835 	if (err)
836 		goto out;
837 	unix_state_lock(sk);
838 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
839 		goto out_unlock;
840 	if (backlog > sk->sk_max_ack_backlog)
841 		wake_up_interruptible_all(&u->peer_wait);
842 	sk->sk_max_ack_backlog	= backlog;
843 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
844 
845 	/* set credentials so connect can copy them */
846 	update_peercred(sk, &peercred);
847 	err = 0;
848 
849 out_unlock:
850 	unix_state_unlock(sk);
851 	drop_peercred(&peercred);
852 out:
853 	return err;
854 }
855 
856 static int unix_release(struct socket *);
857 static int unix_bind(struct socket *, struct sockaddr *, int);
858 static int unix_stream_connect(struct socket *, struct sockaddr *,
859 			       int addr_len, int flags);
860 static int unix_socketpair(struct socket *, struct socket *);
861 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
862 static int unix_getname(struct socket *, struct sockaddr *, int);
863 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
864 static __poll_t unix_dgram_poll(struct file *, struct socket *,
865 				    poll_table *);
866 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
867 #ifdef CONFIG_COMPAT
868 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
869 #endif
870 static int unix_shutdown(struct socket *, int);
871 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
872 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
873 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
874 				       struct pipe_inode_info *, size_t size,
875 				       unsigned int flags);
876 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
877 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
878 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
879 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
880 static int unix_dgram_connect(struct socket *, struct sockaddr *,
881 			      int, int);
882 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
883 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
884 				  int);
885 
886 #ifdef CONFIG_PROC_FS
887 static int unix_count_nr_fds(struct sock *sk)
888 {
889 	struct sk_buff *skb;
890 	struct unix_sock *u;
891 	int nr_fds = 0;
892 
893 	spin_lock(&sk->sk_receive_queue.lock);
894 	skb = skb_peek(&sk->sk_receive_queue);
895 	while (skb) {
896 		u = unix_sk(skb->sk);
897 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
898 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
899 	}
900 	spin_unlock(&sk->sk_receive_queue.lock);
901 
902 	return nr_fds;
903 }
904 
905 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
906 {
907 	struct sock *sk = sock->sk;
908 	unsigned char s_state;
909 	struct unix_sock *u;
910 	int nr_fds = 0;
911 
912 	if (sk) {
913 		s_state = READ_ONCE(sk->sk_state);
914 		u = unix_sk(sk);
915 
916 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
917 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
918 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
919 		 */
920 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
921 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
922 		else if (s_state == TCP_LISTEN)
923 			nr_fds = unix_count_nr_fds(sk);
924 
925 		seq_printf(m, "scm_fds: %u\n", nr_fds);
926 	}
927 }
928 #else
929 #define unix_show_fdinfo NULL
930 #endif
931 
932 static const struct proto_ops unix_stream_ops = {
933 	.family =	PF_UNIX,
934 	.owner =	THIS_MODULE,
935 	.release =	unix_release,
936 	.bind =		unix_bind,
937 	.connect =	unix_stream_connect,
938 	.socketpair =	unix_socketpair,
939 	.accept =	unix_accept,
940 	.getname =	unix_getname,
941 	.poll =		unix_poll,
942 	.ioctl =	unix_ioctl,
943 #ifdef CONFIG_COMPAT
944 	.compat_ioctl =	unix_compat_ioctl,
945 #endif
946 	.listen =	unix_listen,
947 	.shutdown =	unix_shutdown,
948 	.sendmsg =	unix_stream_sendmsg,
949 	.recvmsg =	unix_stream_recvmsg,
950 	.read_skb =	unix_stream_read_skb,
951 	.mmap =		sock_no_mmap,
952 	.splice_read =	unix_stream_splice_read,
953 	.set_peek_off =	sk_set_peek_off,
954 	.show_fdinfo =	unix_show_fdinfo,
955 };
956 
957 static const struct proto_ops unix_dgram_ops = {
958 	.family =	PF_UNIX,
959 	.owner =	THIS_MODULE,
960 	.release =	unix_release,
961 	.bind =		unix_bind,
962 	.connect =	unix_dgram_connect,
963 	.socketpair =	unix_socketpair,
964 	.accept =	sock_no_accept,
965 	.getname =	unix_getname,
966 	.poll =		unix_dgram_poll,
967 	.ioctl =	unix_ioctl,
968 #ifdef CONFIG_COMPAT
969 	.compat_ioctl =	unix_compat_ioctl,
970 #endif
971 	.listen =	sock_no_listen,
972 	.shutdown =	unix_shutdown,
973 	.sendmsg =	unix_dgram_sendmsg,
974 	.read_skb =	unix_read_skb,
975 	.recvmsg =	unix_dgram_recvmsg,
976 	.mmap =		sock_no_mmap,
977 	.set_peek_off =	sk_set_peek_off,
978 	.show_fdinfo =	unix_show_fdinfo,
979 };
980 
981 static const struct proto_ops unix_seqpacket_ops = {
982 	.family =	PF_UNIX,
983 	.owner =	THIS_MODULE,
984 	.release =	unix_release,
985 	.bind =		unix_bind,
986 	.connect =	unix_stream_connect,
987 	.socketpair =	unix_socketpair,
988 	.accept =	unix_accept,
989 	.getname =	unix_getname,
990 	.poll =		unix_dgram_poll,
991 	.ioctl =	unix_ioctl,
992 #ifdef CONFIG_COMPAT
993 	.compat_ioctl =	unix_compat_ioctl,
994 #endif
995 	.listen =	unix_listen,
996 	.shutdown =	unix_shutdown,
997 	.sendmsg =	unix_seqpacket_sendmsg,
998 	.recvmsg =	unix_seqpacket_recvmsg,
999 	.mmap =		sock_no_mmap,
1000 	.set_peek_off =	sk_set_peek_off,
1001 	.show_fdinfo =	unix_show_fdinfo,
1002 };
1003 
1004 static void unix_close(struct sock *sk, long timeout)
1005 {
1006 	/* Nothing to do here, unix socket does not need a ->close().
1007 	 * This is merely for sockmap.
1008 	 */
1009 }
1010 
1011 static bool unix_bpf_bypass_getsockopt(int level, int optname)
1012 {
1013 	if (level == SOL_SOCKET) {
1014 		switch (optname) {
1015 		case SO_PEERPIDFD:
1016 			return true;
1017 		default:
1018 			return false;
1019 		}
1020 	}
1021 
1022 	return false;
1023 }
1024 
1025 struct proto unix_dgram_proto = {
1026 	.name			= "UNIX",
1027 	.owner			= THIS_MODULE,
1028 	.obj_size		= sizeof(struct unix_sock),
1029 	.close			= unix_close,
1030 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1031 #ifdef CONFIG_BPF_SYSCALL
1032 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
1033 #endif
1034 };
1035 
1036 struct proto unix_stream_proto = {
1037 	.name			= "UNIX-STREAM",
1038 	.owner			= THIS_MODULE,
1039 	.obj_size		= sizeof(struct unix_sock),
1040 	.close			= unix_close,
1041 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1042 #ifdef CONFIG_BPF_SYSCALL
1043 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1044 #endif
1045 };
1046 
1047 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1048 {
1049 	struct unix_sock *u;
1050 	struct sock *sk;
1051 	int err;
1052 
1053 	atomic_long_inc(&unix_nr_socks);
1054 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1055 		err = -ENFILE;
1056 		goto err;
1057 	}
1058 
1059 	if (type == SOCK_STREAM)
1060 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1061 	else /*dgram and  seqpacket */
1062 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1063 
1064 	if (!sk) {
1065 		err = -ENOMEM;
1066 		goto err;
1067 	}
1068 
1069 	sock_init_data(sock, sk);
1070 
1071 	sk->sk_scm_rights	= 1;
1072 	sk->sk_hash		= unix_unbound_hash(sk);
1073 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1074 	sk->sk_write_space	= unix_write_space;
1075 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1076 	sk->sk_destruct		= unix_sock_destructor;
1077 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1078 
1079 	u = unix_sk(sk);
1080 	u->listener = NULL;
1081 	u->vertex = NULL;
1082 	u->path.dentry = NULL;
1083 	u->path.mnt = NULL;
1084 	spin_lock_init(&u->lock);
1085 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1086 	mutex_init(&u->iolock); /* single task reading lock */
1087 	mutex_init(&u->bindlock); /* single task binding lock */
1088 	init_waitqueue_head(&u->peer_wait);
1089 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1090 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1091 	unix_insert_unbound_socket(net, sk);
1092 
1093 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1094 
1095 	return sk;
1096 
1097 err:
1098 	atomic_long_dec(&unix_nr_socks);
1099 	return ERR_PTR(err);
1100 }
1101 
1102 static int unix_create(struct net *net, struct socket *sock, int protocol,
1103 		       int kern)
1104 {
1105 	struct sock *sk;
1106 
1107 	if (protocol && protocol != PF_UNIX)
1108 		return -EPROTONOSUPPORT;
1109 
1110 	sock->state = SS_UNCONNECTED;
1111 
1112 	switch (sock->type) {
1113 	case SOCK_STREAM:
1114 		sock->ops = &unix_stream_ops;
1115 		break;
1116 		/*
1117 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1118 		 *	nothing uses it.
1119 		 */
1120 	case SOCK_RAW:
1121 		sock->type = SOCK_DGRAM;
1122 		fallthrough;
1123 	case SOCK_DGRAM:
1124 		sock->ops = &unix_dgram_ops;
1125 		break;
1126 	case SOCK_SEQPACKET:
1127 		sock->ops = &unix_seqpacket_ops;
1128 		break;
1129 	default:
1130 		return -ESOCKTNOSUPPORT;
1131 	}
1132 
1133 	sk = unix_create1(net, sock, kern, sock->type);
1134 	if (IS_ERR(sk))
1135 		return PTR_ERR(sk);
1136 
1137 	return 0;
1138 }
1139 
1140 static int unix_release(struct socket *sock)
1141 {
1142 	struct sock *sk = sock->sk;
1143 
1144 	if (!sk)
1145 		return 0;
1146 
1147 	sk->sk_prot->close(sk, 0);
1148 	unix_release_sock(sk, 0);
1149 	sock->sk = NULL;
1150 
1151 	return 0;
1152 }
1153 
1154 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1155 				  int type, int flags)
1156 {
1157 	struct inode *inode;
1158 	struct path path;
1159 	struct sock *sk;
1160 	int err;
1161 
1162 	unix_mkname_bsd(sunaddr, addr_len);
1163 
1164 	if (flags & SOCK_COREDUMP) {
1165 		const struct cred *cred;
1166 		struct cred *kcred;
1167 		struct path root;
1168 
1169 		kcred = prepare_kernel_cred(&init_task);
1170 		if (!kcred) {
1171 			err = -ENOMEM;
1172 			goto fail;
1173 		}
1174 
1175 		task_lock(&init_task);
1176 		get_fs_root(init_task.fs, &root);
1177 		task_unlock(&init_task);
1178 
1179 		cred = override_creds(kcred);
1180 		err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
1181 				      LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
1182 				      LOOKUP_NO_MAGICLINKS, &path);
1183 		put_cred(revert_creds(cred));
1184 		path_put(&root);
1185 		if (err)
1186 			goto fail;
1187 	} else {
1188 		err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1189 		if (err)
1190 			goto fail;
1191 
1192 		err = path_permission(&path, MAY_WRITE);
1193 		if (err)
1194 			goto path_put;
1195 	}
1196 
1197 	err = -ECONNREFUSED;
1198 	inode = d_backing_inode(path.dentry);
1199 	if (!S_ISSOCK(inode->i_mode))
1200 		goto path_put;
1201 
1202 	sk = unix_find_socket_byinode(inode);
1203 	if (!sk)
1204 		goto path_put;
1205 
1206 	err = -EPROTOTYPE;
1207 	if (sk->sk_type == type)
1208 		touch_atime(&path);
1209 	else
1210 		goto sock_put;
1211 
1212 	path_put(&path);
1213 
1214 	return sk;
1215 
1216 sock_put:
1217 	sock_put(sk);
1218 path_put:
1219 	path_put(&path);
1220 fail:
1221 	return ERR_PTR(err);
1222 }
1223 
1224 static struct sock *unix_find_abstract(struct net *net,
1225 				       struct sockaddr_un *sunaddr,
1226 				       int addr_len, int type)
1227 {
1228 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1229 	struct dentry *dentry;
1230 	struct sock *sk;
1231 
1232 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1233 	if (!sk)
1234 		return ERR_PTR(-ECONNREFUSED);
1235 
1236 	dentry = unix_sk(sk)->path.dentry;
1237 	if (dentry)
1238 		touch_atime(&unix_sk(sk)->path);
1239 
1240 	return sk;
1241 }
1242 
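/* Dispatch on the first byte of sun_path: a non-NUL byte means a
 * filesystem (pathname) socket, a leading NUL means an abstract one.
 */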
1243 static struct sock *unix_find_other(struct net *net,
1244 				    struct sockaddr_un *sunaddr,
1245 				    int addr_len, int type, int flags)
1246 {
1247 	struct sock *sk;
1248 
1249 	if (sunaddr->sun_path[0])
1250 		sk = unix_find_bsd(sunaddr, addr_len, type, flags);
1251 	else
1252 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1253 
1254 	return sk;
1255 }
1256 
1257 static int unix_autobind(struct sock *sk)
1258 {
1259 	struct unix_sock *u = unix_sk(sk);
1260 	unsigned int new_hash, old_hash;
1261 	struct net *net = sock_net(sk);
1262 	struct unix_address *addr;
1263 	u32 lastnum, ordernum;
1264 	int err;
1265 
1266 	err = mutex_lock_interruptible(&u->bindlock);
1267 	if (err)
1268 		return err;
1269 
1270 	if (u->addr)
1271 		goto out;
1272 
1273 	err = -ENOMEM;
1274 	addr = kzalloc(sizeof(*addr) +
1275 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1276 	if (!addr)
1277 		goto out;
1278 
1279 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1280 	addr->name->sun_family = AF_UNIX;
1281 	refcount_set(&addr->refcnt, 1);
1282 
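	/* Autobind names are abstract: a leading NUL followed by five hex
	 * digits.  Walk the 2^20 candidates starting at a random offset and
	 * give up with -ENOSPC if we wrap around without finding a free name.
	 */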
1283 	old_hash = sk->sk_hash;
1284 	ordernum = get_random_u32();
1285 	lastnum = ordernum & 0xFFFFF;
1286 retry:
1287 	ordernum = (ordernum + 1) & 0xFFFFF;
1288 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1289 
1290 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1291 	unix_table_double_lock(net, old_hash, new_hash);
1292 
1293 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1294 		unix_table_double_unlock(net, old_hash, new_hash);
1295 
1296 		/* __unix_find_socket_byname() may take a long time if many names
1297 		 * are already in use.
1298 		 */
1299 		cond_resched();
1300 
1301 		if (ordernum == lastnum) {
1302 			/* Give up if all names seem to be in use. */
1303 			err = -ENOSPC;
1304 			unix_release_addr(addr);
1305 			goto out;
1306 		}
1307 
1308 		goto retry;
1309 	}
1310 
1311 	__unix_set_addr_hash(net, sk, addr, new_hash);
1312 	unix_table_double_unlock(net, old_hash, new_hash);
1313 	err = 0;
1314 
1315 out:	mutex_unlock(&u->bindlock);
1316 	return err;
1317 }
1318 
1319 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1320 			 int addr_len)
1321 {
1322 	umode_t mode = S_IFSOCK |
1323 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1324 	struct unix_sock *u = unix_sk(sk);
1325 	unsigned int new_hash, old_hash;
1326 	struct net *net = sock_net(sk);
1327 	struct mnt_idmap *idmap;
1328 	struct unix_address *addr;
1329 	struct dentry *dentry;
1330 	struct path parent;
1331 	int err;
1332 
1333 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1334 	addr = unix_create_addr(sunaddr, addr_len);
1335 	if (!addr)
1336 		return -ENOMEM;
1337 
1338 	/*
1339 	 * Get the parent directory, calculate the hash for the last
1340 	 * component.
1341 	 */
1342 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1343 	if (IS_ERR(dentry)) {
1344 		err = PTR_ERR(dentry);
1345 		goto out;
1346 	}
1347 
1348 	/*
1349 	 * All right, let's create it.
1350 	 */
1351 	idmap = mnt_idmap(parent.mnt);
1352 	err = security_path_mknod(&parent, dentry, mode, 0);
1353 	if (!err)
1354 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1355 	if (err)
1356 		goto out_path;
1357 	err = mutex_lock_interruptible(&u->bindlock);
1358 	if (err)
1359 		goto out_unlink;
1360 	if (u->addr)
1361 		goto out_unlock;
1362 
1363 	old_hash = sk->sk_hash;
1364 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1365 	unix_table_double_lock(net, old_hash, new_hash);
1366 	u->path.mnt = mntget(parent.mnt);
1367 	u->path.dentry = dget(dentry);
1368 	__unix_set_addr_hash(net, sk, addr, new_hash);
1369 	unix_table_double_unlock(net, old_hash, new_hash);
1370 	unix_insert_bsd_socket(sk);
1371 	mutex_unlock(&u->bindlock);
1372 	done_path_create(&parent, dentry);
1373 	return 0;
1374 
1375 out_unlock:
1376 	mutex_unlock(&u->bindlock);
1377 	err = -EINVAL;
1378 out_unlink:
1379 	/* failed after successful mknod?  unlink what we'd created... */
1380 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1381 out_path:
1382 	done_path_create(&parent, dentry);
1383 out:
1384 	unix_release_addr(addr);
1385 	return err == -EEXIST ? -EADDRINUSE : err;
1386 }
1387 
1388 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1389 			      int addr_len)
1390 {
1391 	struct unix_sock *u = unix_sk(sk);
1392 	unsigned int new_hash, old_hash;
1393 	struct net *net = sock_net(sk);
1394 	struct unix_address *addr;
1395 	int err;
1396 
1397 	addr = unix_create_addr(sunaddr, addr_len);
1398 	if (!addr)
1399 		return -ENOMEM;
1400 
1401 	err = mutex_lock_interruptible(&u->bindlock);
1402 	if (err)
1403 		goto out;
1404 
1405 	if (u->addr) {
1406 		err = -EINVAL;
1407 		goto out_mutex;
1408 	}
1409 
1410 	old_hash = sk->sk_hash;
1411 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1412 	unix_table_double_lock(net, old_hash, new_hash);
1413 
1414 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1415 		goto out_spin;
1416 
1417 	__unix_set_addr_hash(net, sk, addr, new_hash);
1418 	unix_table_double_unlock(net, old_hash, new_hash);
1419 	mutex_unlock(&u->bindlock);
1420 	return 0;
1421 
1422 out_spin:
1423 	unix_table_double_unlock(net, old_hash, new_hash);
1424 	err = -EADDRINUSE;
1425 out_mutex:
1426 	mutex_unlock(&u->bindlock);
1427 out:
1428 	unix_release_addr(addr);
1429 	return err;
1430 }
1431 
1432 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1433 {
1434 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1435 	struct sock *sk = sock->sk;
1436 	int err;
1437 
1438 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1439 	    sunaddr->sun_family == AF_UNIX)
1440 		return unix_autobind(sk);
1441 
1442 	err = unix_validate_addr(sunaddr, addr_len);
1443 	if (err)
1444 		return err;
1445 
1446 	if (sunaddr->sun_path[0])
1447 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1448 	else
1449 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1450 
1451 	return err;
1452 }
1453 
1454 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1455 {
1456 	if (unlikely(sk1 == sk2) || !sk2) {
1457 		unix_state_lock(sk1);
1458 		return;
1459 	}
1460 
1461 	if (sk1 > sk2)
1462 		swap(sk1, sk2);
1463 
1464 	unix_state_lock(sk1);
1465 	unix_state_lock(sk2);
1466 }
1467 
1468 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1469 {
1470 	if (unlikely(sk1 == sk2) || !sk2) {
1471 		unix_state_unlock(sk1);
1472 		return;
1473 	}
1474 	unix_state_unlock(sk1);
1475 	unix_state_unlock(sk2);
1476 }
1477 
1478 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1479 			      int alen, int flags)
1480 {
1481 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1482 	struct sock *sk = sock->sk;
1483 	struct sock *other;
1484 	int err;
1485 
1486 	err = -EINVAL;
1487 	if (alen < offsetofend(struct sockaddr, sa_family))
1488 		goto out;
1489 
1490 	if (addr->sa_family != AF_UNSPEC) {
1491 		err = unix_validate_addr(sunaddr, alen);
1492 		if (err)
1493 			goto out;
1494 
1495 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1496 		if (err)
1497 			goto out;
1498 
1499 		if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) {
1500 			err = unix_autobind(sk);
1501 			if (err)
1502 				goto out;
1503 		}
1504 
1505 restart:
1506 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0);
1507 		if (IS_ERR(other)) {
1508 			err = PTR_ERR(other);
1509 			goto out;
1510 		}
1511 
1512 		unix_state_double_lock(sk, other);
1513 
1514 		/* Apparently VFS overslept socket death. Retry. */
1515 		if (sock_flag(other, SOCK_DEAD)) {
1516 			unix_state_double_unlock(sk, other);
1517 			sock_put(other);
1518 			goto restart;
1519 		}
1520 
1521 		err = -EPERM;
1522 		if (!unix_may_send(sk, other))
1523 			goto out_unlock;
1524 
1525 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1526 		if (err)
1527 			goto out_unlock;
1528 
1529 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1530 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1531 	} else {
1532 		/*
1533 		 *	1003.1g breaking connected state with AF_UNSPEC
1534 		 */
1535 		other = NULL;
1536 		unix_state_double_lock(sk, other);
1537 	}
1538 
1539 	/*
1540 	 * If it was connected, reconnect.
1541 	 */
1542 	if (unix_peer(sk)) {
1543 		struct sock *old_peer = unix_peer(sk);
1544 
1545 		unix_peer(sk) = other;
1546 		if (!other)
1547 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1548 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1549 
1550 		unix_state_double_unlock(sk, other);
1551 
1552 		if (other != old_peer) {
1553 			unix_dgram_disconnected(sk, old_peer);
1554 
1555 			unix_state_lock(old_peer);
1556 			if (!unix_peer(old_peer))
1557 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1558 			unix_state_unlock(old_peer);
1559 		}
1560 
1561 		sock_put(old_peer);
1562 	} else {
1563 		unix_peer(sk) = other;
1564 		unix_state_double_unlock(sk, other);
1565 	}
1566 
1567 	return 0;
1568 
1569 out_unlock:
1570 	unix_state_double_unlock(sk, other);
1571 	sock_put(other);
1572 out:
1573 	return err;
1574 }
1575 
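/* Called with other's unix_state_lock held; drops it before (possibly)
 * sleeping on other's peer_wait queue.
 */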
1576 static long unix_wait_for_peer(struct sock *other, long timeo)
1577 {
1578 	struct unix_sock *u = unix_sk(other);
1579 	int sched;
1580 	DEFINE_WAIT(wait);
1581 
1582 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1583 
1584 	sched = !sock_flag(other, SOCK_DEAD) &&
1585 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1586 		unix_recvq_full_lockless(other);
1587 
1588 	unix_state_unlock(other);
1589 
1590 	if (sched)
1591 		timeo = schedule_timeout(timeo);
1592 
1593 	finish_wait(&u->peer_wait, &wait);
1594 	return timeo;
1595 }
1596 
1597 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1598 			       int addr_len, int flags)
1599 {
1600 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1601 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1602 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1603 	struct unix_peercred peercred = {};
1604 	struct net *net = sock_net(sk);
1605 	struct sk_buff *skb = NULL;
1606 	unsigned char state;
1607 	long timeo;
1608 	int err;
1609 
1610 	err = unix_validate_addr(sunaddr, addr_len);
1611 	if (err)
1612 		goto out;
1613 
1614 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1615 	if (err)
1616 		goto out;
1617 
1618 	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
1619 		err = unix_autobind(sk);
1620 		if (err)
1621 			goto out;
1622 	}
1623 
1624 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1625 
1626 	/* First of all allocate resources.
1627 	 * If we did it after the state was locked,
1628 	 * we would have to recheck everything again in any case.
1629 	 */
1630 
1631 	/* create new sock for complete connection */
1632 	newsk = unix_create1(net, NULL, 0, sock->type);
1633 	if (IS_ERR(newsk)) {
1634 		err = PTR_ERR(newsk);
1635 		goto out;
1636 	}
1637 
1638 	err = prepare_peercred(&peercred);
1639 	if (err)
1640 		goto out_free_sk;
1641 
1642 	/* Allocate skb for sending to listening sock */
1643 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1644 	if (!skb) {
1645 		err = -ENOMEM;
1646 		goto out_free_sk;
1647 	}
1648 
1649 restart:
1650 	/*  Find listening sock. */
1651 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags);
1652 	if (IS_ERR(other)) {
1653 		err = PTR_ERR(other);
1654 		goto out_free_skb;
1655 	}
1656 
1657 	unix_state_lock(other);
1658 
1659 	/* Apparently VFS overslept socket death. Retry. */
1660 	if (sock_flag(other, SOCK_DEAD)) {
1661 		unix_state_unlock(other);
1662 		sock_put(other);
1663 		goto restart;
1664 	}
1665 
1666 	if (other->sk_state != TCP_LISTEN ||
1667 	    other->sk_shutdown & RCV_SHUTDOWN) {
1668 		err = -ECONNREFUSED;
1669 		goto out_unlock;
1670 	}
1671 
1672 	if (unix_recvq_full_lockless(other)) {
1673 		if (!timeo) {
1674 			err = -EAGAIN;
1675 			goto out_unlock;
1676 		}
1677 
1678 		timeo = unix_wait_for_peer(other, timeo);
1679 		sock_put(other);
1680 
1681 		err = sock_intr_errno(timeo);
1682 		if (signal_pending(current))
1683 			goto out_free_skb;
1684 
1685 		goto restart;
1686 	}
1687 
1688 	/* self connect and simultaneous connect are eliminated
1689 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1690 	 */
1691 	state = READ_ONCE(sk->sk_state);
1692 	if (unlikely(state != TCP_CLOSE)) {
1693 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1694 		goto out_unlock;
1695 	}
1696 
1697 	unix_state_lock(sk);
1698 
1699 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1700 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1701 		unix_state_unlock(sk);
1702 		goto out_unlock;
1703 	}
1704 
1705 	err = security_unix_stream_connect(sk, other, newsk);
1706 	if (err) {
1707 		unix_state_unlock(sk);
1708 		goto out_unlock;
1709 	}
1710 
1711 	/* The way is open! Fastly set all the necessary fields... */
1712 
1713 	sock_hold(sk);
1714 	unix_peer(newsk) = sk;
1715 	newsk->sk_state = TCP_ESTABLISHED;
1716 	newsk->sk_type = sk->sk_type;
1717 	newsk->sk_scm_recv_flags = other->sk_scm_recv_flags;
1718 	init_peercred(newsk, &peercred);
1719 
1720 	newu = unix_sk(newsk);
1721 	newu->listener = other;
1722 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1723 	otheru = unix_sk(other);
1724 
1725 	/* copy address information from listening to new sock
1726 	 *
1727 	 * The contents of *(otheru->addr) and otheru->path
1728 	 * are seen fully set up here, since we have found
1729 	 * otheru in hash under its lock.  Insertion into the
1730 	 * hash chain we'd found it in had been done in an
1731 	 * earlier critical area protected by the chain's lock,
1732 	 * the same one where we'd set *(otheru->addr) contents,
1733 	 * as well as otheru->path and otheru->addr itself.
1734 	 *
1735 	 * Using smp_store_release() here to set newu->addr
1736 	 * is enough to make those stores, as well as stores
1737 	 * to newu->path visible to anyone who gets newu->addr
1738 	 * by smp_load_acquire().  IOW, the same guarantees
1739 	 * as for unix_sock instances bound in unix_bind() or
1740 	 * in unix_autobind().
1741 	 */
1742 	if (otheru->path.dentry) {
1743 		path_get(&otheru->path);
1744 		newu->path = otheru->path;
1745 	}
1746 	refcount_inc(&otheru->addr->refcnt);
1747 	smp_store_release(&newu->addr, otheru->addr);
1748 
1749 	/* Set credentials */
1750 	copy_peercred(sk, other);
1751 
1752 	sock->state	= SS_CONNECTED;
1753 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1754 	sock_hold(newsk);
1755 
1756 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1757 	unix_peer(sk)	= newsk;
1758 
1759 	unix_state_unlock(sk);
1760 
1761 	/* take ten and send info to listening sock */
1762 	spin_lock(&other->sk_receive_queue.lock);
1763 	__skb_queue_tail(&other->sk_receive_queue, skb);
1764 	spin_unlock(&other->sk_receive_queue.lock);
1765 	unix_state_unlock(other);
1766 	other->sk_data_ready(other);
1767 	sock_put(other);
1768 	return 0;
1769 
1770 out_unlock:
1771 	unix_state_unlock(other);
1772 	sock_put(other);
1773 out_free_skb:
1774 	consume_skb(skb);
1775 out_free_sk:
1776 	unix_release_sock(newsk, 0);
1777 out:
1778 	drop_peercred(&peercred);
1779 	return err;
1780 }
1781 
1782 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1783 {
1784 	struct unix_peercred ska_peercred = {}, skb_peercred = {};
1785 	struct sock *ska = socka->sk, *skb = sockb->sk;
1786 	int err;
1787 
1788 	err = prepare_peercred(&ska_peercred);
1789 	if (err)
1790 		return err;
1791 
1792 	err = prepare_peercred(&skb_peercred);
1793 	if (err) {
1794 		drop_peercred(&ska_peercred);
1795 		return err;
1796 	}
1797 
1798 	/* Join our sockets back to back */
1799 	sock_hold(ska);
1800 	sock_hold(skb);
1801 	unix_peer(ska) = skb;
1802 	unix_peer(skb) = ska;
1803 	init_peercred(ska, &ska_peercred);
1804 	init_peercred(skb, &skb_peercred);
1805 
1806 	ska->sk_state = TCP_ESTABLISHED;
1807 	skb->sk_state = TCP_ESTABLISHED;
1808 	socka->state  = SS_CONNECTED;
1809 	sockb->state  = SS_CONNECTED;
1810 	return 0;
1811 }
1812 
1813 static int unix_accept(struct socket *sock, struct socket *newsock,
1814 		       struct proto_accept_arg *arg)
1815 {
1816 	struct sock *sk = sock->sk;
1817 	struct sk_buff *skb;
1818 	struct sock *tsk;
1819 
1820 	arg->err = -EOPNOTSUPP;
1821 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1822 		goto out;
1823 
1824 	arg->err = -EINVAL;
1825 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1826 		goto out;
1827 
1828 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1829 	 * so that no locks are necessary.
1830 	 */
1831 
1832 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1833 				&arg->err);
1834 	if (!skb) {
1835 		/* This means receive shutdown. */
1836 		if (arg->err == 0)
1837 			arg->err = -EINVAL;
1838 		goto out;
1839 	}
1840 
1841 	tsk = skb->sk;
1842 	skb_free_datagram(sk, skb);
1843 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1844 
1845 	/* attach accepted sock to socket */
1846 	unix_state_lock(tsk);
1847 	unix_update_edges(unix_sk(tsk));
1848 	newsock->state = SS_CONNECTED;
1849 	sock_graft(tsk, newsock);
1850 	unix_state_unlock(tsk);
1851 	return 0;
1852 
1853 out:
1854 	return arg->err;
1855 }
1856 
1857 
1858 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1859 {
1860 	struct sock *sk = sock->sk;
1861 	struct unix_address *addr;
1862 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1863 	int err = 0;
1864 
1865 	if (peer) {
1866 		sk = unix_peer_get(sk);
1867 
1868 		err = -ENOTCONN;
1869 		if (!sk)
1870 			goto out;
1871 		err = 0;
1872 	} else {
1873 		sock_hold(sk);
1874 	}
1875 
1876 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1877 	if (!addr) {
1878 		sunaddr->sun_family = AF_UNIX;
1879 		sunaddr->sun_path[0] = 0;
1880 		err = offsetof(struct sockaddr_un, sun_path);
1881 	} else {
1882 		err = addr->len;
1883 		memcpy(sunaddr, addr->name, addr->len);
1884 
1885 		if (peer)
1886 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1887 					       CGROUP_UNIX_GETPEERNAME);
1888 		else
1889 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1890 					       CGROUP_UNIX_GETSOCKNAME);
1891 	}
1892 	sock_put(sk);
1893 out:
1894 	return err;
1895 }
1896 
1897 /* The "user->unix_inflight" variable is protected by the garbage
1898  * collection lock, and we just read it locklessly here. If you go
1899  * over the limit, there might be a tiny race in actually noticing
1900  * it across threads. Tough.
1901  */
1902 static inline bool too_many_unix_fds(struct task_struct *p)
1903 {
1904 	struct user_struct *user = current_user();
1905 
1906 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1907 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1908 	return false;
1909 }
1910 
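/* Move the passed file references from the sender's scm cookie into
 * the skb's control block so they travel with the data, and let
 * unix_prepare_fpl() set them up for the fd-passing garbage collector.
 */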
1911 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1912 {
1913 	if (too_many_unix_fds(current))
1914 		return -ETOOMANYREFS;
1915 
1916 	UNIXCB(skb).fp = scm->fp;
1917 	scm->fp = NULL;
1918 
1919 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1920 		return -ENOMEM;
1921 
1922 	return 0;
1923 }
1924 
1925 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1926 {
1927 	scm->fp = UNIXCB(skb).fp;
1928 	UNIXCB(skb).fp = NULL;
1929 
1930 	unix_destroy_fpl(scm->fp);
1931 }
1932 
1933 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1934 {
1935 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1936 }
1937 
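/* skb destructor: drop the credentials and any still-attached fds
 * carried in the control block, then release the write-memory charge.
 */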
1938 static void unix_destruct_scm(struct sk_buff *skb)
1939 {
1940 	struct scm_cookie scm;
1941 
1942 	memset(&scm, 0, sizeof(scm));
1943 	scm.pid = UNIXCB(skb).pid;
1944 	if (UNIXCB(skb).fp)
1945 		unix_detach_fds(&scm, skb);
1946 
1947 	/* Alas, it calls VFS */
1948 	/* So fscking what? fput() has been SMP-safe since last summer */
1949 	scm_destroy(&scm);
1950 	sock_wfree(skb);
1951 }
1952 
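/* Stash the sender's credentials, security data and (optionally) the
 * passed fds in the skb's control block, and set the destructor that
 * releases them when the skb is freed.
 */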
1953 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1954 {
1955 	int err = 0;
1956 
1957 	UNIXCB(skb).pid = get_pid(scm->pid);
1958 	UNIXCB(skb).uid = scm->creds.uid;
1959 	UNIXCB(skb).gid = scm->creds.gid;
1960 	UNIXCB(skb).fp = NULL;
1961 	unix_get_secdata(scm, skb);
1962 	if (scm->fp && send_fds)
1963 		err = unix_attach_fds(scm, skb);
1964 
1965 	skb->destructor = unix_destruct_scm;
1966 	return err;
1967 }
1968 
1969 static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm)
1970 {
1971 	scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1972 	unix_set_secdata(scm, skb);
1973 }
1974 
1975 /**
1976  * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed.
1977  * @skb: skb to attach creds to.
1978  * @sk: Sender sock.
1979  * @other: Receiver sock.
1980  *
1981  * Some apps rely on write() giving SCM_CREDENTIALS.
1982  * We include credentials if the source or destination socket asserted
1983  * SOCK_PASSCRED, or if the destination sock is not yet attached to a socket.
1984  *
1985  * Context: May sleep.
1986  * Return: On success zero, on error a negative error code is returned.
1987  */
1988 static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk,
1989 				const struct sock *other)
1990 {
1991 	if (UNIXCB(skb).pid)
1992 		return 0;
1993 
1994 	if (unix_may_passcred(sk) || unix_may_passcred(other) ||
1995 	    !other->sk_socket) {
1996 		struct pid *pid;
1997 		int err;
1998 
1999 		pid = task_tgid(current);
2000 		err = pidfs_register_pid(pid);
2001 		if (unlikely(err))
2002 			return err;
2003 
2004 		UNIXCB(skb).pid = get_pid(pid);
2005 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
2006 	}
2007 
2008 	return 0;
2009 }
2010 
2011 static bool unix_skb_scm_eq(struct sk_buff *skb,
2012 			    struct scm_cookie *scm)
2013 {
2014 	return UNIXCB(skb).pid == scm->pid &&
2015 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
2016 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
2017 	       unix_secdata_eq(scm, skb);
2018 }
2019 
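/* Track how many passed fds are queued on the receiver and maintain
 * the garbage-collector graph edges for skbs carrying SCM_RIGHTS.
 */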
2020 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
2021 {
2022 	struct scm_fp_list *fp = UNIXCB(skb).fp;
2023 	struct unix_sock *u = unix_sk(sk);
2024 
2025 	if (unlikely(fp && fp->count)) {
2026 		atomic_add(fp->count, &u->scm_stat.nr_fds);
2027 		unix_add_edges(fp, u);
2028 	}
2029 }
2030 
2031 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
2032 {
2033 	struct scm_fp_list *fp = UNIXCB(skb).fp;
2034 	struct unix_sock *u = unix_sk(sk);
2035 
2036 	if (unlikely(fp && fp->count)) {
2037 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
2038 		unix_del_edges(fp);
2039 	}
2040 }
2041 
2042 /*
2043  *	Send AF_UNIX data.
2044  */
2045 
2046 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
2047 			      size_t len)
2048 {
2049 	struct sock *sk = sock->sk, *other = NULL;
2050 	struct unix_sock *u = unix_sk(sk);
2051 	struct scm_cookie scm;
2052 	struct sk_buff *skb;
2053 	int data_len = 0;
2054 	int sk_locked;
2055 	long timeo;
2056 	int err;
2057 
2058 	err = scm_send(sock, msg, &scm, false);
2059 	if (err < 0)
2060 		return err;
2061 
2062 	wait_for_unix_gc(scm.fp);
2063 
2064 	if (msg->msg_flags & MSG_OOB) {
2065 		err = -EOPNOTSUPP;
2066 		goto out;
2067 	}
2068 
2069 	if (msg->msg_namelen) {
2070 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
2071 		if (err)
2072 			goto out;
2073 
2074 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
2075 							    msg->msg_name,
2076 							    &msg->msg_namelen,
2077 							    NULL);
2078 		if (err)
2079 			goto out;
2080 	}
2081 
2082 	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
2083 		err = unix_autobind(sk);
2084 		if (err)
2085 			goto out;
2086 	}
2087 
2088 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
2089 		err = -EMSGSIZE;
2090 		goto out;
2091 	}
2092 
2093 	if (len > SKB_MAX_ALLOC) {
2094 		data_len = min_t(size_t,
2095 				 len - SKB_MAX_ALLOC,
2096 				 MAX_SKB_FRAGS * PAGE_SIZE);
2097 		data_len = PAGE_ALIGN(data_len);
2098 
2099 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2100 	}
2101 
2102 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2103 				   msg->msg_flags & MSG_DONTWAIT, &err,
2104 				   PAGE_ALLOC_COSTLY_ORDER);
2105 	if (!skb)
2106 		goto out;
2107 
2108 	err = unix_scm_to_skb(&scm, skb, true);
2109 	if (err < 0)
2110 		goto out_free;
2111 
2112 	skb_put(skb, len - data_len);
2113 	skb->data_len = data_len;
2114 	skb->len = len;
2115 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2116 	if (err)
2117 		goto out_free;
2118 
2119 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2120 
2121 	if (msg->msg_namelen) {
2122 lookup:
2123 		other = unix_find_other(sock_net(sk), msg->msg_name,
2124 					msg->msg_namelen, sk->sk_type, 0);
2125 		if (IS_ERR(other)) {
2126 			err = PTR_ERR(other);
2127 			goto out_free;
2128 		}
2129 	} else {
2130 		other = unix_peer_get(sk);
2131 		if (!other) {
2132 			err = -ENOTCONN;
2133 			goto out_free;
2134 		}
2135 	}
2136 
2137 	if (sk_filter(other, skb) < 0) {
2138 		/* Toss the packet but do not return any error to the sender */
2139 		err = len;
2140 		goto out_sock_put;
2141 	}
2142 
2143 	err = unix_maybe_add_creds(skb, sk, other);
2144 	if (err)
2145 		goto out_sock_put;
2146 
2147 restart:
2148 	sk_locked = 0;
2149 	unix_state_lock(other);
2150 restart_locked:
2151 
2152 	if (!unix_may_send(sk, other)) {
2153 		err = -EPERM;
2154 		goto out_unlock;
2155 	}
2156 
2157 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2158 		/* Check with 1003.1g - what should a datagram error return? */
2159 
2160 		unix_state_unlock(other);
2161 
2162 		if (sk->sk_type == SOCK_SEQPACKET) {
2163 			/* We get here only when racing with unix_release_sock(),
2164 			 * which is clearing @other. Unlike SOCK_DGRAM, never
2165 			 * change the state to TCP_CLOSE.
2166 			 */
2167 			err = -EPIPE;
2168 			goto out_sock_put;
2169 		}
2170 
2171 		if (!sk_locked)
2172 			unix_state_lock(sk);
2173 
2174 		if (unix_peer(sk) == other) {
2175 			unix_peer(sk) = NULL;
2176 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2177 
2178 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2179 			unix_state_unlock(sk);
2180 
2181 			unix_dgram_disconnected(sk, other);
2182 			sock_put(other);
2183 			err = -ECONNREFUSED;
2184 			goto out_sock_put;
2185 		}
2186 
2187 		unix_state_unlock(sk);
2188 
2189 		if (!msg->msg_namelen) {
2190 			err = -ECONNRESET;
2191 			goto out_sock_put;
2192 		}
2193 
2194 		sock_put(other);
2195 		goto lookup;
2196 	}
2197 
2198 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2199 		err = -EPIPE;
2200 		goto out_unlock;
2201 	}
2202 
2203 	if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2204 		err = -EPERM;
2205 		goto out_unlock;
2206 	}
2207 
2208 	if (sk->sk_type != SOCK_SEQPACKET) {
2209 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2210 		if (err)
2211 			goto out_unlock;
2212 	}
2213 
2214 	/* other == sk && unix_peer(other) != sk if
2215 	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2216 	 * - unix_peer(sk) == sk at the time of the get but disconnected before the lock
2217 	 */
2218 	if (other != sk &&
2219 	    unlikely(unix_peer(other) != sk &&
2220 	    unix_recvq_full_lockless(other))) {
2221 		if (timeo) {
2222 			timeo = unix_wait_for_peer(other, timeo);
2223 
2224 			err = sock_intr_errno(timeo);
2225 			if (signal_pending(current))
2226 				goto out_sock_put;
2227 
2228 			goto restart;
2229 		}
2230 
2231 		if (!sk_locked) {
2232 			unix_state_unlock(other);
2233 			unix_state_double_lock(sk, other);
2234 		}
2235 
2236 		if (unix_peer(sk) != other ||
2237 		    unix_dgram_peer_wake_me(sk, other)) {
2238 			err = -EAGAIN;
2239 			sk_locked = 1;
2240 			goto out_unlock;
2241 		}
2242 
2243 		if (!sk_locked) {
2244 			sk_locked = 1;
2245 			goto restart_locked;
2246 		}
2247 	}
2248 
2249 	if (unlikely(sk_locked))
2250 		unix_state_unlock(sk);
2251 
2252 	if (sock_flag(other, SOCK_RCVTSTAMP))
2253 		__net_timestamp(skb);
2254 
2255 	scm_stat_add(other, skb);
2256 	skb_queue_tail(&other->sk_receive_queue, skb);
2257 	unix_state_unlock(other);
2258 	other->sk_data_ready(other);
2259 	sock_put(other);
2260 	scm_destroy(&scm);
2261 	return len;
2262 
2263 out_unlock:
2264 	if (sk_locked)
2265 		unix_state_unlock(sk);
2266 	unix_state_unlock(other);
2267 out_sock_put:
2268 	sock_put(other);
2269 out_free:
2270 	consume_skb(skb);
2271 out:
2272 	scm_destroy(&scm);
2273 	return err;
2274 }
2275 
2276 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2277  * bytes, with a minimum of a full page.
2278  */
2279 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2280 
2281 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
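/* Queue a single out-of-band byte on @other, record it as the
 * receiver's oob_skb and signal SIGURG.
 */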
2282 static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other,
2283 		     struct scm_cookie *scm, bool fds_sent)
2284 {
2285 	struct unix_sock *ousk = unix_sk(other);
2286 	struct sk_buff *skb;
2287 	int err;
2288 
2289 	skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2290 
2291 	if (!skb)
2292 		return err;
2293 
2294 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2295 	if (err < 0)
2296 		goto out;
2297 
2298 	err = unix_maybe_add_creds(skb, sk, other);
2299 	if (err)
2300 		goto out;
2301 
2302 	skb_put(skb, 1);
2303 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2304 
2305 	if (err)
2306 		goto out;
2307 
2308 	unix_state_lock(other);
2309 
2310 	if (sock_flag(other, SOCK_DEAD) ||
2311 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2312 		err = -EPIPE;
2313 		goto out_unlock;
2314 	}
2315 
2316 	if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2317 		err = -EPERM;
2318 		goto out_unlock;
2319 	}
2320 
2321 	scm_stat_add(other, skb);
2322 
2323 	spin_lock(&other->sk_receive_queue.lock);
2324 	WRITE_ONCE(ousk->oob_skb, skb);
2325 	__skb_queue_tail(&other->sk_receive_queue, skb);
2326 	spin_unlock(&other->sk_receive_queue.lock);
2327 
2328 	sk_send_sigurg(other);
2329 	unix_state_unlock(other);
2330 	other->sk_data_ready(other);
2331 
2332 	return 0;
2333 out_unlock:
2334 	unix_state_unlock(other);
2335 out:
2336 	consume_skb(skb);
2337 	return err;
2338 }
2339 #endif
2340 
2341 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2342 			       size_t len)
2343 {
2344 	struct sock *sk = sock->sk;
2345 	struct sk_buff *skb = NULL;
2346 	struct sock *other = NULL;
2347 	struct scm_cookie scm;
2348 	bool fds_sent = false;
2349 	int err, sent = 0;
2350 
2351 	err = scm_send(sock, msg, &scm, false);
2352 	if (err < 0)
2353 		return err;
2354 
2355 	wait_for_unix_gc(scm.fp);
2356 
2357 	if (msg->msg_flags & MSG_OOB) {
2358 		err = -EOPNOTSUPP;
2359 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2360 		if (len)
2361 			len--;
2362 		else
2363 #endif
2364 			goto out_err;
2365 	}
2366 
2367 	if (msg->msg_namelen) {
2368 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2369 		goto out_err;
2370 	} else {
2371 		other = unix_peer(sk);
2372 		if (!other) {
2373 			err = -ENOTCONN;
2374 			goto out_err;
2375 		}
2376 	}
2377 
2378 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2379 		goto out_pipe;
2380 
2381 	while (sent < len) {
2382 		int size = len - sent;
2383 		int data_len;
2384 
2385 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2386 			skb = sock_alloc_send_pskb(sk, 0, 0,
2387 						   msg->msg_flags & MSG_DONTWAIT,
2388 						   &err, 0);
2389 		} else {
2390 			/* Keep two messages in the pipe so it schedules better */
2391 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2392 
2393 			/* allow fallback to order-0 allocations */
2394 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2395 
2396 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2397 
2398 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2399 
2400 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2401 						   msg->msg_flags & MSG_DONTWAIT, &err,
2402 						   get_order(UNIX_SKB_FRAGS_SZ));
2403 		}
2404 		if (!skb)
2405 			goto out_err;
2406 
2407 		/* Only send the fds in the first buffer */
2408 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2409 		if (err < 0)
2410 			goto out_free;
2411 
2412 		fds_sent = true;
2413 
2414 		err = unix_maybe_add_creds(skb, sk, other);
2415 		if (err)
2416 			goto out_free;
2417 
2418 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2419 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2420 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2421 						   sk->sk_allocation);
2422 			if (err < 0)
2423 				goto out_free;
2424 
2425 			size = err;
2426 			refcount_add(size, &sk->sk_wmem_alloc);
2427 		} else {
2428 			skb_put(skb, size - data_len);
2429 			skb->data_len = data_len;
2430 			skb->len = size;
2431 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2432 			if (err)
2433 				goto out_free;
2434 		}
2435 
2436 		unix_state_lock(other);
2437 
2438 		if (sock_flag(other, SOCK_DEAD) ||
2439 		    (other->sk_shutdown & RCV_SHUTDOWN))
2440 			goto out_pipe_unlock;
2441 
2442 		if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2443 			unix_state_unlock(other);
2444 			err = -EPERM;
2445 			goto out_free;
2446 		}
2447 
2448 		scm_stat_add(other, skb);
2449 		skb_queue_tail(&other->sk_receive_queue, skb);
2450 		unix_state_unlock(other);
2451 		other->sk_data_ready(other);
2452 		sent += size;
2453 	}
2454 
2455 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2456 	if (msg->msg_flags & MSG_OOB) {
2457 		err = queue_oob(sk, msg, other, &scm, fds_sent);
2458 		if (err)
2459 			goto out_err;
2460 		sent++;
2461 	}
2462 #endif
2463 
2464 	scm_destroy(&scm);
2465 
2466 	return sent;
2467 
2468 out_pipe_unlock:
2469 	unix_state_unlock(other);
2470 out_pipe:
2471 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2472 		send_sig(SIGPIPE, current, 0);
2473 	err = -EPIPE;
2474 out_free:
2475 	consume_skb(skb);
2476 out_err:
2477 	scm_destroy(&scm);
2478 	return sent ? : err;
2479 }
2480 
2481 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2482 				  size_t len)
2483 {
2484 	int err;
2485 	struct sock *sk = sock->sk;
2486 
2487 	err = sock_error(sk);
2488 	if (err)
2489 		return err;
2490 
2491 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2492 		return -ENOTCONN;
2493 
2494 	if (msg->msg_namelen)
2495 		msg->msg_namelen = 0;
2496 
2497 	return unix_dgram_sendmsg(sock, msg, len);
2498 }
2499 
2500 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2501 				  size_t size, int flags)
2502 {
2503 	struct sock *sk = sock->sk;
2504 
2505 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2506 		return -ENOTCONN;
2507 
2508 	return unix_dgram_recvmsg(sock, msg, size, flags);
2509 }
2510 
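/* Copy the sender's bound address (if any) into msg_name for recvmsg(). */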
2511 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2512 {
2513 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2514 
2515 	if (addr) {
2516 		msg->msg_namelen = addr->len;
2517 		memcpy(msg->msg_name, addr->name, addr->len);
2518 	}
2519 }
2520 
2521 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2522 			 int flags)
2523 {
2524 	struct scm_cookie scm;
2525 	struct socket *sock = sk->sk_socket;
2526 	struct unix_sock *u = unix_sk(sk);
2527 	struct sk_buff *skb, *last;
2528 	long timeo;
2529 	int skip;
2530 	int err;
2531 
2532 	err = -EOPNOTSUPP;
2533 	if (flags&MSG_OOB)
2534 		goto out;
2535 
2536 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2537 
2538 	do {
2539 		mutex_lock(&u->iolock);
2540 
2541 		skip = sk_peek_offset(sk, flags);
2542 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2543 					      &skip, &err, &last);
2544 		if (skb) {
2545 			if (!(flags & MSG_PEEK))
2546 				scm_stat_del(sk, skb);
2547 			break;
2548 		}
2549 
2550 		mutex_unlock(&u->iolock);
2551 
2552 		if (err != -EAGAIN)
2553 			break;
2554 	} while (timeo &&
2555 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2556 					      &err, &timeo, last));
2557 
2558 	if (!skb) { /* implies iolock unlocked */
2559 		unix_state_lock(sk);
2560 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2561 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2562 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2563 			err = 0;
2564 		unix_state_unlock(sk);
2565 		goto out;
2566 	}
2567 
2568 	if (wq_has_sleeper(&u->peer_wait))
2569 		wake_up_interruptible_sync_poll(&u->peer_wait,
2570 						EPOLLOUT | EPOLLWRNORM |
2571 						EPOLLWRBAND);
2572 
2573 	if (msg->msg_name) {
2574 		unix_copy_addr(msg, skb->sk);
2575 
2576 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2577 						      msg->msg_name,
2578 						      &msg->msg_namelen);
2579 	}
2580 
2581 	if (size > skb->len - skip)
2582 		size = skb->len - skip;
2583 	else if (size < skb->len - skip)
2584 		msg->msg_flags |= MSG_TRUNC;
2585 
2586 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2587 	if (err)
2588 		goto out_free;
2589 
2590 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2591 		__sock_recv_timestamp(msg, sk, skb);
2592 
2593 	memset(&scm, 0, sizeof(scm));
2594 
2595 	unix_skb_to_scm(skb, &scm);
2596 
2597 	if (!(flags & MSG_PEEK)) {
2598 		if (UNIXCB(skb).fp)
2599 			unix_detach_fds(&scm, skb);
2600 
2601 		sk_peek_offset_bwd(sk, skb->len);
2602 	} else {
2603 		/* It is questionable: on PEEK we could:
2604 		   - not return fds - good, but too simple 8)
2605 		   - return fds, and not return them on read (old strategy,
2606 		     apparently wrong)
2607 		   - clone fds (I chose it for now, it is the most universal
2608 		     solution)
2609 
2610 		   POSIX 1003.1g does not actually define this clearly
2611 		   at all. POSIX 1003.1g doesn't define a lot of things
2612 		   clearly, however!
2613 
2614 		*/
2615 
2616 		sk_peek_offset_fwd(sk, size);
2617 
2618 		if (UNIXCB(skb).fp)
2619 			unix_peek_fds(&scm, skb);
2620 	}
2621 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2622 
2623 	scm_recv_unix(sock, msg, &scm, flags);
2624 
2625 out_free:
2626 	skb_free_datagram(sk, skb);
2627 	mutex_unlock(&u->iolock);
2628 out:
2629 	return err;
2630 }
2631 
2632 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2633 			      int flags)
2634 {
2635 	struct sock *sk = sock->sk;
2636 
2637 #ifdef CONFIG_BPF_SYSCALL
2638 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2639 
2640 	if (prot != &unix_dgram_proto)
2641 		return prot->recvmsg(sk, msg, size, flags, NULL);
2642 #endif
2643 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2644 }
2645 
2646 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2647 {
2648 	struct unix_sock *u = unix_sk(sk);
2649 	struct sk_buff *skb;
2650 	int err;
2651 
2652 	mutex_lock(&u->iolock);
2653 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2654 	mutex_unlock(&u->iolock);
2655 	if (!skb)
2656 		return err;
2657 
2658 	return recv_actor(sk, skb);
2659 }
2660 
2661 /*
2662  *	Sleep until more data has arrived. But check for races.
2663  */
2664 static long unix_stream_data_wait(struct sock *sk, long timeo,
2665 				  struct sk_buff *last, unsigned int last_len,
2666 				  bool freezable)
2667 {
2668 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2669 	struct sk_buff *tail;
2670 	DEFINE_WAIT(wait);
2671 
2672 	unix_state_lock(sk);
2673 
2674 	for (;;) {
2675 		prepare_to_wait(sk_sleep(sk), &wait, state);
2676 
2677 		tail = skb_peek_tail(&sk->sk_receive_queue);
2678 		if (tail != last ||
2679 		    (tail && tail->len != last_len) ||
2680 		    sk->sk_err ||
2681 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2682 		    signal_pending(current) ||
2683 		    !timeo)
2684 			break;
2685 
2686 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2687 		unix_state_unlock(sk);
2688 		timeo = schedule_timeout(timeo);
2689 		unix_state_lock(sk);
2690 
2691 		if (sock_flag(sk, SOCK_DEAD))
2692 			break;
2693 
2694 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2695 	}
2696 
2697 	finish_wait(sk_sleep(sk), &wait);
2698 	unix_state_unlock(sk);
2699 	return timeo;
2700 }
2701 
2702 struct unix_stream_read_state {
2703 	int (*recv_actor)(struct sk_buff *, int, int,
2704 			  struct unix_stream_read_state *);
2705 	struct socket *socket;
2706 	struct msghdr *msg;
2707 	struct pipe_inode_info *pipe;
2708 	size_t size;
2709 	int flags;
2710 	unsigned int splice_flags;
2711 };
2712 
2713 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2714 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2715 {
2716 	struct sk_buff *oob_skb, *read_skb = NULL;
2717 	struct socket *sock = state->socket;
2718 	struct sock *sk = sock->sk;
2719 	struct unix_sock *u = unix_sk(sk);
2720 	int chunk = 1;
2721 
2722 	mutex_lock(&u->iolock);
2723 	unix_state_lock(sk);
2724 	spin_lock(&sk->sk_receive_queue.lock);
2725 
2726 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2727 		spin_unlock(&sk->sk_receive_queue.lock);
2728 		unix_state_unlock(sk);
2729 		mutex_unlock(&u->iolock);
2730 		return -EINVAL;
2731 	}
2732 
2733 	oob_skb = u->oob_skb;
2734 
2735 	if (!(state->flags & MSG_PEEK)) {
2736 		WRITE_ONCE(u->oob_skb, NULL);
2737 
2738 		if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
2739 		    !unix_skb_len(oob_skb->prev)) {
2740 			read_skb = oob_skb->prev;
2741 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2742 		}
2743 	}
2744 
2745 	spin_unlock(&sk->sk_receive_queue.lock);
2746 	unix_state_unlock(sk);
2747 
2748 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2749 
2750 	if (!(state->flags & MSG_PEEK))
2751 		UNIXCB(oob_skb).consumed += 1;
2752 
2753 	mutex_unlock(&u->iolock);
2754 
2755 	consume_skb(read_skb);
2756 
2757 	if (chunk < 0)
2758 		return -EFAULT;
2759 
2760 	state->msg->msg_flags |= MSG_OOB;
2761 	return 1;
2762 }
2763 
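/* Decide how a stream read should treat the OOB byte: depending on
 * MSG_PEEK, SOCK_URGINLINE and whether data has already been copied,
 * it is skipped (and possibly dropped) or the read stops short of it.
 * Returns the skb to continue reading from, or NULL to stop.
 */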
2764 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2765 				  int flags, int copied)
2766 {
2767 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2768 	struct unix_sock *u = unix_sk(sk);
2769 
2770 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2771 		return skb;
2772 
2773 	spin_lock(&sk->sk_receive_queue.lock);
2774 
2775 	if (!unix_skb_len(skb)) {
2776 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2777 			skb = NULL;
2778 		} else if (flags & MSG_PEEK) {
2779 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2780 		} else {
2781 			read_skb = skb;
2782 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2783 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2784 		}
2785 
2786 		if (!skb)
2787 			goto unlock;
2788 	}
2789 
2790 	if (skb != u->oob_skb)
2791 		goto unlock;
2792 
2793 	if (copied) {
2794 		skb = NULL;
2795 	} else if (!(flags & MSG_PEEK)) {
2796 		WRITE_ONCE(u->oob_skb, NULL);
2797 
2798 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2799 			__skb_unlink(skb, &sk->sk_receive_queue);
2800 			unread_skb = skb;
2801 			skb = skb_peek(&sk->sk_receive_queue);
2802 		}
2803 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2804 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2805 	}
2806 
2807 unlock:
2808 	spin_unlock(&sk->sk_receive_queue.lock);
2809 
2810 	consume_skb(read_skb);
2811 	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2812 
2813 	return skb;
2814 }
2815 #endif
2816 
2817 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2818 {
2819 	struct unix_sock *u = unix_sk(sk);
2820 	struct sk_buff *skb;
2821 	int err;
2822 
2823 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2824 		return -ENOTCONN;
2825 
2826 	mutex_lock(&u->iolock);
2827 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2828 	mutex_unlock(&u->iolock);
2829 	if (!skb)
2830 		return err;
2831 
2832 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2833 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2834 		bool drop = false;
2835 
2836 		unix_state_lock(sk);
2837 
2838 		if (sock_flag(sk, SOCK_DEAD)) {
2839 			unix_state_unlock(sk);
2840 			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2841 			return -ECONNRESET;
2842 		}
2843 
2844 		spin_lock(&sk->sk_receive_queue.lock);
2845 		if (likely(skb == u->oob_skb)) {
2846 			WRITE_ONCE(u->oob_skb, NULL);
2847 			drop = true;
2848 		}
2849 		spin_unlock(&sk->sk_receive_queue.lock);
2850 
2851 		unix_state_unlock(sk);
2852 
2853 		if (drop) {
2854 			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2855 			return -EAGAIN;
2856 		}
2857 	}
2858 #endif
2859 
2860 	return recv_actor(sk, skb);
2861 }
2862 
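/* Common worker for stream recvmsg() and splice_read(): walk the
 * receive queue under u->iolock, feed data to state->recv_actor and
 * stop early rather than merge skbs carrying different credentials
 * or fds into one message.
 */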
2863 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2864 				    bool freezable)
2865 {
2866 	struct scm_cookie scm;
2867 	struct socket *sock = state->socket;
2868 	struct sock *sk = sock->sk;
2869 	struct unix_sock *u = unix_sk(sk);
2870 	int copied = 0;
2871 	int flags = state->flags;
2872 	int noblock = flags & MSG_DONTWAIT;
2873 	bool check_creds = false;
2874 	int target;
2875 	int err = 0;
2876 	long timeo;
2877 	int skip;
2878 	size_t size = state->size;
2879 	unsigned int last_len;
2880 
2881 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2882 		err = -EINVAL;
2883 		goto out;
2884 	}
2885 
2886 	if (unlikely(flags & MSG_OOB)) {
2887 		err = -EOPNOTSUPP;
2888 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2889 		err = unix_stream_recv_urg(state);
2890 #endif
2891 		goto out;
2892 	}
2893 
2894 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2895 	timeo = sock_rcvtimeo(sk, noblock);
2896 
2897 	memset(&scm, 0, sizeof(scm));
2898 
2899 	/* Lock the socket to prevent queue disordering
2900 	 * while we sleep copying data to the msg.
2901 	 */
2902 	mutex_lock(&u->iolock);
2903 
2904 	skip = max(sk_peek_offset(sk, flags), 0);
2905 
2906 	do {
2907 		struct sk_buff *skb, *last;
2908 		int chunk;
2909 
2910 redo:
2911 		unix_state_lock(sk);
2912 		if (sock_flag(sk, SOCK_DEAD)) {
2913 			err = -ECONNRESET;
2914 			goto unlock;
2915 		}
2916 		last = skb = skb_peek(&sk->sk_receive_queue);
2917 		last_len = last ? last->len : 0;
2918 
2919 again:
2920 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2921 		if (skb) {
2922 			skb = manage_oob(skb, sk, flags, copied);
2923 			if (!skb && copied) {
2924 				unix_state_unlock(sk);
2925 				break;
2926 			}
2927 		}
2928 #endif
2929 		if (skb == NULL) {
2930 			if (copied >= target)
2931 				goto unlock;
2932 
2933 			/*
2934 			 *	POSIX 1003.1g mandates this order.
2935 			 */
2936 
2937 			err = sock_error(sk);
2938 			if (err)
2939 				goto unlock;
2940 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2941 				goto unlock;
2942 
2943 			unix_state_unlock(sk);
2944 			if (!timeo) {
2945 				err = -EAGAIN;
2946 				break;
2947 			}
2948 
2949 			mutex_unlock(&u->iolock);
2950 
2951 			timeo = unix_stream_data_wait(sk, timeo, last,
2952 						      last_len, freezable);
2953 
2954 			if (signal_pending(current)) {
2955 				err = sock_intr_errno(timeo);
2956 				scm_destroy(&scm);
2957 				goto out;
2958 			}
2959 
2960 			mutex_lock(&u->iolock);
2961 			goto redo;
2962 unlock:
2963 			unix_state_unlock(sk);
2964 			break;
2965 		}
2966 
2967 		while (skip >= unix_skb_len(skb)) {
2968 			skip -= unix_skb_len(skb);
2969 			last = skb;
2970 			last_len = skb->len;
2971 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2972 			if (!skb)
2973 				goto again;
2974 		}
2975 
2976 		unix_state_unlock(sk);
2977 
2978 		if (check_creds) {
2979 			/* Never glue messages from different writers */
2980 			if (!unix_skb_scm_eq(skb, &scm))
2981 				break;
2982 		} else if (unix_may_passcred(sk)) {
2983 			/* Copy credentials */
2984 			unix_skb_to_scm(skb, &scm);
2985 			check_creds = true;
2986 		}
2987 
2988 		/* Copy address just once */
2989 		if (state->msg && state->msg->msg_name) {
2990 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2991 					 state->msg->msg_name);
2992 			unix_copy_addr(state->msg, skb->sk);
2993 
2994 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2995 							      state->msg->msg_name,
2996 							      &state->msg->msg_namelen);
2997 
2998 			sunaddr = NULL;
2999 		}
3000 
3001 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
3002 		chunk = state->recv_actor(skb, skip, chunk, state);
3003 		if (chunk < 0) {
3004 			if (copied == 0)
3005 				copied = -EFAULT;
3006 			break;
3007 		}
3008 		copied += chunk;
3009 		size -= chunk;
3010 
3011 		/* Mark read part of skb as used */
3012 		if (!(flags & MSG_PEEK)) {
3013 			UNIXCB(skb).consumed += chunk;
3014 
3015 			sk_peek_offset_bwd(sk, chunk);
3016 
3017 			if (UNIXCB(skb).fp) {
3018 				scm_stat_del(sk, skb);
3019 				unix_detach_fds(&scm, skb);
3020 			}
3021 
3022 			if (unix_skb_len(skb))
3023 				break;
3024 
3025 			skb_unlink(skb, &sk->sk_receive_queue);
3026 			consume_skb(skb);
3027 
3028 			if (scm.fp)
3029 				break;
3030 		} else {
3031 			/* It is questionable, see note in unix_dgram_recvmsg.
3032 			 */
3033 			if (UNIXCB(skb).fp)
3034 				unix_peek_fds(&scm, skb);
3035 
3036 			sk_peek_offset_fwd(sk, chunk);
3037 
3038 			if (UNIXCB(skb).fp)
3039 				break;
3040 
3041 			skip = 0;
3042 			last = skb;
3043 			last_len = skb->len;
3044 			unix_state_lock(sk);
3045 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
3046 			if (skb)
3047 				goto again;
3048 			unix_state_unlock(sk);
3049 			break;
3050 		}
3051 	} while (size);
3052 
3053 	mutex_unlock(&u->iolock);
3054 	if (state->msg)
3055 		scm_recv_unix(sock, state->msg, &scm, flags);
3056 	else
3057 		scm_destroy(&scm);
3058 out:
3059 	return copied ? : err;
3060 }
3061 
3062 static int unix_stream_read_actor(struct sk_buff *skb,
3063 				  int skip, int chunk,
3064 				  struct unix_stream_read_state *state)
3065 {
3066 	int ret;
3067 
3068 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
3069 				    state->msg, chunk);
3070 	return ret ?: chunk;
3071 }
3072 
3073 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
3074 			  size_t size, int flags)
3075 {
3076 	struct unix_stream_read_state state = {
3077 		.recv_actor = unix_stream_read_actor,
3078 		.socket = sk->sk_socket,
3079 		.msg = msg,
3080 		.size = size,
3081 		.flags = flags
3082 	};
3083 
3084 	return unix_stream_read_generic(&state, true);
3085 }
3086 
3087 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
3088 			       size_t size, int flags)
3089 {
3090 	struct unix_stream_read_state state = {
3091 		.recv_actor = unix_stream_read_actor,
3092 		.socket = sock,
3093 		.msg = msg,
3094 		.size = size,
3095 		.flags = flags
3096 	};
3097 
3098 #ifdef CONFIG_BPF_SYSCALL
3099 	struct sock *sk = sock->sk;
3100 	const struct proto *prot = READ_ONCE(sk->sk_prot);
3101 
3102 	if (prot != &unix_stream_proto)
3103 		return prot->recvmsg(sk, msg, size, flags, NULL);
3104 #endif
3105 	return unix_stream_read_generic(&state, true);
3106 }
3107 
3108 static int unix_stream_splice_actor(struct sk_buff *skb,
3109 				    int skip, int chunk,
3110 				    struct unix_stream_read_state *state)
3111 {
3112 	return skb_splice_bits(skb, state->socket->sk,
3113 			       UNIXCB(skb).consumed + skip,
3114 			       state->pipe, chunk, state->splice_flags);
3115 }
3116 
3117 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3118 				       struct pipe_inode_info *pipe,
3119 				       size_t size, unsigned int flags)
3120 {
3121 	struct unix_stream_read_state state = {
3122 		.recv_actor = unix_stream_splice_actor,
3123 		.socket = sock,
3124 		.pipe = pipe,
3125 		.size = size,
3126 		.splice_flags = flags,
3127 	};
3128 
3129 	if (unlikely(*ppos))
3130 		return -ESPIPE;
3131 
3132 	if (sock->file->f_flags & O_NONBLOCK ||
3133 	    flags & SPLICE_F_NONBLOCK)
3134 		state.flags = MSG_DONTWAIT;
3135 
3136 	return unix_stream_read_generic(&state, false);
3137 }
3138 
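/* Record the shutdown on our own socket and, for connection-oriented
 * types, mirror it onto the peer (with RCV/SEND swapped) and wake it
 * so blocked readers and writers notice.
 */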
3139 static int unix_shutdown(struct socket *sock, int mode)
3140 {
3141 	struct sock *sk = sock->sk;
3142 	struct sock *other;
3143 
3144 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3145 		return -EINVAL;
3146 	/* This maps:
3147 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3148 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3149 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3150 	 */
3151 	++mode;
3152 
3153 	unix_state_lock(sk);
3154 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3155 	other = unix_peer(sk);
3156 	if (other)
3157 		sock_hold(other);
3158 	unix_state_unlock(sk);
3159 	sk->sk_state_change(sk);
3160 
3161 	if (other &&
3162 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3163 
3164 		int peer_mode = 0;
3165 		const struct proto *prot = READ_ONCE(other->sk_prot);
3166 
3167 		if (prot->unhash)
3168 			prot->unhash(other);
3169 		if (mode&RCV_SHUTDOWN)
3170 			peer_mode |= SEND_SHUTDOWN;
3171 		if (mode&SEND_SHUTDOWN)
3172 			peer_mode |= RCV_SHUTDOWN;
3173 		unix_state_lock(other);
3174 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3175 		unix_state_unlock(other);
3176 		other->sk_state_change(other);
3177 		if (peer_mode == SHUTDOWN_MASK)
3178 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3179 		else if (peer_mode & RCV_SHUTDOWN)
3180 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3181 	}
3182 	if (other)
3183 		sock_put(other);
3184 
3185 	return 0;
3186 }
3187 
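/* SIOCINQ helper: bytes available for reading.  Stream and seqpacket
 * sockets report the sum of all queued data, datagram sockets only
 * the size of the next message.
 */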
3188 long unix_inq_len(struct sock *sk)
3189 {
3190 	struct sk_buff *skb;
3191 	long amount = 0;
3192 
3193 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3194 		return -EINVAL;
3195 
3196 	spin_lock(&sk->sk_receive_queue.lock);
3197 	if (sk->sk_type == SOCK_STREAM ||
3198 	    sk->sk_type == SOCK_SEQPACKET) {
3199 		skb_queue_walk(&sk->sk_receive_queue, skb)
3200 			amount += unix_skb_len(skb);
3201 	} else {
3202 		skb = skb_peek(&sk->sk_receive_queue);
3203 		if (skb)
3204 			amount = skb->len;
3205 	}
3206 	spin_unlock(&sk->sk_receive_queue.lock);
3207 
3208 	return amount;
3209 }
3210 EXPORT_SYMBOL_GPL(unix_inq_len);
3211 
3212 long unix_outq_len(struct sock *sk)
3213 {
3214 	return sk_wmem_alloc_get(sk);
3215 }
3216 EXPORT_SYMBOL_GPL(unix_outq_len);
3217 
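/* SIOCUNIXFILE: open the filesystem object this socket is bound to as
 * an O_PATH file descriptor and return it to the caller.
 */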
3218 static int unix_open_file(struct sock *sk)
3219 {
3220 	struct file *f;
3221 	int fd;
3222 
3223 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3224 		return -EPERM;
3225 
3226 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3227 		return -ENOENT;
3228 
3229 	if (!unix_sk(sk)->path.dentry)
3230 		return -ENOENT;
3231 
3232 	fd = get_unused_fd_flags(O_CLOEXEC);
3233 	if (fd < 0)
3234 		return fd;
3235 
3236 	f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred());
3237 	if (IS_ERR(f)) {
3238 		put_unused_fd(fd);
3239 		return PTR_ERR(f);
3240 	}
3241 
3242 	fd_install(fd, f);
3243 	return fd;
3244 }
3245 
3246 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3247 {
3248 	struct sock *sk = sock->sk;
3249 	long amount = 0;
3250 	int err;
3251 
3252 	switch (cmd) {
3253 	case SIOCOUTQ:
3254 		amount = unix_outq_len(sk);
3255 		err = put_user(amount, (int __user *)arg);
3256 		break;
3257 	case SIOCINQ:
3258 		amount = unix_inq_len(sk);
3259 		if (amount < 0)
3260 			err = amount;
3261 		else
3262 			err = put_user(amount, (int __user *)arg);
3263 		break;
3264 	case SIOCUNIXFILE:
3265 		err = unix_open_file(sk);
3266 		break;
3267 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3268 	case SIOCATMARK:
3269 		{
3270 			struct unix_sock *u = unix_sk(sk);
3271 			struct sk_buff *skb;
3272 			int answ = 0;
3273 
3274 			mutex_lock(&u->iolock);
3275 
3276 			skb = skb_peek(&sk->sk_receive_queue);
3277 			if (skb) {
3278 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3279 				struct sk_buff *next_skb;
3280 
3281 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3282 
3283 				if (skb == oob_skb ||
3284 				    (!unix_skb_len(skb) &&
3285 				     (!oob_skb || next_skb == oob_skb)))
3286 					answ = 1;
3287 			}
3288 
3289 			mutex_unlock(&u->iolock);
3290 
3291 			err = put_user(answ, (int __user *)arg);
3292 		}
3293 		break;
3294 #endif
3295 	default:
3296 		err = -ENOIOCTLCMD;
3297 		break;
3298 	}
3299 	return err;
3300 }
3301 
3302 #ifdef CONFIG_COMPAT
3303 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3304 {
3305 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3306 }
3307 #endif
3308 
3309 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3310 {
3311 	struct sock *sk = sock->sk;
3312 	unsigned char state;
3313 	__poll_t mask;
3314 	u8 shutdown;
3315 
3316 	sock_poll_wait(file, sock, wait);
3317 	mask = 0;
3318 	shutdown = READ_ONCE(sk->sk_shutdown);
3319 	state = READ_ONCE(sk->sk_state);
3320 
3321 	/* exceptional events? */
3322 	if (READ_ONCE(sk->sk_err))
3323 		mask |= EPOLLERR;
3324 	if (shutdown == SHUTDOWN_MASK)
3325 		mask |= EPOLLHUP;
3326 	if (shutdown & RCV_SHUTDOWN)
3327 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3328 
3329 	/* readable? */
3330 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3331 		mask |= EPOLLIN | EPOLLRDNORM;
3332 	if (sk_is_readable(sk))
3333 		mask |= EPOLLIN | EPOLLRDNORM;
3334 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3335 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3336 		mask |= EPOLLPRI;
3337 #endif
3338 
3339 	/* Connection-based need to check for termination and startup */
3340 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3341 	    state == TCP_CLOSE)
3342 		mask |= EPOLLHUP;
3343 
3344 	/*
3345 	 * We set writable also when the other side has shut down the
3346 	 * connection. This prevents stuck sockets.
3347 	 */
3348 	if (unix_writable(sk, state))
3349 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3350 
3351 	return mask;
3352 }
3353 
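/* Like unix_poll(), but also reports the error queue and only claims
 * writability while a connected peer has receive-queue space,
 * registering on the peer's wake-up list otherwise.
 */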
3354 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3355 				    poll_table *wait)
3356 {
3357 	struct sock *sk = sock->sk, *other;
3358 	unsigned int writable;
3359 	unsigned char state;
3360 	__poll_t mask;
3361 	u8 shutdown;
3362 
3363 	sock_poll_wait(file, sock, wait);
3364 	mask = 0;
3365 	shutdown = READ_ONCE(sk->sk_shutdown);
3366 	state = READ_ONCE(sk->sk_state);
3367 
3368 	/* exceptional events? */
3369 	if (READ_ONCE(sk->sk_err) ||
3370 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3371 		mask |= EPOLLERR |
3372 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3373 
3374 	if (shutdown & RCV_SHUTDOWN)
3375 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3376 	if (shutdown == SHUTDOWN_MASK)
3377 		mask |= EPOLLHUP;
3378 
3379 	/* readable? */
3380 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3381 		mask |= EPOLLIN | EPOLLRDNORM;
3382 	if (sk_is_readable(sk))
3383 		mask |= EPOLLIN | EPOLLRDNORM;
3384 
3385 	/* Connection-based need to check for termination and startup */
3386 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3387 		mask |= EPOLLHUP;
3388 
3389 	/* No write status requested, avoid expensive OUT tests. */
3390 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3391 		return mask;
3392 
3393 	writable = unix_writable(sk, state);
3394 	if (writable) {
3395 		unix_state_lock(sk);
3396 
3397 		other = unix_peer(sk);
3398 		if (other && unix_peer(other) != sk &&
3399 		    unix_recvq_full_lockless(other) &&
3400 		    unix_dgram_peer_wake_me(sk, other))
3401 			writable = 0;
3402 
3403 		unix_state_unlock(sk);
3404 	}
3405 
3406 	if (writable)
3407 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3408 	else
3409 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3410 
3411 	return mask;
3412 }
3413 
3414 #ifdef CONFIG_PROC_FS
3415 
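/* The seq_file position encodes the hash bucket in its upper bits and
 * the offset within that bucket in the lower bits, so iteration can
 * resume in the right bucket after the lock is dropped between reads.
 */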
3416 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3417 
3418 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3419 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3420 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3421 
3422 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3423 {
3424 	unsigned long offset = get_offset(*pos);
3425 	unsigned long bucket = get_bucket(*pos);
3426 	unsigned long count = 0;
3427 	struct sock *sk;
3428 
3429 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3430 	     sk; sk = sk_next(sk)) {
3431 		if (++count == offset)
3432 			break;
3433 	}
3434 
3435 	return sk;
3436 }
3437 
3438 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3439 {
3440 	unsigned long bucket = get_bucket(*pos);
3441 	struct net *net = seq_file_net(seq);
3442 	struct sock *sk;
3443 
3444 	while (bucket < UNIX_HASH_SIZE) {
3445 		spin_lock(&net->unx.table.locks[bucket]);
3446 
3447 		sk = unix_from_bucket(seq, pos);
3448 		if (sk)
3449 			return sk;
3450 
3451 		spin_unlock(&net->unx.table.locks[bucket]);
3452 
3453 		*pos = set_bucket_offset(++bucket, 1);
3454 	}
3455 
3456 	return NULL;
3457 }
3458 
3459 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3460 				  loff_t *pos)
3461 {
3462 	unsigned long bucket = get_bucket(*pos);
3463 
3464 	sk = sk_next(sk);
3465 	if (sk)
3466 		return sk;
3467 
3468 
3469 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3470 
3471 	*pos = set_bucket_offset(++bucket, 1);
3472 
3473 	return unix_get_first(seq, pos);
3474 }
3475 
3476 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3477 {
3478 	if (!*pos)
3479 		return SEQ_START_TOKEN;
3480 
3481 	return unix_get_first(seq, pos);
3482 }
3483 
3484 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3485 {
3486 	++*pos;
3487 
3488 	if (v == SEQ_START_TOKEN)
3489 		return unix_get_first(seq, pos);
3490 
3491 	return unix_get_next(seq, v, pos);
3492 }
3493 
3494 static void unix_seq_stop(struct seq_file *seq, void *v)
3495 {
3496 	struct sock *sk = v;
3497 
3498 	if (sk)
3499 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3500 }
3501 
3502 static int unix_seq_show(struct seq_file *seq, void *v)
3503 {
3504 
3505 	if (v == SEQ_START_TOKEN)
3506 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3507 			 "Inode Path\n");
3508 	else {
3509 		struct sock *s = v;
3510 		struct unix_sock *u = unix_sk(s);
3511 		unix_state_lock(s);
3512 
3513 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3514 			s,
3515 			refcount_read(&s->sk_refcnt),
3516 			0,
3517 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3518 			s->sk_type,
3519 			s->sk_socket ?
3520 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3521 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3522 			sock_i_ino(s));
3523 
3524 		if (u->addr) {	/* under a hash table lock here */
3525 			int i, len;
3526 			seq_putc(seq, ' ');
3527 
3528 			i = 0;
3529 			len = u->addr->len -
3530 				offsetof(struct sockaddr_un, sun_path);
3531 			if (u->addr->name->sun_path[0]) {
3532 				len--;
3533 			} else {
3534 				seq_putc(seq, '@');
3535 				i++;
3536 			}
3537 			for ( ; i < len; i++)
3538 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3539 					 '@');
3540 		}
3541 		unix_state_unlock(s);
3542 		seq_putc(seq, '\n');
3543 	}
3544 
3545 	return 0;
3546 }
3547 
3548 static const struct seq_operations unix_seq_ops = {
3549 	.start  = unix_seq_start,
3550 	.next   = unix_seq_next,
3551 	.stop   = unix_seq_stop,
3552 	.show   = unix_seq_show,
3553 };
3554 
3555 #ifdef CONFIG_BPF_SYSCALL
3556 struct bpf_unix_iter_state {
3557 	struct seq_net_private p;
3558 	unsigned int cur_sk;
3559 	unsigned int end_sk;
3560 	unsigned int max_sk;
3561 	struct sock **batch;
3562 	bool st_bucket_done;
3563 };
3564 
3565 struct bpf_iter__unix {
3566 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3567 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3568 	uid_t uid __aligned(8);
3569 };
3570 
3571 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3572 			      struct unix_sock *unix_sk, uid_t uid)
3573 {
3574 	struct bpf_iter__unix ctx;
3575 
3576 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3577 	ctx.meta = meta;
3578 	ctx.unix_sk = unix_sk;
3579 	ctx.uid = uid;
3580 	return bpf_iter_run_prog(prog, &ctx);
3581 }
3582 
3583 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3584 
3585 {
3586 	struct bpf_unix_iter_state *iter = seq->private;
3587 	unsigned int expected = 1;
3588 	struct sock *sk;
3589 
3590 	sock_hold(start_sk);
3591 	iter->batch[iter->end_sk++] = start_sk;
3592 
3593 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3594 		if (iter->end_sk < iter->max_sk) {
3595 			sock_hold(sk);
3596 			iter->batch[iter->end_sk++] = sk;
3597 		}
3598 
3599 		expected++;
3600 	}
3601 
3602 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3603 
3604 	return expected;
3605 }
3606 
3607 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3608 {
3609 	while (iter->cur_sk < iter->end_sk)
3610 		sock_put(iter->batch[iter->cur_sk++]);
3611 }
3612 
3613 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3614 				       unsigned int new_batch_sz)
3615 {
3616 	struct sock **new_batch;
3617 
3618 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3619 			     GFP_USER | __GFP_NOWARN);
3620 	if (!new_batch)
3621 		return -ENOMEM;
3622 
3623 	bpf_iter_unix_put_batch(iter);
3624 	kvfree(iter->batch);
3625 	iter->batch = new_batch;
3626 	iter->max_sk = new_batch_sz;
3627 
3628 	return 0;
3629 }
3630 
3631 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3632 					loff_t *pos)
3633 {
3634 	struct bpf_unix_iter_state *iter = seq->private;
3635 	unsigned int expected;
3636 	bool resized = false;
3637 	struct sock *sk;
3638 
3639 	if (iter->st_bucket_done)
3640 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3641 
3642 again:
3643 	/* Get a new batch */
3644 	iter->cur_sk = 0;
3645 	iter->end_sk = 0;
3646 
3647 	sk = unix_get_first(seq, pos);
3648 	if (!sk)
3649 		return NULL; /* Done */
3650 
3651 	expected = bpf_iter_unix_hold_batch(seq, sk);
3652 
3653 	if (iter->end_sk == expected) {
3654 		iter->st_bucket_done = true;
3655 		return sk;
3656 	}
3657 
3658 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3659 		resized = true;
3660 		goto again;
3661 	}
3662 
3663 	return sk;
3664 }
3665 
3666 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3667 {
3668 	if (!*pos)
3669 		return SEQ_START_TOKEN;
3670 
3671 	/* bpf iter does not support lseek, so it always
3672 	 * continues from where it was stop()-ped.
3673 	 */
3674 	return bpf_iter_unix_batch(seq, pos);
3675 }
3676 
3677 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3678 {
3679 	struct bpf_unix_iter_state *iter = seq->private;
3680 	struct sock *sk;
3681 
3682 	/* Whenever seq_next() is called, the sock at iter->cur_sk is
3683 	 * done with seq_show(), so advance to the next sk in
3684 	 * the batch.
3685 	 */
3686 	if (iter->cur_sk < iter->end_sk)
3687 		sock_put(iter->batch[iter->cur_sk++]);
3688 
3689 	++*pos;
3690 
3691 	if (iter->cur_sk < iter->end_sk)
3692 		sk = iter->batch[iter->cur_sk];
3693 	else
3694 		sk = bpf_iter_unix_batch(seq, pos);
3695 
3696 	return sk;
3697 }
3698 
3699 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3700 {
3701 	struct bpf_iter_meta meta;
3702 	struct bpf_prog *prog;
3703 	struct sock *sk = v;
3704 	uid_t uid;
3705 	bool slow;
3706 	int ret;
3707 
3708 	if (v == SEQ_START_TOKEN)
3709 		return 0;
3710 
3711 	slow = lock_sock_fast(sk);
3712 
3713 	if (unlikely(sk_unhashed(sk))) {
3714 		ret = SEQ_SKIP;
3715 		goto unlock;
3716 	}
3717 
3718 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3719 	meta.seq = seq;
3720 	prog = bpf_iter_get_info(&meta, false);
3721 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3722 unlock:
3723 	unlock_sock_fast(sk, slow);
3724 	return ret;
3725 }
3726 
3727 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3728 {
3729 	struct bpf_unix_iter_state *iter = seq->private;
3730 	struct bpf_iter_meta meta;
3731 	struct bpf_prog *prog;
3732 
3733 	if (!v) {
3734 		meta.seq = seq;
3735 		prog = bpf_iter_get_info(&meta, true);
3736 		if (prog)
3737 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3738 	}
3739 
3740 	if (iter->cur_sk < iter->end_sk)
3741 		bpf_iter_unix_put_batch(iter);
3742 }
3743 
3744 static const struct seq_operations bpf_iter_unix_seq_ops = {
3745 	.start	= bpf_iter_unix_seq_start,
3746 	.next	= bpf_iter_unix_seq_next,
3747 	.stop	= bpf_iter_unix_seq_stop,
3748 	.show	= bpf_iter_unix_seq_show,
3749 };
3750 #endif
3751 #endif
3752 
3753 static const struct net_proto_family unix_family_ops = {
3754 	.family = PF_UNIX,
3755 	.create = unix_create,
3756 	.owner	= THIS_MODULE,
3757 };
3758 
3759 
3760 static int __net_init unix_net_init(struct net *net)
3761 {
3762 	int i;
3763 
3764 	net->unx.sysctl_max_dgram_qlen = 10;
3765 	if (unix_sysctl_register(net))
3766 		goto out;
3767 
3768 #ifdef CONFIG_PROC_FS
3769 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3770 			     sizeof(struct seq_net_private)))
3771 		goto err_sysctl;
3772 #endif
3773 
3774 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3775 					      sizeof(spinlock_t), GFP_KERNEL);
3776 	if (!net->unx.table.locks)
3777 		goto err_proc;
3778 
3779 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3780 						sizeof(struct hlist_head),
3781 						GFP_KERNEL);
3782 	if (!net->unx.table.buckets)
3783 		goto free_locks;
3784 
3785 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3786 		spin_lock_init(&net->unx.table.locks[i]);
3787 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3788 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3789 	}
3790 
3791 	return 0;
3792 
3793 free_locks:
3794 	kvfree(net->unx.table.locks);
3795 err_proc:
3796 #ifdef CONFIG_PROC_FS
3797 	remove_proc_entry("unix", net->proc_net);
3798 err_sysctl:
3799 #endif
3800 	unix_sysctl_unregister(net);
3801 out:
3802 	return -ENOMEM;
3803 }
3804 
3805 static void __net_exit unix_net_exit(struct net *net)
3806 {
3807 	kvfree(net->unx.table.buckets);
3808 	kvfree(net->unx.table.locks);
3809 	unix_sysctl_unregister(net);
3810 	remove_proc_entry("unix", net->proc_net);
3811 }
3812 
3813 static struct pernet_operations unix_net_ops = {
3814 	.init = unix_net_init,
3815 	.exit = unix_net_exit,
3816 };
3817 
3818 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3819 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3820 		     struct unix_sock *unix_sk, uid_t uid)
3821 
3822 #define INIT_BATCH_SZ 16
3823 
3824 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3825 {
3826 	struct bpf_unix_iter_state *iter = priv_data;
3827 	int err;
3828 
3829 	err = bpf_iter_init_seq_net(priv_data, aux);
3830 	if (err)
3831 		return err;
3832 
3833 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3834 	if (err) {
3835 		bpf_iter_fini_seq_net(priv_data);
3836 		return err;
3837 	}
3838 
3839 	return 0;
3840 }
3841 
3842 static void bpf_iter_fini_unix(void *priv_data)
3843 {
3844 	struct bpf_unix_iter_state *iter = priv_data;
3845 
3846 	bpf_iter_fini_seq_net(priv_data);
3847 	kvfree(iter->batch);
3848 }
3849 
3850 static const struct bpf_iter_seq_info unix_seq_info = {
3851 	.seq_ops		= &bpf_iter_unix_seq_ops,
3852 	.init_seq_private	= bpf_iter_init_unix,
3853 	.fini_seq_private	= bpf_iter_fini_unix,
3854 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3855 };
3856 
3857 static const struct bpf_func_proto *
3858 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3859 			     const struct bpf_prog *prog)
3860 {
3861 	switch (func_id) {
3862 	case BPF_FUNC_setsockopt:
3863 		return &bpf_sk_setsockopt_proto;
3864 	case BPF_FUNC_getsockopt:
3865 		return &bpf_sk_getsockopt_proto;
3866 	default:
3867 		return NULL;
3868 	}
3869 }
3870 
3871 static struct bpf_iter_reg unix_reg_info = {
3872 	.target			= "unix",
3873 	.ctx_arg_info_size	= 1,
3874 	.ctx_arg_info		= {
3875 		{ offsetof(struct bpf_iter__unix, unix_sk),
3876 		  PTR_TO_BTF_ID_OR_NULL },
3877 	},
3878 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3879 	.seq_info		= &unix_seq_info,
3880 };
3881 
3882 static void __init bpf_iter_register(void)
3883 {
3884 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3885 	if (bpf_iter_reg_target(&unix_reg_info))
3886 		pr_warn("Warning: could not register bpf iterator unix\n");
3887 }
3888 #endif
3889 
3890 static int __init af_unix_init(void)
3891 {
3892 	int i, rc = -1;
3893 
3894 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3895 
3896 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3897 		spin_lock_init(&bsd_socket_locks[i]);
3898 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3899 	}
3900 
3901 	rc = proto_register(&unix_dgram_proto, 1);
3902 	if (rc != 0) {
3903 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3904 		goto out;
3905 	}
3906 
3907 	rc = proto_register(&unix_stream_proto, 1);
3908 	if (rc != 0) {
3909 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3910 		proto_unregister(&unix_dgram_proto);
3911 		goto out;
3912 	}
3913 
3914 	sock_register(&unix_family_ops);
3915 	register_pernet_subsys(&unix_net_ops);
3916 	unix_bpf_build_proto();
3917 
3918 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3919 	bpf_iter_register();
3920 #endif
3921 
3922 out:
3923 	return rc;
3924 }
3925 
3926 /* Later than subsys_initcall() because we depend on stuff initialised there */
3927 fs_initcall(af_unix_init);
3928