xref: /linux/net/unix/af_unix.c (revision 09d7ff0694ea133c50ad905fd6e548c13f8af458)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it avoids a huge number
34  *					of sockets being hashed (for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  starting with 0, so that this name space does not intersect
75  *		  with BSD names (see the example after this comment).
76  */
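/* A minimal userspace sketch (not part of the kernel build) of an
 * abstract-namespace bind as described above; the name "example" is an
 * arbitrary assumption.  addr_len covers the leading zero byte plus the
 * name and needs no trailing NUL.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int bind_abstract(void)
 *	{
 *		struct sockaddr_un addr = { .sun_family = AF_UNIX };
 *		int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		memcpy(addr.sun_path + 1, "example", 7);	// sun_path[0] stays 0
 *		if (bind(fd, (struct sockaddr *)&addr,
 *			 offsetof(struct sockaddr_un, sun_path) + 1 + 7))
 *			return -1;
 *		return fd;
 *	}
 */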
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/bpf-cgroup.h>
81 #include <linux/btf_ids.h>
82 #include <linux/dcache.h>
83 #include <linux/errno.h>
84 #include <linux/fcntl.h>
85 #include <linux/file.h>
86 #include <linux/filter.h>
87 #include <linux/fs.h>
88 #include <linux/init.h>
89 #include <linux/kernel.h>
90 #include <linux/mount.h>
91 #include <linux/namei.h>
92 #include <linux/poll.h>
93 #include <linux/proc_fs.h>
94 #include <linux/sched/signal.h>
95 #include <linux/security.h>
96 #include <linux/seq_file.h>
97 #include <linux/skbuff.h>
98 #include <linux/slab.h>
99 #include <linux/socket.h>
100 #include <linux/splice.h>
101 #include <linux/string.h>
102 #include <linux/uaccess.h>
103 #include <net/af_unix.h>
104 #include <net/net_namespace.h>
105 #include <net/scm.h>
106 #include <net/tcp_states.h>
107 #include <uapi/linux/sockios.h>
108 #include <uapi/linux/termios.h>
109 
110 #include "af_unix.h"
111 
112 static atomic_long_t unix_nr_socks;
113 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
114 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
115 
116 /* SMP locking strategy:
117  *    hash table is protected with spinlock.
118  *    each socket state is protected by separate spinlock.
119  */
120 #ifdef CONFIG_PROVE_LOCKING
121 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
122 
123 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
124 				  const struct lockdep_map *b)
125 {
126 	return cmp_ptr(a, b);
127 }
128 
129 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
130 				  const struct lockdep_map *_b)
131 {
132 	const struct unix_sock *a, *b;
133 
134 	a = container_of(_a, struct unix_sock, lock.dep_map);
135 	b = container_of(_b, struct unix_sock, lock.dep_map);
136 
137 	if (a->sk.sk_state == TCP_LISTEN) {
138 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
139 		 *
140 		 *   1. a is TCP_LISTEN.
141 		 *   2. b is not a.
142 		 *   3. concurrent connect(b -> a) must fail.
143 		 *
144 		 * Except for 2. & 3., b's state can be any possible
145 		 * value due to concurrent connect() or listen().
146 		 *
147 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
148 		 * be expressed as lock_cmp_fn.
149 		 */
150 		switch (b->sk.sk_state) {
151 		case TCP_CLOSE:
152 		case TCP_ESTABLISHED:
153 		case TCP_LISTEN:
154 			return -1;
155 		default:
156 			/* Invalid case. */
157 			return 0;
158 		}
159 	}
160 
161 	/* Should never happen.  Just to be symmetric. */
162 	if (b->sk.sk_state == TCP_LISTEN) {
163 		switch (a->sk.sk_state) {
164 		case TCP_CLOSE:
165 		case TCP_ESTABLISHED:
166 			return 1;
167 		default:
168 			return 0;
169 		}
170 	}
171 
172 	/* unix_state_double_lock(): ascending address order. */
173 	return cmp_ptr(a, b);
174 }
175 
176 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
177 				  const struct lockdep_map *_b)
178 {
179 	const struct sock *a, *b;
180 
181 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
182 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
183 
184 	/* unix_collect_skb(): listener -> embryo order. */
185 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
186 		return -1;
187 
188 	/* Should never happen.  Just to be symmetric. */
189 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
190 		return 1;
191 
192 	return 0;
193 }
194 #endif
195 
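/* Hash layout: unbound and pathname (BSD) sockets hash into buckets
 * [0, UNIX_HASH_MOD]; abstract sockets hash into
 * [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1], so the two name spaces
 * never share a bucket.
 */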
196 static unsigned int unix_unbound_hash(struct sock *sk)
197 {
198 	unsigned long hash = (unsigned long)sk;
199 
200 	hash ^= hash >> 16;
201 	hash ^= hash >> 8;
202 	hash ^= sk->sk_type;
203 
204 	return hash & UNIX_HASH_MOD;
205 }
206 
207 static unsigned int unix_bsd_hash(struct inode *i)
208 {
209 	return i->i_ino & UNIX_HASH_MOD;
210 }
211 
212 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
213 				       int addr_len, int type)
214 {
215 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
216 	unsigned int hash;
217 
218 	hash = (__force unsigned int)csum_fold(csum);
219 	hash ^= hash >> 8;
220 	hash ^= type;
221 
222 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
223 }
224 
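/* A bind or autobind moves a socket between hash buckets, so both the old
 * and the new bucket lock are needed.  Taking them in ascending index order
 * keeps two concurrent re-hashes from ABBA-deadlocking; a single lock is
 * enough when both hashes land in the same bucket.
 */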
225 static void unix_table_double_lock(struct net *net,
226 				   unsigned int hash1, unsigned int hash2)
227 {
228 	if (hash1 == hash2) {
229 		spin_lock(&net->unx.table.locks[hash1]);
230 		return;
231 	}
232 
233 	if (hash1 > hash2)
234 		swap(hash1, hash2);
235 
236 	spin_lock(&net->unx.table.locks[hash1]);
237 	spin_lock(&net->unx.table.locks[hash2]);
238 }
239 
240 static void unix_table_double_unlock(struct net *net,
241 				     unsigned int hash1, unsigned int hash2)
242 {
243 	if (hash1 == hash2) {
244 		spin_unlock(&net->unx.table.locks[hash1]);
245 		return;
246 	}
247 
248 	spin_unlock(&net->unx.table.locks[hash1]);
249 	spin_unlock(&net->unx.table.locks[hash2]);
250 }
251 
252 #ifdef CONFIG_SECURITY_NETWORK
253 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
254 {
255 	UNIXCB(skb).secid = scm->secid;
256 }
257 
258 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
259 {
260 	scm->secid = UNIXCB(skb).secid;
261 }
262 
263 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
264 {
265 	return (scm->secid == UNIXCB(skb).secid);
266 }
267 #else
268 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
269 { }
270 
271 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
272 { }
273 
274 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
275 {
276 	return true;
277 }
278 #endif /* CONFIG_SECURITY_NETWORK */
279 
280 static inline int unix_may_send(struct sock *sk, struct sock *osk)
281 {
282 	return !unix_peer(osk) || unix_peer(osk) == sk;
283 }
284 
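/* A socket's receive queue counts as "full" once its length exceeds
 * sk_max_ack_backlog (net.unix.max_dgram_qlen for datagram sockets, the
 * listen() backlog for stream/seqpacket listeners).  The read is lockless,
 * so the result is only a hint.
 */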
285 static inline int unix_recvq_full_lockless(const struct sock *sk)
286 {
287 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
288 }
289 
290 struct sock *unix_peer_get(struct sock *s)
291 {
292 	struct sock *peer;
293 
294 	unix_state_lock(s);
295 	peer = unix_peer(s);
296 	if (peer)
297 		sock_hold(peer);
298 	unix_state_unlock(s);
299 	return peer;
300 }
301 EXPORT_SYMBOL_GPL(unix_peer_get);
302 
303 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
304 					     int addr_len)
305 {
306 	struct unix_address *addr;
307 
308 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
309 	if (!addr)
310 		return NULL;
311 
312 	refcount_set(&addr->refcnt, 1);
313 	addr->len = addr_len;
314 	memcpy(addr->name, sunaddr, addr_len);
315 
316 	return addr;
317 }
318 
319 static inline void unix_release_addr(struct unix_address *addr)
320 {
321 	if (refcount_dec_and_test(&addr->refcnt))
322 		kfree(addr);
323 }
324 
325 /*
326  *	Check unix socket name:
327  *		- should not be zero length.
328  *	        - if it does not start with zero, it should be NUL terminated (FS object)
329  *		- if it starts with zero, it is an abstract name.
330  */
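/* Concrete forms, with offsetof(struct sockaddr_un, sun_path) == 2:
 *   - addr_len == 2 and sun_family == AF_UNIX: autobind request
 *     (handled in unix_bind() before validation);
 *   - sun_path[0] != 0: filesystem name, NUL-terminated for us by
 *     unix_mkname_bsd() if the caller did not do so;
 *   - sun_path[0] == 0: abstract name; all addr_len - 2 bytes of sun_path
 *     (leading zero included) are significant, embedded zero bytes allowed.
 */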
331 
332 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
333 {
334 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
335 	    addr_len > sizeof(*sunaddr))
336 		return -EINVAL;
337 
338 	if (sunaddr->sun_family != AF_UNIX)
339 		return -EINVAL;
340 
341 	return 0;
342 }
343 
344 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
345 {
346 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
347 	short offset = offsetof(struct sockaddr_storage, __data);
348 
349 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
350 
351 	/* This may look like an off by one error but it is a bit more
352 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
353 	 * sun_path[108] doesn't as such exist.  However in kernel space
354 	 * we are guaranteed that it is a valid memory location in our
355 	 * kernel address buffer because syscall functions always pass
356 	 * a pointer of struct sockaddr_storage which has a bigger buffer
357 	 * than 108.  Also, we must terminate sun_path for strlen() in
358 	 * getname_kernel().
359 	 */
360 	addr->__data[addr_len - offset] = 0;
361 
362 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
363 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
364 	 * know the actual buffer.
365 	 */
366 	return strlen(addr->__data) + offset + 1;
367 }
368 
369 static void __unix_remove_socket(struct sock *sk)
370 {
371 	sk_del_node_init(sk);
372 }
373 
374 static void __unix_insert_socket(struct net *net, struct sock *sk)
375 {
376 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
377 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
378 }
379 
380 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
381 				 struct unix_address *addr, unsigned int hash)
382 {
383 	__unix_remove_socket(sk);
384 	smp_store_release(&unix_sk(sk)->addr, addr);
385 
386 	sk->sk_hash = hash;
387 	__unix_insert_socket(net, sk);
388 }
389 
390 static void unix_remove_socket(struct net *net, struct sock *sk)
391 {
392 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
393 	__unix_remove_socket(sk);
394 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
395 }
396 
397 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
398 {
399 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
400 	__unix_insert_socket(net, sk);
401 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
402 }
403 
404 static void unix_insert_bsd_socket(struct sock *sk)
405 {
406 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
407 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
408 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
409 }
410 
411 static void unix_remove_bsd_socket(struct sock *sk)
412 {
413 	if (!hlist_unhashed(&sk->sk_bind_node)) {
414 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
415 		__sk_del_bind_node(sk);
416 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
417 
418 		sk_node_init(&sk->sk_bind_node);
419 	}
420 }
421 
422 static struct sock *__unix_find_socket_byname(struct net *net,
423 					      struct sockaddr_un *sunname,
424 					      int len, unsigned int hash)
425 {
426 	struct sock *s;
427 
428 	sk_for_each(s, &net->unx.table.buckets[hash]) {
429 		struct unix_sock *u = unix_sk(s);
430 
431 		if (u->addr->len == len &&
432 		    !memcmp(u->addr->name, sunname, len))
433 			return s;
434 	}
435 	return NULL;
436 }
437 
438 static inline struct sock *unix_find_socket_byname(struct net *net,
439 						   struct sockaddr_un *sunname,
440 						   int len, unsigned int hash)
441 {
442 	struct sock *s;
443 
444 	spin_lock(&net->unx.table.locks[hash]);
445 	s = __unix_find_socket_byname(net, sunname, len, hash);
446 	if (s)
447 		sock_hold(s);
448 	spin_unlock(&net->unx.table.locks[hash]);
449 	return s;
450 }
451 
452 static struct sock *unix_find_socket_byinode(struct inode *i)
453 {
454 	unsigned int hash = unix_bsd_hash(i);
455 	struct sock *s;
456 
457 	spin_lock(&bsd_socket_locks[hash]);
458 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
459 		struct dentry *dentry = unix_sk(s)->path.dentry;
460 
461 		if (dentry && d_backing_inode(dentry) == i) {
462 			sock_hold(s);
463 			spin_unlock(&bsd_socket_locks[hash]);
464 			return s;
465 		}
466 	}
467 	spin_unlock(&bsd_socket_locks[hash]);
468 	return NULL;
469 }
470 
471 /* Support code for asymmetrically connected dgram sockets
472  *
473  * If a datagram socket is connected to a socket not itself connected
474  * to the first socket (eg, /dev/log), clients may only enqueue more
475  * messages if the present receive queue of the server socket is not
476  * "too large". This means there's a second writeability condition
477  * poll and sendmsg need to test. The dgram recv code will do a wake
478  * up on the peer_wait wait queue of a socket upon reception of a
479  * datagram which needs to be propagated to sleeping would-be writers
480  * since these might not have sent anything so far. This can't be
481  * accomplished via poll_wait because the lifetime of the server
482  * socket might be less than that of its clients if these break their
483  * association with it or if the server socket is closed while clients
484  * are still connected to it and there's no way to inform "a polling
485  * implementation" that it should let go of a certain wait queue.
486  *
487  * In order to propagate a wake up, a wait_queue_entry_t of the client
488  * socket is enqueued on the peer_wait queue of the server socket
489  * whose wake function does a wake_up on the ordinary client socket
490  * wait queue. This connection is established whenever a write (or
491  * poll for write) hits the flow control condition and is broken when the
492  * association to the server socket is dissolved or after a wake up
493  * was relayed.
494  */
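/* A minimal userspace sketch of the asymmetric case above (the socket path
 * is an arbitrary assumption, error handling omitted): the client connects
 * to a bound datagram "server" that never connects back, so the client's
 * writability depends on the server draining its receive queue.
 *
 *	int srv = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	int cli = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct sockaddr_un a = { .sun_family = AF_UNIX, .sun_path = "/tmp/log" };
 *
 *	bind(srv, (struct sockaddr *)&a, sizeof(a));
 *	connect(cli, (struct sockaddr *)&a, sizeof(a));
 *	// Once srv's queue is full, polling cli for EPOLLOUT sleeps until
 *	// srv reads a datagram and the peer_wait relay below wakes cli up.
 */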
495 
496 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
497 				      void *key)
498 {
499 	struct unix_sock *u;
500 	wait_queue_head_t *u_sleep;
501 
502 	u = container_of(q, struct unix_sock, peer_wake);
503 
504 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
505 			    q);
506 	u->peer_wake.private = NULL;
507 
508 	/* relaying can only happen while the wq still exists */
509 	u_sleep = sk_sleep(&u->sk);
510 	if (u_sleep)
511 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
512 
513 	return 0;
514 }
515 
516 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
517 {
518 	struct unix_sock *u, *u_other;
519 	int rc;
520 
521 	u = unix_sk(sk);
522 	u_other = unix_sk(other);
523 	rc = 0;
524 	spin_lock(&u_other->peer_wait.lock);
525 
526 	if (!u->peer_wake.private) {
527 		u->peer_wake.private = other;
528 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
529 
530 		rc = 1;
531 	}
532 
533 	spin_unlock(&u_other->peer_wait.lock);
534 	return rc;
535 }
536 
537 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
538 					    struct sock *other)
539 {
540 	struct unix_sock *u, *u_other;
541 
542 	u = unix_sk(sk);
543 	u_other = unix_sk(other);
544 	spin_lock(&u_other->peer_wait.lock);
545 
546 	if (u->peer_wake.private == other) {
547 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
548 		u->peer_wake.private = NULL;
549 	}
550 
551 	spin_unlock(&u_other->peer_wait.lock);
552 }
553 
554 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
555 						   struct sock *other)
556 {
557 	unix_dgram_peer_wake_disconnect(sk, other);
558 	wake_up_interruptible_poll(sk_sleep(sk),
559 				   EPOLLOUT |
560 				   EPOLLWRNORM |
561 				   EPOLLWRBAND);
562 }
563 
564 /* preconditions:
565  *	- unix_peer(sk) == other
566  *	- association is stable
567  */
568 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
569 {
570 	int connected;
571 
572 	connected = unix_dgram_peer_wake_connect(sk, other);
573 
574 	/* If other is SOCK_DEAD, we want to make sure we signal
575 	 * POLLOUT, such that a subsequent write() can get a
576 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
577 	 * to other and it's full, we will hang waiting for POLLOUT.
578 	 */
579 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
580 		return 1;
581 
582 	if (connected)
583 		unix_dgram_peer_wake_disconnect(sk, other);
584 
585 	return 0;
586 }
587 
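/* A socket is writable while the memory charged to its write queue stays at
 * or below a quarter of sk_sndbuf (wmem_alloc << 2 <= sndbuf); listeners are
 * never writable.
 */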
588 static int unix_writable(const struct sock *sk, unsigned char state)
589 {
590 	return state != TCP_LISTEN &&
591 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
592 }
593 
594 static void unix_write_space(struct sock *sk)
595 {
596 	struct socket_wq *wq;
597 
598 	rcu_read_lock();
599 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
600 		wq = rcu_dereference(sk->sk_wq);
601 		if (skwq_has_sleeper(wq))
602 			wake_up_interruptible_sync_poll(&wq->wait,
603 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
604 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
605 	}
606 	rcu_read_unlock();
607 }
608 
609 /* When a dgram socket disconnects (or changes its peer), we clear its receive
610  * queue of packets that arrived from the previous peer. First, this allows
611  * flow control based only on wmem_alloc; second, a sk connected to a peer
612  * may receive messages only from that peer. */
613 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
614 {
615 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
616 		skb_queue_purge_reason(&sk->sk_receive_queue,
617 				       SKB_DROP_REASON_UNIX_DISCONNECT);
618 
619 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
620 
621 		/* If one link of a bidirectional dgram pipe is disconnected,
622 		 * we signal an error. Messages are lost. Do not do this
623 		 * when the peer was not connected to us.
624 		 */
625 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
626 			WRITE_ONCE(other->sk_err, ECONNRESET);
627 			sk_error_report(other);
628 		}
629 	}
630 }
631 
632 static void unix_sock_destructor(struct sock *sk)
633 {
634 	struct unix_sock *u = unix_sk(sk);
635 
636 	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
637 
638 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
639 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
640 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
641 	if (!sock_flag(sk, SOCK_DEAD)) {
642 		pr_info("Attempt to release alive unix socket: %p\n", sk);
643 		return;
644 	}
645 
646 	if (u->addr)
647 		unix_release_addr(u->addr);
648 
649 	atomic_long_dec(&unix_nr_socks);
650 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
651 #ifdef UNIX_REFCNT_DEBUG
652 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
653 		atomic_long_read(&unix_nr_socks));
654 #endif
655 }
656 
657 static void unix_release_sock(struct sock *sk, int embrion)
658 {
659 	struct unix_sock *u = unix_sk(sk);
660 	struct sock *skpair;
661 	struct sk_buff *skb;
662 	struct path path;
663 	int state;
664 
665 	unix_remove_socket(sock_net(sk), sk);
666 	unix_remove_bsd_socket(sk);
667 
668 	/* Clear state */
669 	unix_state_lock(sk);
670 	sock_orphan(sk);
671 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
672 	path	     = u->path;
673 	u->path.dentry = NULL;
674 	u->path.mnt = NULL;
675 	state = sk->sk_state;
676 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
677 
678 	skpair = unix_peer(sk);
679 	unix_peer(sk) = NULL;
680 
681 	unix_state_unlock(sk);
682 
683 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
684 	u->oob_skb = NULL;
685 #endif
686 
687 	wake_up_interruptible_all(&u->peer_wait);
688 
689 	if (skpair != NULL) {
690 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
691 			unix_state_lock(skpair);
692 			/* No more writes */
693 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
694 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
695 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
696 			unix_state_unlock(skpair);
697 			skpair->sk_state_change(skpair);
698 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
699 		}
700 
701 		unix_dgram_peer_wake_disconnect(sk, skpair);
702 		sock_put(skpair); /* It may now die */
703 	}
704 
705 	/* Try to flush out this socket. Throw out buffers at least */
706 
707 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
708 		if (state == TCP_LISTEN)
709 			unix_release_sock(skb->sk, 1);
710 
711 		/* passed fds are erased in the kfree_skb hook */
712 		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
713 	}
714 
715 	if (path.dentry)
716 		path_put(&path);
717 
718 	sock_put(sk);
719 
720 	/* ---- Socket is dead now and most probably destroyed ---- */
721 
722 	/*
723 	 * Fixme: BSD difference: In BSD all sockets connected to us get
724 	 *	  ECONNRESET and we die on the spot. In Linux we behave
725 	 *	  like files and pipes do and wait for the last
726 	 *	  dereference.
727 	 *
728 	 * Can't we simply set sock->err?
729 	 *
730 	 *	  What the above comment does talk about? --ANK(980817)
731 	 */
732 
733 	if (READ_ONCE(unix_tot_inflight))
734 		unix_gc();		/* Garbage collect fds */
735 }
736 
737 static void init_peercred(struct sock *sk)
738 {
739 	sk->sk_peer_pid = get_pid(task_tgid(current));
740 	sk->sk_peer_cred = get_current_cred();
741 }
742 
743 static void update_peercred(struct sock *sk)
744 {
745 	const struct cred *old_cred;
746 	struct pid *old_pid;
747 
748 	spin_lock(&sk->sk_peer_lock);
749 	old_pid = sk->sk_peer_pid;
750 	old_cred = sk->sk_peer_cred;
751 	init_peercred(sk);
752 	spin_unlock(&sk->sk_peer_lock);
753 
754 	put_pid(old_pid);
755 	put_cred(old_cred);
756 }
757 
758 static void copy_peercred(struct sock *sk, struct sock *peersk)
759 {
760 	lockdep_assert_held(&unix_sk(peersk)->lock);
761 
762 	spin_lock(&sk->sk_peer_lock);
763 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
764 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
765 	spin_unlock(&sk->sk_peer_lock);
766 }
767 
768 static bool unix_may_passcred(const struct sock *sk)
769 {
770 	return sk->sk_scm_credentials || sk->sk_scm_pidfd;
771 }
772 
773 static int unix_listen(struct socket *sock, int backlog)
774 {
775 	int err;
776 	struct sock *sk = sock->sk;
777 	struct unix_sock *u = unix_sk(sk);
778 
779 	err = -EOPNOTSUPP;
780 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
781 		goto out;	/* Only stream/seqpacket sockets accept */
782 	err = -EINVAL;
783 	if (!READ_ONCE(u->addr))
784 		goto out;	/* No listens on an unbound socket */
785 	unix_state_lock(sk);
786 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
787 		goto out_unlock;
788 	if (backlog > sk->sk_max_ack_backlog)
789 		wake_up_interruptible_all(&u->peer_wait);
790 	sk->sk_max_ack_backlog	= backlog;
791 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
792 
793 	/* set credentials so connect can copy them */
794 	update_peercred(sk);
795 	err = 0;
796 
797 out_unlock:
798 	unix_state_unlock(sk);
799 out:
800 	return err;
801 }
802 
803 static int unix_release(struct socket *);
804 static int unix_bind(struct socket *, struct sockaddr *, int);
805 static int unix_stream_connect(struct socket *, struct sockaddr *,
806 			       int addr_len, int flags);
807 static int unix_socketpair(struct socket *, struct socket *);
808 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
809 static int unix_getname(struct socket *, struct sockaddr *, int);
810 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
811 static __poll_t unix_dgram_poll(struct file *, struct socket *,
812 				    poll_table *);
813 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
814 #ifdef CONFIG_COMPAT
815 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
816 #endif
817 static int unix_shutdown(struct socket *, int);
818 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
819 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
820 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
821 				       struct pipe_inode_info *, size_t size,
822 				       unsigned int flags);
823 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
824 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
825 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
826 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
827 static int unix_dgram_connect(struct socket *, struct sockaddr *,
828 			      int, int);
829 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
830 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
831 				  int);
832 
833 #ifdef CONFIG_PROC_FS
834 static int unix_count_nr_fds(struct sock *sk)
835 {
836 	struct sk_buff *skb;
837 	struct unix_sock *u;
838 	int nr_fds = 0;
839 
840 	spin_lock(&sk->sk_receive_queue.lock);
841 	skb = skb_peek(&sk->sk_receive_queue);
842 	while (skb) {
843 		u = unix_sk(skb->sk);
844 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
845 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
846 	}
847 	spin_unlock(&sk->sk_receive_queue.lock);
848 
849 	return nr_fds;
850 }
851 
852 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
853 {
854 	struct sock *sk = sock->sk;
855 	unsigned char s_state;
856 	struct unix_sock *u;
857 	int nr_fds = 0;
858 
859 	if (sk) {
860 		s_state = READ_ONCE(sk->sk_state);
861 		u = unix_sk(sk);
862 
863 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
864 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
865 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
866 		 */
867 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
868 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
869 		else if (s_state == TCP_LISTEN)
870 			nr_fds = unix_count_nr_fds(sk);
871 
872 		seq_printf(m, "scm_fds: %u\n", nr_fds);
873 	}
874 }
875 #else
876 #define unix_show_fdinfo NULL
877 #endif
878 
879 static const struct proto_ops unix_stream_ops = {
880 	.family =	PF_UNIX,
881 	.owner =	THIS_MODULE,
882 	.release =	unix_release,
883 	.bind =		unix_bind,
884 	.connect =	unix_stream_connect,
885 	.socketpair =	unix_socketpair,
886 	.accept =	unix_accept,
887 	.getname =	unix_getname,
888 	.poll =		unix_poll,
889 	.ioctl =	unix_ioctl,
890 #ifdef CONFIG_COMPAT
891 	.compat_ioctl =	unix_compat_ioctl,
892 #endif
893 	.listen =	unix_listen,
894 	.shutdown =	unix_shutdown,
895 	.sendmsg =	unix_stream_sendmsg,
896 	.recvmsg =	unix_stream_recvmsg,
897 	.read_skb =	unix_stream_read_skb,
898 	.mmap =		sock_no_mmap,
899 	.splice_read =	unix_stream_splice_read,
900 	.set_peek_off =	sk_set_peek_off,
901 	.show_fdinfo =	unix_show_fdinfo,
902 };
903 
904 static const struct proto_ops unix_dgram_ops = {
905 	.family =	PF_UNIX,
906 	.owner =	THIS_MODULE,
907 	.release =	unix_release,
908 	.bind =		unix_bind,
909 	.connect =	unix_dgram_connect,
910 	.socketpair =	unix_socketpair,
911 	.accept =	sock_no_accept,
912 	.getname =	unix_getname,
913 	.poll =		unix_dgram_poll,
914 	.ioctl =	unix_ioctl,
915 #ifdef CONFIG_COMPAT
916 	.compat_ioctl =	unix_compat_ioctl,
917 #endif
918 	.listen =	sock_no_listen,
919 	.shutdown =	unix_shutdown,
920 	.sendmsg =	unix_dgram_sendmsg,
921 	.read_skb =	unix_read_skb,
922 	.recvmsg =	unix_dgram_recvmsg,
923 	.mmap =		sock_no_mmap,
924 	.set_peek_off =	sk_set_peek_off,
925 	.show_fdinfo =	unix_show_fdinfo,
926 };
927 
928 static const struct proto_ops unix_seqpacket_ops = {
929 	.family =	PF_UNIX,
930 	.owner =	THIS_MODULE,
931 	.release =	unix_release,
932 	.bind =		unix_bind,
933 	.connect =	unix_stream_connect,
934 	.socketpair =	unix_socketpair,
935 	.accept =	unix_accept,
936 	.getname =	unix_getname,
937 	.poll =		unix_dgram_poll,
938 	.ioctl =	unix_ioctl,
939 #ifdef CONFIG_COMPAT
940 	.compat_ioctl =	unix_compat_ioctl,
941 #endif
942 	.listen =	unix_listen,
943 	.shutdown =	unix_shutdown,
944 	.sendmsg =	unix_seqpacket_sendmsg,
945 	.recvmsg =	unix_seqpacket_recvmsg,
946 	.mmap =		sock_no_mmap,
947 	.set_peek_off =	sk_set_peek_off,
948 	.show_fdinfo =	unix_show_fdinfo,
949 };
950 
951 static void unix_close(struct sock *sk, long timeout)
952 {
953 	/* Nothing to do here, unix socket does not need a ->close().
954 	 * This is merely for sockmap.
955 	 */
956 }
957 
958 static bool unix_bpf_bypass_getsockopt(int level, int optname)
959 {
960 	if (level == SOL_SOCKET) {
961 		switch (optname) {
962 		case SO_PEERPIDFD:
963 			return true;
964 		default:
965 			return false;
966 		}
967 	}
968 
969 	return false;
970 }
971 
972 struct proto unix_dgram_proto = {
973 	.name			= "UNIX",
974 	.owner			= THIS_MODULE,
975 	.obj_size		= sizeof(struct unix_sock),
976 	.close			= unix_close,
977 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
978 #ifdef CONFIG_BPF_SYSCALL
979 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
980 #endif
981 };
982 
983 struct proto unix_stream_proto = {
984 	.name			= "UNIX-STREAM",
985 	.owner			= THIS_MODULE,
986 	.obj_size		= sizeof(struct unix_sock),
987 	.close			= unix_close,
988 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
989 #ifdef CONFIG_BPF_SYSCALL
990 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
991 #endif
992 };
993 
994 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
995 {
996 	struct unix_sock *u;
997 	struct sock *sk;
998 	int err;
999 
1000 	atomic_long_inc(&unix_nr_socks);
1001 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1002 		err = -ENFILE;
1003 		goto err;
1004 	}
1005 
1006 	if (type == SOCK_STREAM)
1007 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1008 	else /*dgram and  seqpacket */
1009 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1010 
1011 	if (!sk) {
1012 		err = -ENOMEM;
1013 		goto err;
1014 	}
1015 
1016 	sock_init_data(sock, sk);
1017 
1018 	sk->sk_scm_rights	= 1;
1019 	sk->sk_hash		= unix_unbound_hash(sk);
1020 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1021 	sk->sk_write_space	= unix_write_space;
1022 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1023 	sk->sk_destruct		= unix_sock_destructor;
1024 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1025 
1026 	u = unix_sk(sk);
1027 	u->listener = NULL;
1028 	u->vertex = NULL;
1029 	u->path.dentry = NULL;
1030 	u->path.mnt = NULL;
1031 	spin_lock_init(&u->lock);
1032 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1033 	mutex_init(&u->iolock); /* single task reading lock */
1034 	mutex_init(&u->bindlock); /* single task binding lock */
1035 	init_waitqueue_head(&u->peer_wait);
1036 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1037 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1038 	unix_insert_unbound_socket(net, sk);
1039 
1040 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1041 
1042 	return sk;
1043 
1044 err:
1045 	atomic_long_dec(&unix_nr_socks);
1046 	return ERR_PTR(err);
1047 }
1048 
1049 static int unix_create(struct net *net, struct socket *sock, int protocol,
1050 		       int kern)
1051 {
1052 	struct sock *sk;
1053 
1054 	if (protocol && protocol != PF_UNIX)
1055 		return -EPROTONOSUPPORT;
1056 
1057 	sock->state = SS_UNCONNECTED;
1058 
1059 	switch (sock->type) {
1060 	case SOCK_STREAM:
1061 		sock->ops = &unix_stream_ops;
1062 		break;
1063 		/*
1064 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1065 		 *	nothing uses it.
1066 		 */
1067 	case SOCK_RAW:
1068 		sock->type = SOCK_DGRAM;
1069 		fallthrough;
1070 	case SOCK_DGRAM:
1071 		sock->ops = &unix_dgram_ops;
1072 		break;
1073 	case SOCK_SEQPACKET:
1074 		sock->ops = &unix_seqpacket_ops;
1075 		break;
1076 	default:
1077 		return -ESOCKTNOSUPPORT;
1078 	}
1079 
1080 	sk = unix_create1(net, sock, kern, sock->type);
1081 	if (IS_ERR(sk))
1082 		return PTR_ERR(sk);
1083 
1084 	return 0;
1085 }
1086 
1087 static int unix_release(struct socket *sock)
1088 {
1089 	struct sock *sk = sock->sk;
1090 
1091 	if (!sk)
1092 		return 0;
1093 
1094 	sk->sk_prot->close(sk, 0);
1095 	unix_release_sock(sk, 0);
1096 	sock->sk = NULL;
1097 
1098 	return 0;
1099 }
1100 
1101 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1102 				  int type)
1103 {
1104 	struct inode *inode;
1105 	struct path path;
1106 	struct sock *sk;
1107 	int err;
1108 
1109 	unix_mkname_bsd(sunaddr, addr_len);
1110 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1111 	if (err)
1112 		goto fail;
1113 
1114 	err = path_permission(&path, MAY_WRITE);
1115 	if (err)
1116 		goto path_put;
1117 
1118 	err = -ECONNREFUSED;
1119 	inode = d_backing_inode(path.dentry);
1120 	if (!S_ISSOCK(inode->i_mode))
1121 		goto path_put;
1122 
1123 	sk = unix_find_socket_byinode(inode);
1124 	if (!sk)
1125 		goto path_put;
1126 
1127 	err = -EPROTOTYPE;
1128 	if (sk->sk_type == type)
1129 		touch_atime(&path);
1130 	else
1131 		goto sock_put;
1132 
1133 	path_put(&path);
1134 
1135 	return sk;
1136 
1137 sock_put:
1138 	sock_put(sk);
1139 path_put:
1140 	path_put(&path);
1141 fail:
1142 	return ERR_PTR(err);
1143 }
1144 
1145 static struct sock *unix_find_abstract(struct net *net,
1146 				       struct sockaddr_un *sunaddr,
1147 				       int addr_len, int type)
1148 {
1149 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1150 	struct dentry *dentry;
1151 	struct sock *sk;
1152 
1153 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1154 	if (!sk)
1155 		return ERR_PTR(-ECONNREFUSED);
1156 
1157 	dentry = unix_sk(sk)->path.dentry;
1158 	if (dentry)
1159 		touch_atime(&unix_sk(sk)->path);
1160 
1161 	return sk;
1162 }
1163 
1164 static struct sock *unix_find_other(struct net *net,
1165 				    struct sockaddr_un *sunaddr,
1166 				    int addr_len, int type)
1167 {
1168 	struct sock *sk;
1169 
1170 	if (sunaddr->sun_path[0])
1171 		sk = unix_find_bsd(sunaddr, addr_len, type);
1172 	else
1173 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1174 
1175 	return sk;
1176 }
1177 
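/* Bind a so far unbound socket to an abstract name of the form "\0XXXXX"
 * (five hex digits of a random 20-bit starting point, incremented on
 * collision).  Up to 2^20 candidates are probed before giving up with
 * -ENOSPC.
 */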
1178 static int unix_autobind(struct sock *sk)
1179 {
1180 	struct unix_sock *u = unix_sk(sk);
1181 	unsigned int new_hash, old_hash;
1182 	struct net *net = sock_net(sk);
1183 	struct unix_address *addr;
1184 	u32 lastnum, ordernum;
1185 	int err;
1186 
1187 	err = mutex_lock_interruptible(&u->bindlock);
1188 	if (err)
1189 		return err;
1190 
1191 	if (u->addr)
1192 		goto out;
1193 
1194 	err = -ENOMEM;
1195 	addr = kzalloc(sizeof(*addr) +
1196 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1197 	if (!addr)
1198 		goto out;
1199 
1200 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1201 	addr->name->sun_family = AF_UNIX;
1202 	refcount_set(&addr->refcnt, 1);
1203 
1204 	old_hash = sk->sk_hash;
1205 	ordernum = get_random_u32();
1206 	lastnum = ordernum & 0xFFFFF;
1207 retry:
1208 	ordernum = (ordernum + 1) & 0xFFFFF;
1209 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1210 
1211 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1212 	unix_table_double_lock(net, old_hash, new_hash);
1213 
1214 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1215 		unix_table_double_unlock(net, old_hash, new_hash);
1216 
1217 		/* __unix_find_socket_byname() may take a long time if many names
1218 		 * are already in use.
1219 		 */
1220 		cond_resched();
1221 
1222 		if (ordernum == lastnum) {
1223 			/* Give up if all names seem to be in use. */
1224 			err = -ENOSPC;
1225 			unix_release_addr(addr);
1226 			goto out;
1227 		}
1228 
1229 		goto retry;
1230 	}
1231 
1232 	__unix_set_addr_hash(net, sk, addr, new_hash);
1233 	unix_table_double_unlock(net, old_hash, new_hash);
1234 	err = 0;
1235 
1236 out:	mutex_unlock(&u->bindlock);
1237 	return err;
1238 }
1239 
1240 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1241 			 int addr_len)
1242 {
1243 	umode_t mode = S_IFSOCK |
1244 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1245 	struct unix_sock *u = unix_sk(sk);
1246 	unsigned int new_hash, old_hash;
1247 	struct net *net = sock_net(sk);
1248 	struct mnt_idmap *idmap;
1249 	struct unix_address *addr;
1250 	struct dentry *dentry;
1251 	struct path parent;
1252 	int err;
1253 
1254 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1255 	addr = unix_create_addr(sunaddr, addr_len);
1256 	if (!addr)
1257 		return -ENOMEM;
1258 
1259 	/*
1260 	 * Get the parent directory, calculate the hash for last
1261 	 * component.
1262 	 */
1263 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1264 	if (IS_ERR(dentry)) {
1265 		err = PTR_ERR(dentry);
1266 		goto out;
1267 	}
1268 
1269 	/*
1270 	 * All right, let's create it.
1271 	 */
1272 	idmap = mnt_idmap(parent.mnt);
1273 	err = security_path_mknod(&parent, dentry, mode, 0);
1274 	if (!err)
1275 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1276 	if (err)
1277 		goto out_path;
1278 	err = mutex_lock_interruptible(&u->bindlock);
1279 	if (err)
1280 		goto out_unlink;
1281 	if (u->addr)
1282 		goto out_unlock;
1283 
1284 	old_hash = sk->sk_hash;
1285 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1286 	unix_table_double_lock(net, old_hash, new_hash);
1287 	u->path.mnt = mntget(parent.mnt);
1288 	u->path.dentry = dget(dentry);
1289 	__unix_set_addr_hash(net, sk, addr, new_hash);
1290 	unix_table_double_unlock(net, old_hash, new_hash);
1291 	unix_insert_bsd_socket(sk);
1292 	mutex_unlock(&u->bindlock);
1293 	done_path_create(&parent, dentry);
1294 	return 0;
1295 
1296 out_unlock:
1297 	mutex_unlock(&u->bindlock);
1298 	err = -EINVAL;
1299 out_unlink:
1300 	/* failed after successful mknod?  unlink what we'd created... */
1301 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1302 out_path:
1303 	done_path_create(&parent, dentry);
1304 out:
1305 	unix_release_addr(addr);
1306 	return err == -EEXIST ? -EADDRINUSE : err;
1307 }
1308 
1309 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1310 			      int addr_len)
1311 {
1312 	struct unix_sock *u = unix_sk(sk);
1313 	unsigned int new_hash, old_hash;
1314 	struct net *net = sock_net(sk);
1315 	struct unix_address *addr;
1316 	int err;
1317 
1318 	addr = unix_create_addr(sunaddr, addr_len);
1319 	if (!addr)
1320 		return -ENOMEM;
1321 
1322 	err = mutex_lock_interruptible(&u->bindlock);
1323 	if (err)
1324 		goto out;
1325 
1326 	if (u->addr) {
1327 		err = -EINVAL;
1328 		goto out_mutex;
1329 	}
1330 
1331 	old_hash = sk->sk_hash;
1332 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1333 	unix_table_double_lock(net, old_hash, new_hash);
1334 
1335 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1336 		goto out_spin;
1337 
1338 	__unix_set_addr_hash(net, sk, addr, new_hash);
1339 	unix_table_double_unlock(net, old_hash, new_hash);
1340 	mutex_unlock(&u->bindlock);
1341 	return 0;
1342 
1343 out_spin:
1344 	unix_table_double_unlock(net, old_hash, new_hash);
1345 	err = -EADDRINUSE;
1346 out_mutex:
1347 	mutex_unlock(&u->bindlock);
1348 out:
1349 	unix_release_addr(addr);
1350 	return err;
1351 }
1352 
1353 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1354 {
1355 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1356 	struct sock *sk = sock->sk;
1357 	int err;
1358 
1359 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1360 	    sunaddr->sun_family == AF_UNIX)
1361 		return unix_autobind(sk);
1362 
1363 	err = unix_validate_addr(sunaddr, addr_len);
1364 	if (err)
1365 		return err;
1366 
1367 	if (sunaddr->sun_path[0])
1368 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1369 	else
1370 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1371 
1372 	return err;
1373 }
1374 
1375 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1376 {
1377 	if (unlikely(sk1 == sk2) || !sk2) {
1378 		unix_state_lock(sk1);
1379 		return;
1380 	}
1381 
1382 	if (sk1 > sk2)
1383 		swap(sk1, sk2);
1384 
1385 	unix_state_lock(sk1);
1386 	unix_state_lock(sk2);
1387 }
1388 
1389 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1390 {
1391 	if (unlikely(sk1 == sk2) || !sk2) {
1392 		unix_state_unlock(sk1);
1393 		return;
1394 	}
1395 	unix_state_unlock(sk1);
1396 	unix_state_unlock(sk2);
1397 }
1398 
1399 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1400 			      int alen, int flags)
1401 {
1402 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1403 	struct sock *sk = sock->sk;
1404 	struct sock *other;
1405 	int err;
1406 
1407 	err = -EINVAL;
1408 	if (alen < offsetofend(struct sockaddr, sa_family))
1409 		goto out;
1410 
1411 	if (addr->sa_family != AF_UNSPEC) {
1412 		err = unix_validate_addr(sunaddr, alen);
1413 		if (err)
1414 			goto out;
1415 
1416 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1417 		if (err)
1418 			goto out;
1419 
1420 		if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) {
1421 			err = unix_autobind(sk);
1422 			if (err)
1423 				goto out;
1424 		}
1425 
1426 restart:
1427 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1428 		if (IS_ERR(other)) {
1429 			err = PTR_ERR(other);
1430 			goto out;
1431 		}
1432 
1433 		unix_state_double_lock(sk, other);
1434 
1435 		/* Apparently VFS overslept socket death. Retry. */
1436 		if (sock_flag(other, SOCK_DEAD)) {
1437 			unix_state_double_unlock(sk, other);
1438 			sock_put(other);
1439 			goto restart;
1440 		}
1441 
1442 		err = -EPERM;
1443 		if (!unix_may_send(sk, other))
1444 			goto out_unlock;
1445 
1446 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1447 		if (err)
1448 			goto out_unlock;
1449 
1450 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1451 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1452 	} else {
1453 		/*
1454 		 *	1003.1g breaking connected state with AF_UNSPEC
1455 		 */
1456 		other = NULL;
1457 		unix_state_double_lock(sk, other);
1458 	}
1459 
1460 	/*
1461 	 * If it was connected, reconnect.
1462 	 */
1463 	if (unix_peer(sk)) {
1464 		struct sock *old_peer = unix_peer(sk);
1465 
1466 		unix_peer(sk) = other;
1467 		if (!other)
1468 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1469 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1470 
1471 		unix_state_double_unlock(sk, other);
1472 
1473 		if (other != old_peer) {
1474 			unix_dgram_disconnected(sk, old_peer);
1475 
1476 			unix_state_lock(old_peer);
1477 			if (!unix_peer(old_peer))
1478 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1479 			unix_state_unlock(old_peer);
1480 		}
1481 
1482 		sock_put(old_peer);
1483 	} else {
1484 		unix_peer(sk) = other;
1485 		unix_state_double_unlock(sk, other);
1486 	}
1487 
1488 	return 0;
1489 
1490 out_unlock:
1491 	unix_state_double_unlock(sk, other);
1492 	sock_put(other);
1493 out:
1494 	return err;
1495 }
1496 
1497 static long unix_wait_for_peer(struct sock *other, long timeo)
1498 {
1499 	struct unix_sock *u = unix_sk(other);
1500 	int sched;
1501 	DEFINE_WAIT(wait);
1502 
1503 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1504 
1505 	sched = !sock_flag(other, SOCK_DEAD) &&
1506 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1507 		unix_recvq_full_lockless(other);
1508 
1509 	unix_state_unlock(other);
1510 
1511 	if (sched)
1512 		timeo = schedule_timeout(timeo);
1513 
1514 	finish_wait(&u->peer_wait, &wait);
1515 	return timeo;
1516 }
1517 
1518 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1519 			       int addr_len, int flags)
1520 {
1521 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1522 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1523 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1524 	struct net *net = sock_net(sk);
1525 	struct sk_buff *skb = NULL;
1526 	unsigned char state;
1527 	long timeo;
1528 	int err;
1529 
1530 	err = unix_validate_addr(sunaddr, addr_len);
1531 	if (err)
1532 		goto out;
1533 
1534 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1535 	if (err)
1536 		goto out;
1537 
1538 	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
1539 		err = unix_autobind(sk);
1540 		if (err)
1541 			goto out;
1542 	}
1543 
1544 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1545 
1546 	/* First of all allocate resources.
1547 	 * If we do it after the state is locked,
1548 	 * we will have to recheck everything again in any case.
1549 	 */
1550 
1551 	/* create new sock for complete connection */
1552 	newsk = unix_create1(net, NULL, 0, sock->type);
1553 	if (IS_ERR(newsk)) {
1554 		err = PTR_ERR(newsk);
1555 		goto out;
1556 	}
1557 
1558 	/* Allocate skb for sending to listening sock */
1559 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1560 	if (!skb) {
1561 		err = -ENOMEM;
1562 		goto out_free_sk;
1563 	}
1564 
1565 restart:
1566 	/*  Find listening sock. */
1567 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1568 	if (IS_ERR(other)) {
1569 		err = PTR_ERR(other);
1570 		goto out_free_skb;
1571 	}
1572 
1573 	unix_state_lock(other);
1574 
1575 	/* Apparently VFS overslept socket death. Retry. */
1576 	if (sock_flag(other, SOCK_DEAD)) {
1577 		unix_state_unlock(other);
1578 		sock_put(other);
1579 		goto restart;
1580 	}
1581 
1582 	if (other->sk_state != TCP_LISTEN ||
1583 	    other->sk_shutdown & RCV_SHUTDOWN) {
1584 		err = -ECONNREFUSED;
1585 		goto out_unlock;
1586 	}
1587 
1588 	if (unix_recvq_full_lockless(other)) {
1589 		if (!timeo) {
1590 			err = -EAGAIN;
1591 			goto out_unlock;
1592 		}
1593 
1594 		timeo = unix_wait_for_peer(other, timeo);
1595 		sock_put(other);
1596 
1597 		err = sock_intr_errno(timeo);
1598 		if (signal_pending(current))
1599 			goto out_free_skb;
1600 
1601 		goto restart;
1602 	}
1603 
1604 	/* self connect and simultaneous connect are eliminated
1605 	 * by rejecting a TCP_LISTEN socket to avoid deadlock.
1606 	 */
1607 	state = READ_ONCE(sk->sk_state);
1608 	if (unlikely(state != TCP_CLOSE)) {
1609 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1610 		goto out_unlock;
1611 	}
1612 
1613 	unix_state_lock(sk);
1614 
1615 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1616 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1617 		unix_state_unlock(sk);
1618 		goto out_unlock;
1619 	}
1620 
1621 	err = security_unix_stream_connect(sk, other, newsk);
1622 	if (err) {
1623 		unix_state_unlock(sk);
1624 		goto out_unlock;
1625 	}
1626 
1627 	/* The way is open! Quickly set all the necessary fields... */
1628 
1629 	sock_hold(sk);
1630 	unix_peer(newsk) = sk;
1631 	newsk->sk_state = TCP_ESTABLISHED;
1632 	newsk->sk_type = sk->sk_type;
1633 	newsk->sk_scm_recv_flags = other->sk_scm_recv_flags;
1634 	init_peercred(newsk);
1635 
1636 	newu = unix_sk(newsk);
1637 	newu->listener = other;
1638 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1639 	otheru = unix_sk(other);
1640 
1641 	/* copy address information from listening to new sock
1642 	 *
1643 	 * The contents of *(otheru->addr) and otheru->path
1644 	 * are seen fully set up here, since we have found
1645 	 * otheru in hash under its lock.  Insertion into the
1646 	 * hash chain we'd found it in had been done in an
1647 	 * earlier critical area protected by the chain's lock,
1648 	 * the same one where we'd set *(otheru->addr) contents,
1649 	 * as well as otheru->path and otheru->addr itself.
1650 	 *
1651 	 * Using smp_store_release() here to set newu->addr
1652 	 * is enough to make those stores, as well as stores
1653 	 * to newu->path visible to anyone who gets newu->addr
1654 	 * by smp_load_acquire().  IOW, the same warranties
1655 	 * as for unix_sock instances bound in unix_bind() or
1656 	 * in unix_autobind().
1657 	 */
1658 	if (otheru->path.dentry) {
1659 		path_get(&otheru->path);
1660 		newu->path = otheru->path;
1661 	}
1662 	refcount_inc(&otheru->addr->refcnt);
1663 	smp_store_release(&newu->addr, otheru->addr);
1664 
1665 	/* Set credentials */
1666 	copy_peercred(sk, other);
1667 
1668 	sock->state	= SS_CONNECTED;
1669 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1670 	sock_hold(newsk);
1671 
1672 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1673 	unix_peer(sk)	= newsk;
1674 
1675 	unix_state_unlock(sk);
1676 
1677 	/* take ten and send info to listening sock */
1678 	spin_lock(&other->sk_receive_queue.lock);
1679 	__skb_queue_tail(&other->sk_receive_queue, skb);
1680 	spin_unlock(&other->sk_receive_queue.lock);
1681 	unix_state_unlock(other);
1682 	other->sk_data_ready(other);
1683 	sock_put(other);
1684 	return 0;
1685 
1686 out_unlock:
1687 	unix_state_unlock(other);
1688 	sock_put(other);
1689 out_free_skb:
1690 	consume_skb(skb);
1691 out_free_sk:
1692 	unix_release_sock(newsk, 0);
1693 out:
1694 	return err;
1695 }
1696 
1697 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1698 {
1699 	struct sock *ska = socka->sk, *skb = sockb->sk;
1700 
1701 	/* Join our sockets back to back */
1702 	sock_hold(ska);
1703 	sock_hold(skb);
1704 	unix_peer(ska) = skb;
1705 	unix_peer(skb) = ska;
1706 	init_peercred(ska);
1707 	init_peercred(skb);
1708 
1709 	ska->sk_state = TCP_ESTABLISHED;
1710 	skb->sk_state = TCP_ESTABLISHED;
1711 	socka->state  = SS_CONNECTED;
1712 	sockb->state  = SS_CONNECTED;
1713 	return 0;
1714 }
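/* A minimal userspace sketch (not part of the kernel build) of the
 * back-to-back join performed by unix_socketpair() above:
 *
 *	#include <sys/socket.h>
 *
 *	int sv[2];
 *
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		// sv[0] and sv[1] are connected peers: data written to one
 *		// is read from the other, and each can query the other's
 *		// credentials via SO_PEERCRED.
 *	}
 */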
1715 
1716 static int unix_accept(struct socket *sock, struct socket *newsock,
1717 		       struct proto_accept_arg *arg)
1718 {
1719 	struct sock *sk = sock->sk;
1720 	struct sk_buff *skb;
1721 	struct sock *tsk;
1722 
1723 	arg->err = -EOPNOTSUPP;
1724 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1725 		goto out;
1726 
1727 	arg->err = -EINVAL;
1728 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1729 		goto out;
1730 
1731 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1732 	 * so that no locks are necessary.
1733 	 */
1734 
1735 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1736 				&arg->err);
1737 	if (!skb) {
1738 		/* This means receive shutdown. */
1739 		if (arg->err == 0)
1740 			arg->err = -EINVAL;
1741 		goto out;
1742 	}
1743 
1744 	tsk = skb->sk;
1745 	skb_free_datagram(sk, skb);
1746 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1747 
1748 	/* attach accepted sock to socket */
1749 	unix_state_lock(tsk);
1750 	unix_update_edges(unix_sk(tsk));
1751 	newsock->state = SS_CONNECTED;
1752 	sock_graft(tsk, newsock);
1753 	unix_state_unlock(tsk);
1754 	return 0;
1755 
1756 out:
1757 	return arg->err;
1758 }
1759 
1760 
1761 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1762 {
1763 	struct sock *sk = sock->sk;
1764 	struct unix_address *addr;
1765 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1766 	int err = 0;
1767 
1768 	if (peer) {
1769 		sk = unix_peer_get(sk);
1770 
1771 		err = -ENOTCONN;
1772 		if (!sk)
1773 			goto out;
1774 		err = 0;
1775 	} else {
1776 		sock_hold(sk);
1777 	}
1778 
1779 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1780 	if (!addr) {
1781 		sunaddr->sun_family = AF_UNIX;
1782 		sunaddr->sun_path[0] = 0;
1783 		err = offsetof(struct sockaddr_un, sun_path);
1784 	} else {
1785 		err = addr->len;
1786 		memcpy(sunaddr, addr->name, addr->len);
1787 
1788 		if (peer)
1789 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1790 					       CGROUP_UNIX_GETPEERNAME);
1791 		else
1792 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1793 					       CGROUP_UNIX_GETSOCKNAME);
1794 	}
1795 	sock_put(sk);
1796 out:
1797 	return err;
1798 }
1799 
1800 /* The "user->unix_inflight" variable is protected by the garbage
1801  * collection lock, and we just read it locklessly here. If you go
1802  * over the limit, there might be a tiny race in actually noticing
1803  * it across threads. Tough.
1804  */
1805 static inline bool too_many_unix_fds(struct task_struct *p)
1806 {
1807 	struct user_struct *user = current_user();
1808 
1809 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1810 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1811 	return false;
1812 }
1813 
1814 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1815 {
1816 	if (too_many_unix_fds(current))
1817 		return -ETOOMANYREFS;
1818 
1819 	UNIXCB(skb).fp = scm->fp;
1820 	scm->fp = NULL;
1821 
1822 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1823 		return -ENOMEM;
1824 
1825 	return 0;
1826 }
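/* A minimal userspace sketch (not part of the kernel build) of the fd
 * passing that unix_attach_fds() services on the kernel side; the helper
 * name send_fd() is an assumption.  Needs <string.h> and <sys/socket.h>.
 *
 *	static int send_fd(int sock, int fd)
 *	{
 *		char data = 'x';
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		char buf[CMSG_SPACE(sizeof(int))] = { 0 };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = buf, .msg_controllen = sizeof(buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 *		return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
 *	}
 */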
1827 
1828 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1829 {
1830 	scm->fp = UNIXCB(skb).fp;
1831 	UNIXCB(skb).fp = NULL;
1832 
1833 	unix_destroy_fpl(scm->fp);
1834 }
1835 
1836 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1837 {
1838 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1839 }
1840 
1841 static void unix_destruct_scm(struct sk_buff *skb)
1842 {
1843 	struct scm_cookie scm;
1844 
1845 	memset(&scm, 0, sizeof(scm));
1846 	scm.pid  = UNIXCB(skb).pid;
1847 	if (UNIXCB(skb).fp)
1848 		unix_detach_fds(&scm, skb);
1849 
1850 	/* Alas, it calls VFS */
1851 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1852 	scm_destroy(&scm);
1853 	sock_wfree(skb);
1854 }
1855 
1856 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1857 {
1858 	int err = 0;
1859 
1860 	UNIXCB(skb).pid = get_pid(scm->pid);
1861 	UNIXCB(skb).uid = scm->creds.uid;
1862 	UNIXCB(skb).gid = scm->creds.gid;
1863 	UNIXCB(skb).fp = NULL;
1864 	unix_get_secdata(scm, skb);
1865 	if (scm->fp && send_fds)
1866 		err = unix_attach_fds(scm, skb);
1867 
1868 	skb->destructor = unix_destruct_scm;
1869 	return err;
1870 }
1871 
1872 /*
1873  * Some apps rely on write() giving SCM_CREDENTIALS.
1874  * We include credentials if the source or destination socket
1875  * asserted SOCK_PASSCRED.
1876  */
1877 static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk,
1878 				 const struct sock *other)
1879 {
1880 	if (UNIXCB(skb).pid)
1881 		return;
1882 
1883 	if (unix_may_passcred(sk) || unix_may_passcred(other)) {
1884 		UNIXCB(skb).pid = get_pid(task_tgid(current));
1885 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1886 	}
1887 }
1888 
1889 static bool unix_skb_scm_eq(struct sk_buff *skb,
1890 			    struct scm_cookie *scm)
1891 {
1892 	return UNIXCB(skb).pid == scm->pid &&
1893 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1894 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1895 	       unix_secdata_eq(scm, skb);
1896 }
1897 
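/* Track how many SCM_RIGHTS fds are queued on a receiving socket and
 * keep the garbage collector's in-flight graph up to date as skbs are
 * queued (scm_stat_add) and dequeued (scm_stat_del).
 */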
1898 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1899 {
1900 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1901 	struct unix_sock *u = unix_sk(sk);
1902 
1903 	if (unlikely(fp && fp->count)) {
1904 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1905 		unix_add_edges(fp, u);
1906 	}
1907 }
1908 
1909 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1910 {
1911 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1912 	struct unix_sock *u = unix_sk(sk);
1913 
1914 	if (unlikely(fp && fp->count)) {
1915 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1916 		unix_del_edges(fp);
1917 	}
1918 }
1919 
1920 /*
1921  *	Send AF_UNIX data.
1922  */
1923 
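/* A minimal userspace sketch of what this path serves (illustrative
 * only; the socket path below is made up):
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct sockaddr_un peer = { .sun_family = AF_UNIX };
 *	strncpy(peer.sun_path, "/tmp/server.sock", sizeof(peer.sun_path) - 1);
 *	sendto(fd, buf, len, 0, (struct sockaddr *)&peer, sizeof(peer));
 *
 * The message is queued directly on the receiver's sk_receive_queue;
 * there is no intermediate send queue for AF_UNIX datagrams.
 */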
1924 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1925 			      size_t len)
1926 {
1927 	struct sock *sk = sock->sk, *other = NULL;
1928 	struct unix_sock *u = unix_sk(sk);
1929 	struct scm_cookie scm;
1930 	struct sk_buff *skb;
1931 	int data_len = 0;
1932 	int sk_locked;
1933 	long timeo;
1934 	int err;
1935 
1936 	err = scm_send(sock, msg, &scm, false);
1937 	if (err < 0)
1938 		return err;
1939 
1940 	wait_for_unix_gc(scm.fp);
1941 
1942 	if (msg->msg_flags & MSG_OOB) {
1943 		err = -EOPNOTSUPP;
1944 		goto out;
1945 	}
1946 
1947 	if (msg->msg_namelen) {
1948 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
1949 		if (err)
1950 			goto out;
1951 
1952 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1953 							    msg->msg_name,
1954 							    &msg->msg_namelen,
1955 							    NULL);
1956 		if (err)
1957 			goto out;
1958 	}
1959 
1960 	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
1961 		err = unix_autobind(sk);
1962 		if (err)
1963 			goto out;
1964 	}
1965 
1966 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
1967 		err = -EMSGSIZE;
1968 		goto out;
1969 	}
1970 
1971 	if (len > SKB_MAX_ALLOC) {
1972 		data_len = min_t(size_t,
1973 				 len - SKB_MAX_ALLOC,
1974 				 MAX_SKB_FRAGS * PAGE_SIZE);
1975 		data_len = PAGE_ALIGN(data_len);
1976 
1977 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1978 	}
1979 
1980 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1981 				   msg->msg_flags & MSG_DONTWAIT, &err,
1982 				   PAGE_ALLOC_COSTLY_ORDER);
1983 	if (!skb)
1984 		goto out;
1985 
1986 	err = unix_scm_to_skb(&scm, skb, true);
1987 	if (err < 0)
1988 		goto out_free;
1989 
1990 	skb_put(skb, len - data_len);
1991 	skb->data_len = data_len;
1992 	skb->len = len;
1993 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1994 	if (err)
1995 		goto out_free;
1996 
1997 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1998 
1999 	if (msg->msg_namelen) {
2000 lookup:
2001 		other = unix_find_other(sock_net(sk), msg->msg_name,
2002 					msg->msg_namelen, sk->sk_type);
2003 		if (IS_ERR(other)) {
2004 			err = PTR_ERR(other);
2005 			goto out_free;
2006 		}
2007 	} else {
2008 		other = unix_peer_get(sk);
2009 		if (!other) {
2010 			err = -ENOTCONN;
2011 			goto out_free;
2012 		}
2013 	}
2014 
2015 	if (sk_filter(other, skb) < 0) {
2016 		/* Toss the packet but do not return any error to the sender */
2017 		err = len;
2018 		goto out_sock_put;
2019 	}
2020 
2021 restart:
2022 	sk_locked = 0;
2023 	unix_state_lock(other);
2024 restart_locked:
2025 
2026 	if (!unix_may_send(sk, other)) {
2027 		err = -EPERM;
2028 		goto out_unlock;
2029 	}
2030 
2031 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2032 		/* Check with 1003.1g - what should a datagram error do? */
2033 
2034 		unix_state_unlock(other);
2035 
2036 		if (sk->sk_type == SOCK_SEQPACKET) {
2037 			/* We get here only when racing with unix_release_sock(),
2038 			 * which is clearing @other. Unlike the SOCK_DGRAM path
2039 			 * below, never change the state to TCP_CLOSE.
2040 			 */
2041 			err = -EPIPE;
2042 			goto out_sock_put;
2043 		}
2044 
2045 		if (!sk_locked)
2046 			unix_state_lock(sk);
2047 
2048 		if (unix_peer(sk) == other) {
2049 			unix_peer(sk) = NULL;
2050 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2051 
2052 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2053 			unix_state_unlock(sk);
2054 
2055 			unix_dgram_disconnected(sk, other);
2056 			sock_put(other);
2057 			err = -ECONNREFUSED;
2058 			goto out_sock_put;
2059 		}
2060 
2061 		unix_state_unlock(sk);
2062 
2063 		if (!msg->msg_namelen) {
2064 			err = -ECONNRESET;
2065 			goto out_sock_put;
2066 		}
2067 
2068 		sock_put(other);
2069 		goto lookup;
2070 	}
2071 
2072 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2073 		err = -EPIPE;
2074 		goto out_unlock;
2075 	}
2076 
2077 	if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2078 		err = -EPERM;
2079 		goto out_unlock;
2080 	}
2081 
2082 	if (sk->sk_type != SOCK_SEQPACKET) {
2083 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2084 		if (err)
2085 			goto out_unlock;
2086 	}
2087 
2088 	/* other == sk && unix_peer(other) != sk if either:
2089 	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2090 	 * - unix_peer(sk) == sk at the time of the get but disconnected before the lock
2091 	 */
2092 	if (other != sk &&
2093 	    unlikely(unix_peer(other) != sk &&
2094 	    unix_recvq_full_lockless(other))) {
2095 		if (timeo) {
2096 			timeo = unix_wait_for_peer(other, timeo);
2097 
2098 			err = sock_intr_errno(timeo);
2099 			if (signal_pending(current))
2100 				goto out_sock_put;
2101 
2102 			goto restart;
2103 		}
2104 
2105 		if (!sk_locked) {
2106 			unix_state_unlock(other);
2107 			unix_state_double_lock(sk, other);
2108 		}
2109 
2110 		if (unix_peer(sk) != other ||
2111 		    unix_dgram_peer_wake_me(sk, other)) {
2112 			err = -EAGAIN;
2113 			sk_locked = 1;
2114 			goto out_unlock;
2115 		}
2116 
2117 		if (!sk_locked) {
2118 			sk_locked = 1;
2119 			goto restart_locked;
2120 		}
2121 	}
2122 
2123 	if (unlikely(sk_locked))
2124 		unix_state_unlock(sk);
2125 
2126 	if (sock_flag(other, SOCK_RCVTSTAMP))
2127 		__net_timestamp(skb);
2128 
2129 	unix_maybe_add_creds(skb, sk, other);
2130 	scm_stat_add(other, skb);
2131 	skb_queue_tail(&other->sk_receive_queue, skb);
2132 	unix_state_unlock(other);
2133 	other->sk_data_ready(other);
2134 	sock_put(other);
2135 	scm_destroy(&scm);
2136 	return len;
2137 
2138 out_unlock:
2139 	if (sk_locked)
2140 		unix_state_unlock(sk);
2141 	unix_state_unlock(other);
2142 out_sock_put:
2143 	sock_put(other);
2144 out_free:
2145 	consume_skb(skb);
2146 out:
2147 	scm_destroy(&scm);
2148 	return err;
2149 }
2150 
2151 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2152  * bytes, with a minimum of a full page.
2153  */
2154 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2155 
2156 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
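/* Queue a single out-of-band byte for a stream peer: the byte travels
 * as its own one-byte skb on the peer's receive queue, is remembered in
 * ousk->oob_skb, and SIGURG is signalled to the peer's owner.
 */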
2157 static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other,
2158 		     struct scm_cookie *scm, bool fds_sent)
2159 {
2160 	struct unix_sock *ousk = unix_sk(other);
2161 	struct sk_buff *skb;
2162 	int err;
2163 
2164 	skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2165 
2166 	if (!skb)
2167 		return err;
2168 
2169 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2170 	if (err < 0)
2171 		goto out;
2172 
2173 	skb_put(skb, 1);
2174 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2175 
2176 	if (err)
2177 		goto out;
2178 
2179 	unix_state_lock(other);
2180 
2181 	if (sock_flag(other, SOCK_DEAD) ||
2182 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2183 		err = -EPIPE;
2184 		goto out_unlock;
2185 	}
2186 
2187 	if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2188 		err = -EPERM;
2189 		goto out_unlock;
2190 	}
2191 
2192 	unix_maybe_add_creds(skb, sk, other);
2193 	scm_stat_add(other, skb);
2194 
2195 	spin_lock(&other->sk_receive_queue.lock);
2196 	WRITE_ONCE(ousk->oob_skb, skb);
2197 	__skb_queue_tail(&other->sk_receive_queue, skb);
2198 	spin_unlock(&other->sk_receive_queue.lock);
2199 
2200 	sk_send_sigurg(other);
2201 	unix_state_unlock(other);
2202 	other->sk_data_ready(other);
2203 
2204 	return 0;
2205 out_unlock:
2206 	unix_state_unlock(other);
2207 out:
2208 	consume_skb(skb);
2209 	return err;
2210 }
2211 #endif
2212 
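/* Stream send path: the payload is chopped into skbs sized against the
 * socket's send buffer (half of sk_sndbuf, capped by UNIX_SKB_FRAGS_SZ),
 * any SCM_RIGHTS fds are attached to the first skb only, and each skb is
 * queued straight onto the peer's receive queue. With MSG_OOB, the final
 * byte is sent separately via queue_oob().
 */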
2213 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2214 			       size_t len)
2215 {
2216 	struct sock *sk = sock->sk;
2217 	struct sk_buff *skb = NULL;
2218 	struct sock *other = NULL;
2219 	struct scm_cookie scm;
2220 	bool fds_sent = false;
2221 	int err, sent = 0;
2222 
2223 	err = scm_send(sock, msg, &scm, false);
2224 	if (err < 0)
2225 		return err;
2226 
2227 	wait_for_unix_gc(scm.fp);
2228 
2229 	if (msg->msg_flags & MSG_OOB) {
2230 		err = -EOPNOTSUPP;
2231 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2232 		if (len)
2233 			len--;
2234 		else
2235 #endif
2236 			goto out_err;
2237 	}
2238 
2239 	if (msg->msg_namelen) {
2240 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2241 		goto out_err;
2242 	} else {
2243 		other = unix_peer(sk);
2244 		if (!other) {
2245 			err = -ENOTCONN;
2246 			goto out_err;
2247 		}
2248 	}
2249 
2250 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2251 		goto out_pipe;
2252 
2253 	while (sent < len) {
2254 		int size = len - sent;
2255 		int data_len;
2256 
2257 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2258 			skb = sock_alloc_send_pskb(sk, 0, 0,
2259 						   msg->msg_flags & MSG_DONTWAIT,
2260 						   &err, 0);
2261 		} else {
2262 			/* Keep two messages in the pipe so it schedules better */
2263 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2264 
2265 			/* allow fallback to order-0 allocations */
2266 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2267 
2268 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2269 
2270 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2271 
2272 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2273 						   msg->msg_flags & MSG_DONTWAIT, &err,
2274 						   get_order(UNIX_SKB_FRAGS_SZ));
2275 		}
2276 		if (!skb)
2277 			goto out_err;
2278 
2279 		/* Only send the fds in the first buffer */
2280 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2281 		if (err < 0)
2282 			goto out_free;
2283 
2284 		fds_sent = true;
2285 
2286 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2287 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2288 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2289 						   sk->sk_allocation);
2290 			if (err < 0)
2291 				goto out_free;
2292 
2293 			size = err;
2294 			refcount_add(size, &sk->sk_wmem_alloc);
2295 		} else {
2296 			skb_put(skb, size - data_len);
2297 			skb->data_len = data_len;
2298 			skb->len = size;
2299 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2300 			if (err)
2301 				goto out_free;
2302 		}
2303 
2304 		unix_state_lock(other);
2305 
2306 		if (sock_flag(other, SOCK_DEAD) ||
2307 		    (other->sk_shutdown & RCV_SHUTDOWN))
2308 			goto out_pipe_unlock;
2309 
2310 		if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2311 			unix_state_unlock(other);
2312 			err = -EPERM;
2313 			goto out_free;
2314 		}
2315 
2316 		unix_maybe_add_creds(skb, sk, other);
2317 		scm_stat_add(other, skb);
2318 		skb_queue_tail(&other->sk_receive_queue, skb);
2319 		unix_state_unlock(other);
2320 		other->sk_data_ready(other);
2321 		sent += size;
2322 	}
2323 
2324 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2325 	if (msg->msg_flags & MSG_OOB) {
2326 		err = queue_oob(sk, msg, other, &scm, fds_sent);
2327 		if (err)
2328 			goto out_err;
2329 		sent++;
2330 	}
2331 #endif
2332 
2333 	scm_destroy(&scm);
2334 
2335 	return sent;
2336 
2337 out_pipe_unlock:
2338 	unix_state_unlock(other);
2339 out_pipe:
2340 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2341 		send_sig(SIGPIPE, current, 0);
2342 	err = -EPIPE;
2343 out_free:
2344 	consume_skb(skb);
2345 out_err:
2346 	scm_destroy(&scm);
2347 	return sent ? : err;
2348 }
2349 
2350 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2351 				  size_t len)
2352 {
2353 	int err;
2354 	struct sock *sk = sock->sk;
2355 
2356 	err = sock_error(sk);
2357 	if (err)
2358 		return err;
2359 
2360 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2361 		return -ENOTCONN;
2362 
2363 	if (msg->msg_namelen)
2364 		msg->msg_namelen = 0;
2365 
2366 	return unix_dgram_sendmsg(sock, msg, len);
2367 }
2368 
2369 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2370 				  size_t size, int flags)
2371 {
2372 	struct sock *sk = sock->sk;
2373 
2374 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2375 		return -ENOTCONN;
2376 
2377 	return unix_dgram_recvmsg(sock, msg, size, flags);
2378 }
2379 
2380 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2381 {
2382 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2383 
2384 	if (addr) {
2385 		msg->msg_namelen = addr->len;
2386 		memcpy(msg->msg_name, addr->name, addr->len);
2387 	}
2388 }
2389 
2390 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2391 			 int flags)
2392 {
2393 	struct scm_cookie scm;
2394 	struct socket *sock = sk->sk_socket;
2395 	struct unix_sock *u = unix_sk(sk);
2396 	struct sk_buff *skb, *last;
2397 	long timeo;
2398 	int skip;
2399 	int err;
2400 
2401 	err = -EOPNOTSUPP;
2402 	if (flags & MSG_OOB)
2403 		goto out;
2404 
2405 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2406 
2407 	do {
2408 		mutex_lock(&u->iolock);
2409 
2410 		skip = sk_peek_offset(sk, flags);
2411 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2412 					      &skip, &err, &last);
2413 		if (skb) {
2414 			if (!(flags & MSG_PEEK))
2415 				scm_stat_del(sk, skb);
2416 			break;
2417 		}
2418 
2419 		mutex_unlock(&u->iolock);
2420 
2421 		if (err != -EAGAIN)
2422 			break;
2423 	} while (timeo &&
2424 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2425 					      &err, &timeo, last));
2426 
2427 	if (!skb) { /* implies iolock unlocked */
2428 		unix_state_lock(sk);
2429 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2430 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2431 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2432 			err = 0;
2433 		unix_state_unlock(sk);
2434 		goto out;
2435 	}
2436 
2437 	if (wq_has_sleeper(&u->peer_wait))
2438 		wake_up_interruptible_sync_poll(&u->peer_wait,
2439 						EPOLLOUT | EPOLLWRNORM |
2440 						EPOLLWRBAND);
2441 
2442 	if (msg->msg_name) {
2443 		unix_copy_addr(msg, skb->sk);
2444 
2445 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2446 						      msg->msg_name,
2447 						      &msg->msg_namelen);
2448 	}
2449 
2450 	if (size > skb->len - skip)
2451 		size = skb->len - skip;
2452 	else if (size < skb->len - skip)
2453 		msg->msg_flags |= MSG_TRUNC;
2454 
2455 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2456 	if (err)
2457 		goto out_free;
2458 
2459 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2460 		__sock_recv_timestamp(msg, sk, skb);
2461 
2462 	memset(&scm, 0, sizeof(scm));
2463 
2464 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2465 	unix_set_secdata(&scm, skb);
2466 
2467 	if (!(flags & MSG_PEEK)) {
2468 		if (UNIXCB(skb).fp)
2469 			unix_detach_fds(&scm, skb);
2470 
2471 		sk_peek_offset_bwd(sk, skb->len);
2472 	} else {
2473 		/* It is questionable what to do on PEEK. We could:
2474 		   - not return fds (good, but too simple 8))
2475 		   - return fds, and then not return them on read (the old
2476 		     strategy, apparently wrong)
2477 		   - clone fds (chosen for now, as it is the most universal
2478 		     solution)
2479 
2480 		   POSIX 1003.1g does not actually define this clearly
2481 		   at all; then again, POSIX 1003.1g doesn't define a lot
2482 		   of things clearly!
2483 
2484 		*/
2485 
2486 		sk_peek_offset_fwd(sk, size);
2487 
2488 		if (UNIXCB(skb).fp)
2489 			unix_peek_fds(&scm, skb);
2490 	}
2491 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2492 
2493 	scm_recv_unix(sock, msg, &scm, flags);
2494 
2495 out_free:
2496 	skb_free_datagram(sk, skb);
2497 	mutex_unlock(&u->iolock);
2498 out:
2499 	return err;
2500 }
2501 
2502 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2503 			      int flags)
2504 {
2505 	struct sock *sk = sock->sk;
2506 
2507 #ifdef CONFIG_BPF_SYSCALL
2508 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2509 
2510 	if (prot != &unix_dgram_proto)
2511 		return prot->recvmsg(sk, msg, size, flags, NULL);
2512 #endif
2513 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2514 }
2515 
2516 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2517 {
2518 	struct unix_sock *u = unix_sk(sk);
2519 	struct sk_buff *skb;
2520 	int err;
2521 
2522 	mutex_lock(&u->iolock);
2523 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2524 	mutex_unlock(&u->iolock);
2525 	if (!skb)
2526 		return err;
2527 
2528 	return recv_actor(sk, skb);
2529 }
2530 
2531 /*
2532  *	Sleep until more data has arrived. But check for races.
2533  */
2534 static long unix_stream_data_wait(struct sock *sk, long timeo,
2535 				  struct sk_buff *last, unsigned int last_len,
2536 				  bool freezable)
2537 {
2538 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2539 	struct sk_buff *tail;
2540 	DEFINE_WAIT(wait);
2541 
2542 	unix_state_lock(sk);
2543 
2544 	for (;;) {
2545 		prepare_to_wait(sk_sleep(sk), &wait, state);
2546 
2547 		tail = skb_peek_tail(&sk->sk_receive_queue);
2548 		if (tail != last ||
2549 		    (tail && tail->len != last_len) ||
2550 		    sk->sk_err ||
2551 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2552 		    signal_pending(current) ||
2553 		    !timeo)
2554 			break;
2555 
2556 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2557 		unix_state_unlock(sk);
2558 		timeo = schedule_timeout(timeo);
2559 		unix_state_lock(sk);
2560 
2561 		if (sock_flag(sk, SOCK_DEAD))
2562 			break;
2563 
2564 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2565 	}
2566 
2567 	finish_wait(sk_sleep(sk), &wait);
2568 	unix_state_unlock(sk);
2569 	return timeo;
2570 }
2571 
2572 static unsigned int unix_skb_len(const struct sk_buff *skb)
2573 {
2574 	return skb->len - UNIXCB(skb).consumed;
2575 }
2576 
2577 struct unix_stream_read_state {
2578 	int (*recv_actor)(struct sk_buff *, int, int,
2579 			  struct unix_stream_read_state *);
2580 	struct socket *socket;
2581 	struct msghdr *msg;
2582 	struct pipe_inode_info *pipe;
2583 	size_t size;
2584 	int flags;
2585 	unsigned int splice_flags;
2586 };
2587 
2588 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2589 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2590 {
2591 	struct socket *sock = state->socket;
2592 	struct sock *sk = sock->sk;
2593 	struct unix_sock *u = unix_sk(sk);
2594 	int chunk = 1;
2595 	struct sk_buff *oob_skb;
2596 
2597 	mutex_lock(&u->iolock);
2598 	unix_state_lock(sk);
2599 	spin_lock(&sk->sk_receive_queue.lock);
2600 
2601 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2602 		spin_unlock(&sk->sk_receive_queue.lock);
2603 		unix_state_unlock(sk);
2604 		mutex_unlock(&u->iolock);
2605 		return -EINVAL;
2606 	}
2607 
2608 	oob_skb = u->oob_skb;
2609 
2610 	if (!(state->flags & MSG_PEEK))
2611 		WRITE_ONCE(u->oob_skb, NULL);
2612 
2613 	spin_unlock(&sk->sk_receive_queue.lock);
2614 	unix_state_unlock(sk);
2615 
2616 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2617 
2618 	if (!(state->flags & MSG_PEEK))
2619 		UNIXCB(oob_skb).consumed += 1;
2620 
2621 	mutex_unlock(&u->iolock);
2622 
2623 	if (chunk < 0)
2624 		return -EFAULT;
2625 
2626 	state->msg->msg_flags |= MSG_OOB;
2627 	return 1;
2628 }
2629 
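/* Called with an skb at the head of the receive queue while an OOB byte
 * may be pending: decide which skb (if any) the stream reader should
 * consume next, skipping or unlinking the OOB skb as dictated by
 * MSG_PEEK, SOCK_URGINLINE and how much has already been copied.
 */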
2630 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2631 				  int flags, int copied)
2632 {
2633 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2634 	struct unix_sock *u = unix_sk(sk);
2635 
2636 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2637 		return skb;
2638 
2639 	spin_lock(&sk->sk_receive_queue.lock);
2640 
2641 	if (!unix_skb_len(skb)) {
2642 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2643 			skb = NULL;
2644 		} else if (flags & MSG_PEEK) {
2645 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2646 		} else {
2647 			read_skb = skb;
2648 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2649 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2650 		}
2651 
2652 		if (!skb)
2653 			goto unlock;
2654 	}
2655 
2656 	if (skb != u->oob_skb)
2657 		goto unlock;
2658 
2659 	if (copied) {
2660 		skb = NULL;
2661 	} else if (!(flags & MSG_PEEK)) {
2662 		WRITE_ONCE(u->oob_skb, NULL);
2663 
2664 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2665 			__skb_unlink(skb, &sk->sk_receive_queue);
2666 			unread_skb = skb;
2667 			skb = skb_peek(&sk->sk_receive_queue);
2668 		}
2669 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2670 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2671 	}
2672 
2673 unlock:
2674 	spin_unlock(&sk->sk_receive_queue.lock);
2675 
2676 	consume_skb(read_skb);
2677 	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2678 
2679 	return skb;
2680 }
2681 #endif
2682 
2683 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2684 {
2685 	struct unix_sock *u = unix_sk(sk);
2686 	struct sk_buff *skb;
2687 	int err;
2688 
2689 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2690 		return -ENOTCONN;
2691 
2692 	mutex_lock(&u->iolock);
2693 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2694 	mutex_unlock(&u->iolock);
2695 	if (!skb)
2696 		return err;
2697 
2698 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2699 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2700 		bool drop = false;
2701 
2702 		unix_state_lock(sk);
2703 
2704 		if (sock_flag(sk, SOCK_DEAD)) {
2705 			unix_state_unlock(sk);
2706 			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2707 			return -ECONNRESET;
2708 		}
2709 
2710 		spin_lock(&sk->sk_receive_queue.lock);
2711 		if (likely(skb == u->oob_skb)) {
2712 			WRITE_ONCE(u->oob_skb, NULL);
2713 			drop = true;
2714 		}
2715 		spin_unlock(&sk->sk_receive_queue.lock);
2716 
2717 		unix_state_unlock(sk);
2718 
2719 		if (drop) {
2720 			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2721 			return -EAGAIN;
2722 		}
2723 	}
2724 #endif
2725 
2726 	return recv_actor(sk, skb);
2727 }
2728 
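/* Common receive engine for SOCK_STREAM sockets: recvmsg() and
 * splice_read() both funnel through here and only differ in the
 * state->recv_actor callback that copies each chunk out of the skb.
 */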
2729 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2730 				    bool freezable)
2731 {
2732 	struct scm_cookie scm;
2733 	struct socket *sock = state->socket;
2734 	struct sock *sk = sock->sk;
2735 	struct unix_sock *u = unix_sk(sk);
2736 	int copied = 0;
2737 	int flags = state->flags;
2738 	int noblock = flags & MSG_DONTWAIT;
2739 	bool check_creds = false;
2740 	int target;
2741 	int err = 0;
2742 	long timeo;
2743 	int skip;
2744 	size_t size = state->size;
2745 	unsigned int last_len;
2746 
2747 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2748 		err = -EINVAL;
2749 		goto out;
2750 	}
2751 
2752 	if (unlikely(flags & MSG_OOB)) {
2753 		err = -EOPNOTSUPP;
2754 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2755 		err = unix_stream_recv_urg(state);
2756 #endif
2757 		goto out;
2758 	}
2759 
2760 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2761 	timeo = sock_rcvtimeo(sk, noblock);
2762 
2763 	memset(&scm, 0, sizeof(scm));
2764 
2765 	/* Lock the socket to prevent the queue from being reordered
2766 	 * while we sleep copying data out to the message.
2767 	 */
2768 	mutex_lock(&u->iolock);
2769 
2770 	skip = max(sk_peek_offset(sk, flags), 0);
2771 
2772 	do {
2773 		struct sk_buff *skb, *last;
2774 		int chunk;
2775 
2776 redo:
2777 		unix_state_lock(sk);
2778 		if (sock_flag(sk, SOCK_DEAD)) {
2779 			err = -ECONNRESET;
2780 			goto unlock;
2781 		}
2782 		last = skb = skb_peek(&sk->sk_receive_queue);
2783 		last_len = last ? last->len : 0;
2784 
2785 again:
2786 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2787 		if (skb) {
2788 			skb = manage_oob(skb, sk, flags, copied);
2789 			if (!skb && copied) {
2790 				unix_state_unlock(sk);
2791 				break;
2792 			}
2793 		}
2794 #endif
2795 		if (skb == NULL) {
2796 			if (copied >= target)
2797 				goto unlock;
2798 
2799 			/*
2800 			 *	POSIX 1003.1g mandates this order.
2801 			 */
2802 
2803 			err = sock_error(sk);
2804 			if (err)
2805 				goto unlock;
2806 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2807 				goto unlock;
2808 
2809 			unix_state_unlock(sk);
2810 			if (!timeo) {
2811 				err = -EAGAIN;
2812 				break;
2813 			}
2814 
2815 			mutex_unlock(&u->iolock);
2816 
2817 			timeo = unix_stream_data_wait(sk, timeo, last,
2818 						      last_len, freezable);
2819 
2820 			if (signal_pending(current)) {
2821 				err = sock_intr_errno(timeo);
2822 				scm_destroy(&scm);
2823 				goto out;
2824 			}
2825 
2826 			mutex_lock(&u->iolock);
2827 			goto redo;
2828 unlock:
2829 			unix_state_unlock(sk);
2830 			break;
2831 		}
2832 
2833 		while (skip >= unix_skb_len(skb)) {
2834 			skip -= unix_skb_len(skb);
2835 			last = skb;
2836 			last_len = skb->len;
2837 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2838 			if (!skb)
2839 				goto again;
2840 		}
2841 
2842 		unix_state_unlock(sk);
2843 
2844 		if (check_creds) {
2845 			/* Never glue messages from different writers */
2846 			if (!unix_skb_scm_eq(skb, &scm))
2847 				break;
2848 		} else if (unix_may_passcred(sk)) {
2849 			/* Copy credentials */
2850 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2851 			unix_set_secdata(&scm, skb);
2852 			check_creds = true;
2853 		}
2854 
2855 		/* Copy address just once */
2856 		if (state->msg && state->msg->msg_name) {
2857 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2858 					 state->msg->msg_name);
2859 			unix_copy_addr(state->msg, skb->sk);
2860 
2861 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2862 							      state->msg->msg_name,
2863 							      &state->msg->msg_namelen);
2864 
2865 			sunaddr = NULL;
2866 		}
2867 
2868 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2869 		chunk = state->recv_actor(skb, skip, chunk, state);
2870 		if (chunk < 0) {
2871 			if (copied == 0)
2872 				copied = -EFAULT;
2873 			break;
2874 		}
2875 		copied += chunk;
2876 		size -= chunk;
2877 
2878 		/* Mark read part of skb as used */
2879 		if (!(flags & MSG_PEEK)) {
2880 			UNIXCB(skb).consumed += chunk;
2881 
2882 			sk_peek_offset_bwd(sk, chunk);
2883 
2884 			if (UNIXCB(skb).fp) {
2885 				scm_stat_del(sk, skb);
2886 				unix_detach_fds(&scm, skb);
2887 			}
2888 
2889 			if (unix_skb_len(skb))
2890 				break;
2891 
2892 			skb_unlink(skb, &sk->sk_receive_queue);
2893 			consume_skb(skb);
2894 
2895 			if (scm.fp)
2896 				break;
2897 		} else {
2898 			/* It is questionable, see note in unix_dgram_recvmsg.
2899 			 */
2900 			if (UNIXCB(skb).fp)
2901 				unix_peek_fds(&scm, skb);
2902 
2903 			sk_peek_offset_fwd(sk, chunk);
2904 
2905 			if (UNIXCB(skb).fp)
2906 				break;
2907 
2908 			skip = 0;
2909 			last = skb;
2910 			last_len = skb->len;
2911 			unix_state_lock(sk);
2912 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2913 			if (skb)
2914 				goto again;
2915 			unix_state_unlock(sk);
2916 			break;
2917 		}
2918 	} while (size);
2919 
2920 	mutex_unlock(&u->iolock);
2921 	if (state->msg)
2922 		scm_recv_unix(sock, state->msg, &scm, flags);
2923 	else
2924 		scm_destroy(&scm);
2925 out:
2926 	return copied ? : err;
2927 }
2928 
2929 static int unix_stream_read_actor(struct sk_buff *skb,
2930 				  int skip, int chunk,
2931 				  struct unix_stream_read_state *state)
2932 {
2933 	int ret;
2934 
2935 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2936 				    state->msg, chunk);
2937 	return ret ?: chunk;
2938 }
2939 
2940 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2941 			  size_t size, int flags)
2942 {
2943 	struct unix_stream_read_state state = {
2944 		.recv_actor = unix_stream_read_actor,
2945 		.socket = sk->sk_socket,
2946 		.msg = msg,
2947 		.size = size,
2948 		.flags = flags
2949 	};
2950 
2951 	return unix_stream_read_generic(&state, true);
2952 }
2953 
2954 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2955 			       size_t size, int flags)
2956 {
2957 	struct unix_stream_read_state state = {
2958 		.recv_actor = unix_stream_read_actor,
2959 		.socket = sock,
2960 		.msg = msg,
2961 		.size = size,
2962 		.flags = flags
2963 	};
2964 
2965 #ifdef CONFIG_BPF_SYSCALL
2966 	struct sock *sk = sock->sk;
2967 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2968 
2969 	if (prot != &unix_stream_proto)
2970 		return prot->recvmsg(sk, msg, size, flags, NULL);
2971 #endif
2972 	return unix_stream_read_generic(&state, true);
2973 }
2974 
2975 static int unix_stream_splice_actor(struct sk_buff *skb,
2976 				    int skip, int chunk,
2977 				    struct unix_stream_read_state *state)
2978 {
2979 	return skb_splice_bits(skb, state->socket->sk,
2980 			       UNIXCB(skb).consumed + skip,
2981 			       state->pipe, chunk, state->splice_flags);
2982 }
2983 
2984 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2985 				       struct pipe_inode_info *pipe,
2986 				       size_t size, unsigned int flags)
2987 {
2988 	struct unix_stream_read_state state = {
2989 		.recv_actor = unix_stream_splice_actor,
2990 		.socket = sock,
2991 		.pipe = pipe,
2992 		.size = size,
2993 		.splice_flags = flags,
2994 	};
2995 
2996 	if (unlikely(*ppos))
2997 		return -ESPIPE;
2998 
2999 	if (sock->file->f_flags & O_NONBLOCK ||
3000 	    flags & SPLICE_F_NONBLOCK)
3001 		state.flags = MSG_DONTWAIT;
3002 
3003 	return unix_stream_read_generic(&state, false);
3004 }
3005 
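/* shutdown(2): record the local shutdown bits and, for connection
 * oriented sockets, mirror them onto the peer (RCV and SEND swapped) so
 * the other end sees EOF/EPIPE and its waiters are woken.
 */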
3006 static int unix_shutdown(struct socket *sock, int mode)
3007 {
3008 	struct sock *sk = sock->sk;
3009 	struct sock *other;
3010 
3011 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3012 		return -EINVAL;
3013 	/* This maps:
3014 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3015 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3016 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3017 	 */
3018 	++mode;
3019 
3020 	unix_state_lock(sk);
3021 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3022 	other = unix_peer(sk);
3023 	if (other)
3024 		sock_hold(other);
3025 	unix_state_unlock(sk);
3026 	sk->sk_state_change(sk);
3027 
3028 	if (other &&
3029 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3030 
3031 		int peer_mode = 0;
3032 		const struct proto *prot = READ_ONCE(other->sk_prot);
3033 
3034 		if (prot->unhash)
3035 			prot->unhash(other);
3036 		if (mode & RCV_SHUTDOWN)
3037 			peer_mode |= SEND_SHUTDOWN;
3038 		if (mode & SEND_SHUTDOWN)
3039 			peer_mode |= RCV_SHUTDOWN;
3040 		unix_state_lock(other);
3041 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3042 		unix_state_unlock(other);
3043 		other->sk_state_change(other);
3044 		if (peer_mode == SHUTDOWN_MASK)
3045 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3046 		else if (peer_mode & RCV_SHUTDOWN)
3047 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3048 	}
3049 	if (other)
3050 		sock_put(other);
3051 
3052 	return 0;
3053 }
3054 
3055 long unix_inq_len(struct sock *sk)
3056 {
3057 	struct sk_buff *skb;
3058 	long amount = 0;
3059 
3060 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3061 		return -EINVAL;
3062 
3063 	spin_lock(&sk->sk_receive_queue.lock);
3064 	if (sk->sk_type == SOCK_STREAM ||
3065 	    sk->sk_type == SOCK_SEQPACKET) {
3066 		skb_queue_walk(&sk->sk_receive_queue, skb)
3067 			amount += unix_skb_len(skb);
3068 	} else {
3069 		skb = skb_peek(&sk->sk_receive_queue);
3070 		if (skb)
3071 			amount = skb->len;
3072 	}
3073 	spin_unlock(&sk->sk_receive_queue.lock);
3074 
3075 	return amount;
3076 }
3077 EXPORT_SYMBOL_GPL(unix_inq_len);
3078 
3079 long unix_outq_len(struct sock *sk)
3080 {
3081 	return sk_wmem_alloc_get(sk);
3082 }
3083 EXPORT_SYMBOL_GPL(unix_outq_len);
3084 
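/* SIOCUNIXFILE: give a CAP_NET_ADMIN caller an O_PATH file descriptor
 * for the filesystem path this socket is bound to. Illustrative
 * userspace sketch (error handling omitted):
 *
 *	int pfd = ioctl(unix_fd, SIOCUNIXFILE);
 */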
3085 static int unix_open_file(struct sock *sk)
3086 {
3087 	struct path path;
3088 	struct file *f;
3089 	int fd;
3090 
3091 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3092 		return -EPERM;
3093 
3094 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3095 		return -ENOENT;
3096 
3097 	path = unix_sk(sk)->path;
3098 	if (!path.dentry)
3099 		return -ENOENT;
3100 
3101 	path_get(&path);
3102 
3103 	fd = get_unused_fd_flags(O_CLOEXEC);
3104 	if (fd < 0)
3105 		goto out;
3106 
3107 	f = dentry_open(&path, O_PATH, current_cred());
3108 	if (IS_ERR(f)) {
3109 		put_unused_fd(fd);
3110 		fd = PTR_ERR(f);
3111 		goto out;
3112 	}
3113 
3114 	fd_install(fd, f);
3115 out:
3116 	path_put(&path);
3117 
3118 	return fd;
3119 }
3120 
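/* ioctl(2) handlers: SIOCOUTQ/SIOCINQ report queued byte counts,
 * SIOCUNIXFILE opens the bound path, and SIOCATMARK (with OOB support
 * enabled) reports whether the read pointer is at the out-of-band mark.
 */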
3121 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3122 {
3123 	struct sock *sk = sock->sk;
3124 	long amount = 0;
3125 	int err;
3126 
3127 	switch (cmd) {
3128 	case SIOCOUTQ:
3129 		amount = unix_outq_len(sk);
3130 		err = put_user(amount, (int __user *)arg);
3131 		break;
3132 	case SIOCINQ:
3133 		amount = unix_inq_len(sk);
3134 		if (amount < 0)
3135 			err = amount;
3136 		else
3137 			err = put_user(amount, (int __user *)arg);
3138 		break;
3139 	case SIOCUNIXFILE:
3140 		err = unix_open_file(sk);
3141 		break;
3142 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3143 	case SIOCATMARK:
3144 		{
3145 			struct unix_sock *u = unix_sk(sk);
3146 			struct sk_buff *skb;
3147 			int answ = 0;
3148 
3149 			mutex_lock(&u->iolock);
3150 
3151 			skb = skb_peek(&sk->sk_receive_queue);
3152 			if (skb) {
3153 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3154 				struct sk_buff *next_skb;
3155 
3156 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3157 
3158 				if (skb == oob_skb ||
3159 				    (!unix_skb_len(skb) &&
3160 				     (!oob_skb || next_skb == oob_skb)))
3161 					answ = 1;
3162 			}
3163 
3164 			mutex_unlock(&u->iolock);
3165 
3166 			err = put_user(answ, (int __user *)arg);
3167 		}
3168 		break;
3169 #endif
3170 	default:
3171 		err = -ENOIOCTLCMD;
3172 		break;
3173 	}
3174 	return err;
3175 }
3176 
3177 #ifdef CONFIG_COMPAT
3178 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3179 {
3180 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3181 }
3182 #endif
3183 
3184 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3185 {
3186 	struct sock *sk = sock->sk;
3187 	unsigned char state;
3188 	__poll_t mask;
3189 	u8 shutdown;
3190 
3191 	sock_poll_wait(file, sock, wait);
3192 	mask = 0;
3193 	shutdown = READ_ONCE(sk->sk_shutdown);
3194 	state = READ_ONCE(sk->sk_state);
3195 
3196 	/* exceptional events? */
3197 	if (READ_ONCE(sk->sk_err))
3198 		mask |= EPOLLERR;
3199 	if (shutdown == SHUTDOWN_MASK)
3200 		mask |= EPOLLHUP;
3201 	if (shutdown & RCV_SHUTDOWN)
3202 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3203 
3204 	/* readable? */
3205 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3206 		mask |= EPOLLIN | EPOLLRDNORM;
3207 	if (sk_is_readable(sk))
3208 		mask |= EPOLLIN | EPOLLRDNORM;
3209 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3210 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3211 		mask |= EPOLLPRI;
3212 #endif
3213 
3214 	/* Connection-based sockets need to check for termination and startup */
3215 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3216 	    state == TCP_CLOSE)
3217 		mask |= EPOLLHUP;
3218 
3219 	/*
3220 	 * We also report writable when the other side has shut down the
3221 	 * connection. This prevents stuck sockets.
3222 	 */
3223 	if (unix_writable(sk, state))
3224 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3225 
3226 	return mask;
3227 }
3228 
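/* Datagram poll differs from unix_poll() in that writability also
 * depends on the peer: if the peer's receive queue is full, we register
 * on its peer_wait queue (unix_dgram_peer_wake_me()) so we are woken
 * when space becomes available instead of reporting EPOLLOUT.
 */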
3229 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3230 				    poll_table *wait)
3231 {
3232 	struct sock *sk = sock->sk, *other;
3233 	unsigned int writable;
3234 	unsigned char state;
3235 	__poll_t mask;
3236 	u8 shutdown;
3237 
3238 	sock_poll_wait(file, sock, wait);
3239 	mask = 0;
3240 	shutdown = READ_ONCE(sk->sk_shutdown);
3241 	state = READ_ONCE(sk->sk_state);
3242 
3243 	/* exceptional events? */
3244 	if (READ_ONCE(sk->sk_err) ||
3245 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3246 		mask |= EPOLLERR |
3247 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3248 
3249 	if (shutdown & RCV_SHUTDOWN)
3250 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3251 	if (shutdown == SHUTDOWN_MASK)
3252 		mask |= EPOLLHUP;
3253 
3254 	/* readable? */
3255 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3256 		mask |= EPOLLIN | EPOLLRDNORM;
3257 	if (sk_is_readable(sk))
3258 		mask |= EPOLLIN | EPOLLRDNORM;
3259 
3260 	/* Connection-based sockets need to check for termination and startup */
3261 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3262 		mask |= EPOLLHUP;
3263 
3264 	/* No write status requested, avoid expensive OUT tests. */
3265 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3266 		return mask;
3267 
3268 	writable = unix_writable(sk, state);
3269 	if (writable) {
3270 		unix_state_lock(sk);
3271 
3272 		other = unix_peer(sk);
3273 		if (other && unix_peer(other) != sk &&
3274 		    unix_recvq_full_lockless(other) &&
3275 		    unix_dgram_peer_wake_me(sk, other))
3276 			writable = 0;
3277 
3278 		unix_state_unlock(sk);
3279 	}
3280 
3281 	if (writable)
3282 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3283 	else
3284 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3285 
3286 	return mask;
3287 }
3288 
3289 #ifdef CONFIG_PROC_FS
3290 
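/* The /proc/net/unix iterator encodes its position as
 * (bucket << BUCKET_SPACE) | offset, where offset is the 1-based index
 * of the socket within the hash bucket; the macros below pack and
 * unpack that encoding.
 */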
3291 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3292 
3293 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3294 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3295 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3296 
3297 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3298 {
3299 	unsigned long offset = get_offset(*pos);
3300 	unsigned long bucket = get_bucket(*pos);
3301 	unsigned long count = 0;
3302 	struct sock *sk;
3303 
3304 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3305 	     sk; sk = sk_next(sk)) {
3306 		if (++count == offset)
3307 			break;
3308 	}
3309 
3310 	return sk;
3311 }
3312 
3313 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3314 {
3315 	unsigned long bucket = get_bucket(*pos);
3316 	struct net *net = seq_file_net(seq);
3317 	struct sock *sk;
3318 
3319 	while (bucket < UNIX_HASH_SIZE) {
3320 		spin_lock(&net->unx.table.locks[bucket]);
3321 
3322 		sk = unix_from_bucket(seq, pos);
3323 		if (sk)
3324 			return sk;
3325 
3326 		spin_unlock(&net->unx.table.locks[bucket]);
3327 
3328 		*pos = set_bucket_offset(++bucket, 1);
3329 	}
3330 
3331 	return NULL;
3332 }
3333 
3334 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3335 				  loff_t *pos)
3336 {
3337 	unsigned long bucket = get_bucket(*pos);
3338 
3339 	sk = sk_next(sk);
3340 	if (sk)
3341 		return sk;
3342 
3343 
3345 
3346 	*pos = set_bucket_offset(++bucket, 1);
3347 
3348 	return unix_get_first(seq, pos);
3349 }
3350 
3351 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3352 {
3353 	if (!*pos)
3354 		return SEQ_START_TOKEN;
3355 
3356 	return unix_get_first(seq, pos);
3357 }
3358 
3359 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3360 {
3361 	++*pos;
3362 
3363 	if (v == SEQ_START_TOKEN)
3364 		return unix_get_first(seq, pos);
3365 
3366 	return unix_get_next(seq, v, pos);
3367 }
3368 
3369 static void unix_seq_stop(struct seq_file *seq, void *v)
3370 {
3371 	struct sock *sk = v;
3372 
3373 	if (sk)
3374 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3375 }
3376 
3377 static int unix_seq_show(struct seq_file *seq, void *v)
3378 {
3379 
3380 	if (v == SEQ_START_TOKEN)
3381 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3382 			 "Inode Path\n");
3383 	else {
3384 		struct sock *s = v;
3385 		struct unix_sock *u = unix_sk(s);
3386 		unix_state_lock(s);
3387 
3388 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3389 			s,
3390 			refcount_read(&s->sk_refcnt),
3391 			0,
3392 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3393 			s->sk_type,
3394 			s->sk_socket ?
3395 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3396 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3397 			sock_i_ino(s));
3398 
3399 		if (u->addr) {	// under a hash table lock here
3400 			int i, len;
3401 			seq_putc(seq, ' ');
3402 
3403 			i = 0;
3404 			len = u->addr->len -
3405 				offsetof(struct sockaddr_un, sun_path);
3406 			if (u->addr->name->sun_path[0]) {
3407 				len--;
3408 			} else {
3409 				seq_putc(seq, '@');
3410 				i++;
3411 			}
3412 			for ( ; i < len; i++)
3413 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3414 					 '@');
3415 		}
3416 		unix_state_unlock(s);
3417 		seq_putc(seq, '\n');
3418 	}
3419 
3420 	return 0;
3421 }
3422 
3423 static const struct seq_operations unix_seq_ops = {
3424 	.start  = unix_seq_start,
3425 	.next   = unix_seq_next,
3426 	.stop   = unix_seq_stop,
3427 	.show   = unix_seq_show,
3428 };
3429 
3430 #ifdef CONFIG_BPF_SYSCALL
3431 struct bpf_unix_iter_state {
3432 	struct seq_net_private p;
3433 	unsigned int cur_sk;
3434 	unsigned int end_sk;
3435 	unsigned int max_sk;
3436 	struct sock **batch;
3437 	bool st_bucket_done;
3438 };
3439 
3440 struct bpf_iter__unix {
3441 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3442 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3443 	uid_t uid __aligned(8);
3444 };
3445 
3446 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3447 			      struct unix_sock *unix_sk, uid_t uid)
3448 {
3449 	struct bpf_iter__unix ctx;
3450 
3451 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3452 	ctx.meta = meta;
3453 	ctx.unix_sk = unix_sk;
3454 	ctx.uid = uid;
3455 	return bpf_iter_run_prog(prog, &ctx);
3456 }
3457 
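/* Take references on as many sockets from the current bucket as fit in
 * the preallocated batch (the bucket lock is dropped before returning)
 * and report how many the bucket actually held, so the caller can grow
 * the batch and retry if it did not get them all.
 */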
3458 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3460 {
3461 	struct bpf_unix_iter_state *iter = seq->private;
3462 	unsigned int expected = 1;
3463 	struct sock *sk;
3464 
3465 	sock_hold(start_sk);
3466 	iter->batch[iter->end_sk++] = start_sk;
3467 
3468 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3469 		if (iter->end_sk < iter->max_sk) {
3470 			sock_hold(sk);
3471 			iter->batch[iter->end_sk++] = sk;
3472 		}
3473 
3474 		expected++;
3475 	}
3476 
3477 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3478 
3479 	return expected;
3480 }
3481 
3482 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3483 {
3484 	while (iter->cur_sk < iter->end_sk)
3485 		sock_put(iter->batch[iter->cur_sk++]);
3486 }
3487 
3488 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3489 				       unsigned int new_batch_sz)
3490 {
3491 	struct sock **new_batch;
3492 
3493 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3494 			     GFP_USER | __GFP_NOWARN);
3495 	if (!new_batch)
3496 		return -ENOMEM;
3497 
3498 	bpf_iter_unix_put_batch(iter);
3499 	kvfree(iter->batch);
3500 	iter->batch = new_batch;
3501 	iter->max_sk = new_batch_sz;
3502 
3503 	return 0;
3504 }
3505 
3506 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3507 					loff_t *pos)
3508 {
3509 	struct bpf_unix_iter_state *iter = seq->private;
3510 	unsigned int expected;
3511 	bool resized = false;
3512 	struct sock *sk;
3513 
3514 	if (iter->st_bucket_done)
3515 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3516 
3517 again:
3518 	/* Get a new batch */
3519 	iter->cur_sk = 0;
3520 	iter->end_sk = 0;
3521 
3522 	sk = unix_get_first(seq, pos);
3523 	if (!sk)
3524 		return NULL; /* Done */
3525 
3526 	expected = bpf_iter_unix_hold_batch(seq, sk);
3527 
3528 	if (iter->end_sk == expected) {
3529 		iter->st_bucket_done = true;
3530 		return sk;
3531 	}
3532 
3533 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3534 		resized = true;
3535 		goto again;
3536 	}
3537 
3538 	return sk;
3539 }
3540 
3541 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3542 {
3543 	if (!*pos)
3544 		return SEQ_START_TOKEN;
3545 
3546 	/* bpf iter does not support lseek, so it always
3547 	 * continues from where it was stop()-ped.
3548 	 */
3549 	return bpf_iter_unix_batch(seq, pos);
3550 }
3551 
3552 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3553 {
3554 	struct bpf_unix_iter_state *iter = seq->private;
3555 	struct sock *sk;
3556 
3557 	/* Whenever seq_next() is called, the socket at iter->cur_sk is
3558 	 * done with seq_show(), so advance to the next sk in
3559 	 * the batch.
3560 	 */
3561 	if (iter->cur_sk < iter->end_sk)
3562 		sock_put(iter->batch[iter->cur_sk++]);
3563 
3564 	++*pos;
3565 
3566 	if (iter->cur_sk < iter->end_sk)
3567 		sk = iter->batch[iter->cur_sk];
3568 	else
3569 		sk = bpf_iter_unix_batch(seq, pos);
3570 
3571 	return sk;
3572 }
3573 
3574 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3575 {
3576 	struct bpf_iter_meta meta;
3577 	struct bpf_prog *prog;
3578 	struct sock *sk = v;
3579 	uid_t uid;
3580 	bool slow;
3581 	int ret;
3582 
3583 	if (v == SEQ_START_TOKEN)
3584 		return 0;
3585 
3586 	slow = lock_sock_fast(sk);
3587 
3588 	if (unlikely(sk_unhashed(sk))) {
3589 		ret = SEQ_SKIP;
3590 		goto unlock;
3591 	}
3592 
3593 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3594 	meta.seq = seq;
3595 	prog = bpf_iter_get_info(&meta, false);
3596 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3597 unlock:
3598 	unlock_sock_fast(sk, slow);
3599 	return ret;
3600 }
3601 
3602 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3603 {
3604 	struct bpf_unix_iter_state *iter = seq->private;
3605 	struct bpf_iter_meta meta;
3606 	struct bpf_prog *prog;
3607 
3608 	if (!v) {
3609 		meta.seq = seq;
3610 		prog = bpf_iter_get_info(&meta, true);
3611 		if (prog)
3612 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3613 	}
3614 
3615 	if (iter->cur_sk < iter->end_sk)
3616 		bpf_iter_unix_put_batch(iter);
3617 }
3618 
3619 static const struct seq_operations bpf_iter_unix_seq_ops = {
3620 	.start	= bpf_iter_unix_seq_start,
3621 	.next	= bpf_iter_unix_seq_next,
3622 	.stop	= bpf_iter_unix_seq_stop,
3623 	.show	= bpf_iter_unix_seq_show,
3624 };
3625 #endif
3626 #endif
3627 
3628 static const struct net_proto_family unix_family_ops = {
3629 	.family = PF_UNIX,
3630 	.create = unix_create,
3631 	.owner	= THIS_MODULE,
3632 };
3633 
3634 
3635 static int __net_init unix_net_init(struct net *net)
3636 {
3637 	int i;
3638 
3639 	net->unx.sysctl_max_dgram_qlen = 10;
3640 	if (unix_sysctl_register(net))
3641 		goto out;
3642 
3643 #ifdef CONFIG_PROC_FS
3644 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3645 			     sizeof(struct seq_net_private)))
3646 		goto err_sysctl;
3647 #endif
3648 
3649 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3650 					      sizeof(spinlock_t), GFP_KERNEL);
3651 	if (!net->unx.table.locks)
3652 		goto err_proc;
3653 
3654 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3655 						sizeof(struct hlist_head),
3656 						GFP_KERNEL);
3657 	if (!net->unx.table.buckets)
3658 		goto free_locks;
3659 
3660 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3661 		spin_lock_init(&net->unx.table.locks[i]);
3662 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3663 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3664 	}
3665 
3666 	return 0;
3667 
3668 free_locks:
3669 	kvfree(net->unx.table.locks);
3670 err_proc:
3671 #ifdef CONFIG_PROC_FS
3672 	remove_proc_entry("unix", net->proc_net);
3673 err_sysctl:
3674 #endif
3675 	unix_sysctl_unregister(net);
3676 out:
3677 	return -ENOMEM;
3678 }
3679 
3680 static void __net_exit unix_net_exit(struct net *net)
3681 {
3682 	kvfree(net->unx.table.buckets);
3683 	kvfree(net->unx.table.locks);
3684 	unix_sysctl_unregister(net);
3685 	remove_proc_entry("unix", net->proc_net);
3686 }
3687 
3688 static struct pernet_operations unix_net_ops = {
3689 	.init = unix_net_init,
3690 	.exit = unix_net_exit,
3691 };
3692 
3693 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3694 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3695 		     struct unix_sock *unix_sk, uid_t uid)
3696 
3697 #define INIT_BATCH_SZ 16
3698 
3699 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3700 {
3701 	struct bpf_unix_iter_state *iter = priv_data;
3702 	int err;
3703 
3704 	err = bpf_iter_init_seq_net(priv_data, aux);
3705 	if (err)
3706 		return err;
3707 
3708 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3709 	if (err) {
3710 		bpf_iter_fini_seq_net(priv_data);
3711 		return err;
3712 	}
3713 
3714 	return 0;
3715 }
3716 
3717 static void bpf_iter_fini_unix(void *priv_data)
3718 {
3719 	struct bpf_unix_iter_state *iter = priv_data;
3720 
3721 	bpf_iter_fini_seq_net(priv_data);
3722 	kvfree(iter->batch);
3723 }
3724 
3725 static const struct bpf_iter_seq_info unix_seq_info = {
3726 	.seq_ops		= &bpf_iter_unix_seq_ops,
3727 	.init_seq_private	= bpf_iter_init_unix,
3728 	.fini_seq_private	= bpf_iter_fini_unix,
3729 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3730 };
3731 
3732 static const struct bpf_func_proto *
3733 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3734 			     const struct bpf_prog *prog)
3735 {
3736 	switch (func_id) {
3737 	case BPF_FUNC_setsockopt:
3738 		return &bpf_sk_setsockopt_proto;
3739 	case BPF_FUNC_getsockopt:
3740 		return &bpf_sk_getsockopt_proto;
3741 	default:
3742 		return NULL;
3743 	}
3744 }
3745 
3746 static struct bpf_iter_reg unix_reg_info = {
3747 	.target			= "unix",
3748 	.ctx_arg_info_size	= 1,
3749 	.ctx_arg_info		= {
3750 		{ offsetof(struct bpf_iter__unix, unix_sk),
3751 		  PTR_TO_BTF_ID_OR_NULL },
3752 	},
3753 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3754 	.seq_info		= &unix_seq_info,
3755 };
3756 
3757 static void __init bpf_iter_register(void)
3758 {
3759 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3760 	if (bpf_iter_reg_target(&unix_reg_info))
3761 		pr_warn("Warning: could not register bpf iterator unix\n");
3762 }
3763 #endif
3764 
3765 static int __init af_unix_init(void)
3766 {
3767 	int i, rc = -1;
3768 
3769 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3770 
3771 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3772 		spin_lock_init(&bsd_socket_locks[i]);
3773 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3774 	}
3775 
3776 	rc = proto_register(&unix_dgram_proto, 1);
3777 	if (rc != 0) {
3778 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3779 		goto out;
3780 	}
3781 
3782 	rc = proto_register(&unix_stream_proto, 1);
3783 	if (rc != 0) {
3784 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3785 		proto_unregister(&unix_dgram_proto);
3786 		goto out;
3787 	}
3788 
3789 	sock_register(&unix_family_ops);
3790 	register_pernet_subsys(&unix_net_ops);
3791 	unix_bpf_build_proto();
3792 
3793 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3794 	bpf_iter_register();
3795 #endif
3796 
3797 out:
3798 	return rc;
3799 }
3800 
3801 /* Later than subsys_initcall() because we depend on stuff initialised there */
3802 fs_initcall(af_unix_init);
3803