xref: /linux/net/unix/af_unix.c (revision f2ad904e923f70a80f478febf001f88dfd65a64c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it avoids a huge amount
34  *					of hashed socks (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
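/* A minimal user-space sketch of binding to an abstract name, illustrating the
 * addressing described above (the name "\0example" and its length are
 * arbitrary examples, not anything this file defines):
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	memcpy(sun.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 *
 * The leading zero byte keeps the name out of the filesystem namespace, and
 * the name is the full byte sequence passed in (it is not NUL terminated).
 */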
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    hash table is protected with spinlock.
127  *    each socket state is protected by separate spinlock.
128  */
129 #ifdef CONFIG_PROVE_LOCKING
130 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
131 
132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133 				  const struct lockdep_map *b)
134 {
135 	return cmp_ptr(a, b);
136 }
137 
138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139 				  const struct lockdep_map *_b)
140 {
141 	const struct unix_sock *a, *b;
142 
143 	a = container_of(_a, struct unix_sock, lock.dep_map);
144 	b = container_of(_b, struct unix_sock, lock.dep_map);
145 
146 	if (a->sk.sk_state == TCP_LISTEN) {
147 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
148 		 *
149 		 *   1. a is TCP_LISTEN.
150 		 *   2. b is not a.
151 		 *   3. concurrent connect(b -> a) must fail.
152 		 *
153 		 * Except for 2. & 3., b's state can be any possible
154 		 * value due to a concurrent connect() or listen().
155 		 *
156 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
157 		 * be expressed as lock_cmp_fn.
158 		 */
159 		switch (b->sk.sk_state) {
160 		case TCP_CLOSE:
161 		case TCP_ESTABLISHED:
162 		case TCP_LISTEN:
163 			return -1;
164 		default:
165 			/* Invalid case. */
166 			return 0;
167 		}
168 	}
169 
170 	/* Should never happen.  Just to be symmetric. */
171 	if (b->sk.sk_state == TCP_LISTEN) {
172 		switch (b->sk.sk_state) {
173 		case TCP_CLOSE:
174 		case TCP_ESTABLISHED:
175 			return 1;
176 		default:
177 			return 0;
178 		}
179 	}
180 
181 	/* unix_state_double_lock(): ascending address order. */
182 	return cmp_ptr(a, b);
183 }
184 
185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186 				  const struct lockdep_map *_b)
187 {
188 	const struct sock *a, *b;
189 
190 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192 
193 	/* unix_collect_skb(): listener -> embryo order. */
194 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195 		return -1;
196 
197 	/* Should never happen.  Just to be symmetric. */
198 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199 		return 1;
200 
201 	return 0;
202 }
203 #endif
204 
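/* Hash-space layout used by the helpers below: unbound and filesystem-bound
 * (BSD) sockets hash into the first half of the buckets ([0, UNIX_HASH_MOD]),
 * while abstract addresses are offset by UNIX_HASH_MOD + 1 into the second
 * half.  BSD sockets are additionally linked into the separate
 * bsd_socket_buckets[] table, keyed by inode number, so that
 * unix_find_socket_byinode() does not have to scan the main table.
 */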
205 static unsigned int unix_unbound_hash(struct sock *sk)
206 {
207 	unsigned long hash = (unsigned long)sk;
208 
209 	hash ^= hash >> 16;
210 	hash ^= hash >> 8;
211 	hash ^= sk->sk_type;
212 
213 	return hash & UNIX_HASH_MOD;
214 }
215 
216 static unsigned int unix_bsd_hash(struct inode *i)
217 {
218 	return i->i_ino & UNIX_HASH_MOD;
219 }
220 
221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222 				       int addr_len, int type)
223 {
224 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
225 	unsigned int hash;
226 
227 	hash = (__force unsigned int)csum_fold(csum);
228 	hash ^= hash >> 8;
229 	hash ^= type;
230 
231 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232 }
233 
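/* Two buckets are always locked in ascending index order (hash1/hash2 are
 * sorted first), so concurrent double-lockers cannot deadlock on each other.
 * See unix_table_lock_cmp_fn() for the matching lockdep ordering.
 */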
234 static void unix_table_double_lock(struct net *net,
235 				   unsigned int hash1, unsigned int hash2)
236 {
237 	if (hash1 == hash2) {
238 		spin_lock(&net->unx.table.locks[hash1]);
239 		return;
240 	}
241 
242 	if (hash1 > hash2)
243 		swap(hash1, hash2);
244 
245 	spin_lock(&net->unx.table.locks[hash1]);
246 	spin_lock(&net->unx.table.locks[hash2]);
247 }
248 
249 static void unix_table_double_unlock(struct net *net,
250 				     unsigned int hash1, unsigned int hash2)
251 {
252 	if (hash1 == hash2) {
253 		spin_unlock(&net->unx.table.locks[hash1]);
254 		return;
255 	}
256 
257 	spin_unlock(&net->unx.table.locks[hash1]);
258 	spin_unlock(&net->unx.table.locks[hash2]);
259 }
260 
261 #ifdef CONFIG_SECURITY_NETWORK
262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263 {
264 	UNIXCB(skb).secid = scm->secid;
265 }
266 
267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268 {
269 	scm->secid = UNIXCB(skb).secid;
270 }
271 
272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273 {
274 	return (scm->secid == UNIXCB(skb).secid);
275 }
276 #else
277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278 { }
279 
280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281 { }
282 
283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284 {
285 	return true;
286 }
287 #endif /* CONFIG_SECURITY_NETWORK */
288 
289 static inline int unix_may_send(struct sock *sk, struct sock *osk)
290 {
291 	return !unix_peer(osk) || unix_peer(osk) == sk;
292 }
293 
294 static inline int unix_recvq_full_lockless(const struct sock *sk)
295 {
296 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
297 }
298 
299 struct sock *unix_peer_get(struct sock *s)
300 {
301 	struct sock *peer;
302 
303 	unix_state_lock(s);
304 	peer = unix_peer(s);
305 	if (peer)
306 		sock_hold(peer);
307 	unix_state_unlock(s);
308 	return peer;
309 }
310 EXPORT_SYMBOL_GPL(unix_peer_get);
311 
312 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
313 					     int addr_len)
314 {
315 	struct unix_address *addr;
316 
317 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
318 	if (!addr)
319 		return NULL;
320 
321 	refcount_set(&addr->refcnt, 1);
322 	addr->len = addr_len;
323 	memcpy(addr->name, sunaddr, addr_len);
324 
325 	return addr;
326 }
327 
328 static inline void unix_release_addr(struct unix_address *addr)
329 {
330 	if (refcount_dec_and_test(&addr->refcnt))
331 		kfree(addr);
332 }
333 
334 /*
335  *	Check unix socket name:
336  *		- should not be zero length.
337  *	        - if it does not start with a zero byte, it should be NUL terminated (FS object)
338  *		- if it starts with a zero byte, it is an abstract name.
339  */
340 
341 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
342 {
343 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
344 	    addr_len > sizeof(*sunaddr))
345 		return -EINVAL;
346 
347 	if (sunaddr->sun_family != AF_UNIX)
348 		return -EINVAL;
349 
350 	return 0;
351 }
352 
353 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
354 {
355 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
356 	short offset = offsetof(struct sockaddr_storage, __data);
357 
358 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
359 
360 	/* This may look like an off by one error but it is a bit more
361 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
362 	 * sun_path[108] doesn't as such exist.  However in kernel space
363 	 * we are guaranteed that it is a valid memory location in our
364 	 * kernel address buffer because syscall functions always pass
365 	 * a pointer of struct sockaddr_storage which has a bigger buffer
366 	 * than 108.  Also, we must terminate sun_path for strlen() in
367 	 * getname_kernel().
368 	 */
369 	addr->__data[addr_len - offset] = 0;
370 
371 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
372 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
373 	 * know the actual buffer.
374 	 */
375 	return strlen(addr->__data) + offset + 1;
376 }
377 
378 static void __unix_remove_socket(struct sock *sk)
379 {
380 	sk_del_node_init(sk);
381 }
382 
383 static void __unix_insert_socket(struct net *net, struct sock *sk)
384 {
385 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
386 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
387 }
388 
389 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
390 				 struct unix_address *addr, unsigned int hash)
391 {
392 	__unix_remove_socket(sk);
393 	smp_store_release(&unix_sk(sk)->addr, addr);
394 
395 	sk->sk_hash = hash;
396 	__unix_insert_socket(net, sk);
397 }
398 
399 static void unix_remove_socket(struct net *net, struct sock *sk)
400 {
401 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
402 	__unix_remove_socket(sk);
403 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
404 }
405 
406 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
407 {
408 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
409 	__unix_insert_socket(net, sk);
410 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
411 }
412 
413 static void unix_insert_bsd_socket(struct sock *sk)
414 {
415 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
416 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
417 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
418 }
419 
420 static void unix_remove_bsd_socket(struct sock *sk)
421 {
422 	if (!hlist_unhashed(&sk->sk_bind_node)) {
423 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
424 		__sk_del_bind_node(sk);
425 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
426 
427 		sk_node_init(&sk->sk_bind_node);
428 	}
429 }
430 
431 static struct sock *__unix_find_socket_byname(struct net *net,
432 					      struct sockaddr_un *sunname,
433 					      int len, unsigned int hash)
434 {
435 	struct sock *s;
436 
437 	sk_for_each(s, &net->unx.table.buckets[hash]) {
438 		struct unix_sock *u = unix_sk(s);
439 
440 		if (u->addr->len == len &&
441 		    !memcmp(u->addr->name, sunname, len))
442 			return s;
443 	}
444 	return NULL;
445 }
446 
447 static inline struct sock *unix_find_socket_byname(struct net *net,
448 						   struct sockaddr_un *sunname,
449 						   int len, unsigned int hash)
450 {
451 	struct sock *s;
452 
453 	spin_lock(&net->unx.table.locks[hash]);
454 	s = __unix_find_socket_byname(net, sunname, len, hash);
455 	if (s)
456 		sock_hold(s);
457 	spin_unlock(&net->unx.table.locks[hash]);
458 	return s;
459 }
460 
461 static struct sock *unix_find_socket_byinode(struct inode *i)
462 {
463 	unsigned int hash = unix_bsd_hash(i);
464 	struct sock *s;
465 
466 	spin_lock(&bsd_socket_locks[hash]);
467 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
468 		struct dentry *dentry = unix_sk(s)->path.dentry;
469 
470 		if (dentry && d_backing_inode(dentry) == i) {
471 			sock_hold(s);
472 			spin_unlock(&bsd_socket_locks[hash]);
473 			return s;
474 		}
475 	}
476 	spin_unlock(&bsd_socket_locks[hash]);
477 	return NULL;
478 }
479 
480 /* Support code for asymmetrically connected dgram sockets
481  *
482  * If a datagram socket is connected to a socket not itself connected
483  * to the first socket (eg, /dev/log), clients may only enqueue more
484  * messages if the present receive queue of the server socket is not
485  * "too large". This means there's a second writeability condition
486  * poll and sendmsg need to test. The dgram recv code will do a wake
487  * up on the peer_wait wait queue of a socket upon reception of a
488  * datagram which needs to be propagated to sleeping would-be writers
489  * since these might not have sent anything so far. This can't be
490  * accomplished via poll_wait because the lifetime of the server
491  * socket might be less than that of its clients if these break their
492  * association with it or if the server socket is closed while clients
493  * are still connected to it and there's no way to inform "a polling
494  * implementation" that it should let go of a certain wait queue.
495  *
496  * In order to propagate a wake up, a wait_queue_entry_t of the client
497  * socket is enqueued on the peer_wait queue of the server socket
498  * whose wake function does a wake_up on the ordinary client socket
499  * wait queue. This connection is established whenever a write (or
500  * poll for write) hits the flow control condition and is broken when the
501  * association to the server socket is dissolved or after a wake up
502  * was relayed.
503  */
504 
505 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
506 				      void *key)
507 {
508 	struct unix_sock *u;
509 	wait_queue_head_t *u_sleep;
510 
511 	u = container_of(q, struct unix_sock, peer_wake);
512 
513 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
514 			    q);
515 	u->peer_wake.private = NULL;
516 
517 	/* relaying can only happen while the wq still exists */
518 	u_sleep = sk_sleep(&u->sk);
519 	if (u_sleep)
520 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
521 
522 	return 0;
523 }
524 
525 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
526 {
527 	struct unix_sock *u, *u_other;
528 	int rc;
529 
530 	u = unix_sk(sk);
531 	u_other = unix_sk(other);
532 	rc = 0;
533 	spin_lock(&u_other->peer_wait.lock);
534 
535 	if (!u->peer_wake.private) {
536 		u->peer_wake.private = other;
537 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
538 
539 		rc = 1;
540 	}
541 
542 	spin_unlock(&u_other->peer_wait.lock);
543 	return rc;
544 }
545 
546 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
547 					    struct sock *other)
548 {
549 	struct unix_sock *u, *u_other;
550 
551 	u = unix_sk(sk);
552 	u_other = unix_sk(other);
553 	spin_lock(&u_other->peer_wait.lock);
554 
555 	if (u->peer_wake.private == other) {
556 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
557 		u->peer_wake.private = NULL;
558 	}
559 
560 	spin_unlock(&u_other->peer_wait.lock);
561 }
562 
563 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
564 						   struct sock *other)
565 {
566 	unix_dgram_peer_wake_disconnect(sk, other);
567 	wake_up_interruptible_poll(sk_sleep(sk),
568 				   EPOLLOUT |
569 				   EPOLLWRNORM |
570 				   EPOLLWRBAND);
571 }
572 
573 /* preconditions:
574  *	- unix_peer(sk) == other
575  *	- association is stable
576  */
577 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
578 {
579 	int connected;
580 
581 	connected = unix_dgram_peer_wake_connect(sk, other);
582 
583 	/* If other is SOCK_DEAD, we want to make sure we signal
584 	 * POLLOUT, such that a subsequent write() can get a
585 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
586 	 * to other and its queue is full, we will hang waiting for POLLOUT.
587 	 */
588 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
589 		return 1;
590 
591 	if (connected)
592 		unix_dgram_peer_wake_disconnect(sk, other);
593 
594 	return 0;
595 }
596 
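/* A socket counts as writable while at most a quarter of its send buffer is
 * consumed by in-flight skbs (wmem_alloc << 2 <= sk_sndbuf), and never while
 * it is listening.
 */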
597 static int unix_writable(const struct sock *sk, unsigned char state)
598 {
599 	return state != TCP_LISTEN &&
600 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
601 }
602 
603 static void unix_write_space(struct sock *sk)
604 {
605 	struct socket_wq *wq;
606 
607 	rcu_read_lock();
608 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
609 		wq = rcu_dereference(sk->sk_wq);
610 		if (skwq_has_sleeper(wq))
611 			wake_up_interruptible_sync_poll(&wq->wait,
612 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
613 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
614 	}
615 	rcu_read_unlock();
616 }
617 
618 /* When a dgram socket disconnects (or changes its peer), we clear its receive
619  * queue of packets that arrived from the previous peer. First, it allows us to do
620  * flow control based only on wmem_alloc; second, an sk connected to a peer
621  * may receive messages only from that peer. */
622 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
623 {
624 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
625 		skb_queue_purge_reason(&sk->sk_receive_queue,
626 				       SKB_DROP_REASON_UNIX_DISCONNECT);
627 
628 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
629 
630 		/* If one link of a bidirectional dgram pipe is disconnected,
631 		 * we signal an error. Messages are lost. Do not do this
632 		 * when the peer was not connected to us.
633 		 */
634 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
635 			WRITE_ONCE(other->sk_err, ECONNRESET);
636 			sk_error_report(other);
637 		}
638 	}
639 }
640 
641 static void unix_sock_destructor(struct sock *sk)
642 {
643 	struct unix_sock *u = unix_sk(sk);
644 
645 	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
646 
647 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
648 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
649 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
650 	if (!sock_flag(sk, SOCK_DEAD)) {
651 		pr_info("Attempt to release alive unix socket: %p\n", sk);
652 		return;
653 	}
654 
655 	if (u->addr)
656 		unix_release_addr(u->addr);
657 
658 	atomic_long_dec(&unix_nr_socks);
659 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
660 #ifdef UNIX_REFCNT_DEBUG
661 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
662 		atomic_long_read(&unix_nr_socks));
663 #endif
664 }
665 
666 static void unix_release_sock(struct sock *sk, int embrion)
667 {
668 	struct unix_sock *u = unix_sk(sk);
669 	struct sock *skpair;
670 	struct sk_buff *skb;
671 	struct path path;
672 	int state;
673 
674 	unix_remove_socket(sock_net(sk), sk);
675 	unix_remove_bsd_socket(sk);
676 
677 	/* Clear state */
678 	unix_state_lock(sk);
679 	sock_orphan(sk);
680 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
681 	path	     = u->path;
682 	u->path.dentry = NULL;
683 	u->path.mnt = NULL;
684 	state = sk->sk_state;
685 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
686 
687 	skpair = unix_peer(sk);
688 	unix_peer(sk) = NULL;
689 
690 	unix_state_unlock(sk);
691 
692 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
693 	u->oob_skb = NULL;
694 #endif
695 
696 	wake_up_interruptible_all(&u->peer_wait);
697 
698 	if (skpair != NULL) {
699 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
700 			unix_state_lock(skpair);
701 			/* No more writes */
702 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
703 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
704 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
705 			unix_state_unlock(skpair);
706 			skpair->sk_state_change(skpair);
707 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
708 		}
709 
710 		unix_dgram_peer_wake_disconnect(sk, skpair);
711 		sock_put(skpair); /* It may now die */
712 	}
713 
714 	/* Try to flush out this socket. Throw out buffers at least */
715 
716 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
717 		if (state == TCP_LISTEN)
718 			unix_release_sock(skb->sk, 1);
719 
720 		/* passed fds are erased in the kfree_skb hook */
721 		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
722 	}
723 
724 	if (path.dentry)
725 		path_put(&path);
726 
727 	sock_put(sk);
728 
729 	/* ---- Socket is dead now and most probably destroyed ---- */
730 
731 	/*
732 	 * Fixme: BSD difference: In BSD all sockets connected to us get
733 	 *	  ECONNRESET and we die on the spot. In Linux we behave
734 	 *	  like files and pipes do and wait for the last
735 	 *	  dereference.
736 	 *
737 	 * Can't we simply set sock->err?
738 	 *
739 	 *	  What does the above comment talk about? --ANK(980817)
740 	 */
741 
742 	if (READ_ONCE(unix_tot_inflight))
743 		unix_gc();		/* Garbage collect fds */
744 }
745 
746 static void init_peercred(struct sock *sk)
747 {
748 	sk->sk_peer_pid = get_pid(task_tgid(current));
749 	sk->sk_peer_cred = get_current_cred();
750 }
751 
752 static void update_peercred(struct sock *sk)
753 {
754 	const struct cred *old_cred;
755 	struct pid *old_pid;
756 
757 	spin_lock(&sk->sk_peer_lock);
758 	old_pid = sk->sk_peer_pid;
759 	old_cred = sk->sk_peer_cred;
760 	init_peercred(sk);
761 	spin_unlock(&sk->sk_peer_lock);
762 
763 	put_pid(old_pid);
764 	put_cred(old_cred);
765 }
766 
767 static void copy_peercred(struct sock *sk, struct sock *peersk)
768 {
769 	lockdep_assert_held(&unix_sk(peersk)->lock);
770 
771 	spin_lock(&sk->sk_peer_lock);
772 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
773 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
774 	spin_unlock(&sk->sk_peer_lock);
775 }
776 
777 static int unix_listen(struct socket *sock, int backlog)
778 {
779 	int err;
780 	struct sock *sk = sock->sk;
781 	struct unix_sock *u = unix_sk(sk);
782 
783 	err = -EOPNOTSUPP;
784 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
785 		goto out;	/* Only stream/seqpacket sockets accept */
786 	err = -EINVAL;
787 	if (!READ_ONCE(u->addr))
788 		goto out;	/* No listens on an unbound socket */
789 	unix_state_lock(sk);
790 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
791 		goto out_unlock;
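	/* Waking peer_wait lets connecters blocked in unix_wait_for_peer()
	 * recheck the receive queue against the newly raised backlog.
	 */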
792 	if (backlog > sk->sk_max_ack_backlog)
793 		wake_up_interruptible_all(&u->peer_wait);
794 	sk->sk_max_ack_backlog	= backlog;
795 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
796 
797 	/* set credentials so connect can copy them */
798 	update_peercred(sk);
799 	err = 0;
800 
801 out_unlock:
802 	unix_state_unlock(sk);
803 out:
804 	return err;
805 }
806 
807 static int unix_release(struct socket *);
808 static int unix_bind(struct socket *, struct sockaddr *, int);
809 static int unix_stream_connect(struct socket *, struct sockaddr *,
810 			       int addr_len, int flags);
811 static int unix_socketpair(struct socket *, struct socket *);
812 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
813 static int unix_getname(struct socket *, struct sockaddr *, int);
814 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
815 static __poll_t unix_dgram_poll(struct file *, struct socket *,
816 				    poll_table *);
817 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
818 #ifdef CONFIG_COMPAT
819 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
820 #endif
821 static int unix_shutdown(struct socket *, int);
822 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
823 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
824 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
825 				       struct pipe_inode_info *, size_t size,
826 				       unsigned int flags);
827 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
828 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
829 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
830 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
831 static int unix_dgram_connect(struct socket *, struct sockaddr *,
832 			      int, int);
833 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
834 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
835 				  int);
836 
837 #ifdef CONFIG_PROC_FS
838 static int unix_count_nr_fds(struct sock *sk)
839 {
840 	struct sk_buff *skb;
841 	struct unix_sock *u;
842 	int nr_fds = 0;
843 
844 	spin_lock(&sk->sk_receive_queue.lock);
845 	skb = skb_peek(&sk->sk_receive_queue);
846 	while (skb) {
847 		u = unix_sk(skb->sk);
848 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
849 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
850 	}
851 	spin_unlock(&sk->sk_receive_queue.lock);
852 
853 	return nr_fds;
854 }
855 
856 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
857 {
858 	struct sock *sk = sock->sk;
859 	unsigned char s_state;
860 	struct unix_sock *u;
861 	int nr_fds = 0;
862 
863 	if (sk) {
864 		s_state = READ_ONCE(sk->sk_state);
865 		u = unix_sk(sk);
866 
867 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
868 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
869 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
870 		 */
871 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
872 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
873 		else if (s_state == TCP_LISTEN)
874 			nr_fds = unix_count_nr_fds(sk);
875 
876 		seq_printf(m, "scm_fds: %u\n", nr_fds);
877 	}
878 }
879 #else
880 #define unix_show_fdinfo NULL
881 #endif
882 
883 static const struct proto_ops unix_stream_ops = {
884 	.family =	PF_UNIX,
885 	.owner =	THIS_MODULE,
886 	.release =	unix_release,
887 	.bind =		unix_bind,
888 	.connect =	unix_stream_connect,
889 	.socketpair =	unix_socketpair,
890 	.accept =	unix_accept,
891 	.getname =	unix_getname,
892 	.poll =		unix_poll,
893 	.ioctl =	unix_ioctl,
894 #ifdef CONFIG_COMPAT
895 	.compat_ioctl =	unix_compat_ioctl,
896 #endif
897 	.listen =	unix_listen,
898 	.shutdown =	unix_shutdown,
899 	.sendmsg =	unix_stream_sendmsg,
900 	.recvmsg =	unix_stream_recvmsg,
901 	.read_skb =	unix_stream_read_skb,
902 	.mmap =		sock_no_mmap,
903 	.splice_read =	unix_stream_splice_read,
904 	.set_peek_off =	sk_set_peek_off,
905 	.show_fdinfo =	unix_show_fdinfo,
906 };
907 
908 static const struct proto_ops unix_dgram_ops = {
909 	.family =	PF_UNIX,
910 	.owner =	THIS_MODULE,
911 	.release =	unix_release,
912 	.bind =		unix_bind,
913 	.connect =	unix_dgram_connect,
914 	.socketpair =	unix_socketpair,
915 	.accept =	sock_no_accept,
916 	.getname =	unix_getname,
917 	.poll =		unix_dgram_poll,
918 	.ioctl =	unix_ioctl,
919 #ifdef CONFIG_COMPAT
920 	.compat_ioctl =	unix_compat_ioctl,
921 #endif
922 	.listen =	sock_no_listen,
923 	.shutdown =	unix_shutdown,
924 	.sendmsg =	unix_dgram_sendmsg,
925 	.read_skb =	unix_read_skb,
926 	.recvmsg =	unix_dgram_recvmsg,
927 	.mmap =		sock_no_mmap,
928 	.set_peek_off =	sk_set_peek_off,
929 	.show_fdinfo =	unix_show_fdinfo,
930 };
931 
932 static const struct proto_ops unix_seqpacket_ops = {
933 	.family =	PF_UNIX,
934 	.owner =	THIS_MODULE,
935 	.release =	unix_release,
936 	.bind =		unix_bind,
937 	.connect =	unix_stream_connect,
938 	.socketpair =	unix_socketpair,
939 	.accept =	unix_accept,
940 	.getname =	unix_getname,
941 	.poll =		unix_dgram_poll,
942 	.ioctl =	unix_ioctl,
943 #ifdef CONFIG_COMPAT
944 	.compat_ioctl =	unix_compat_ioctl,
945 #endif
946 	.listen =	unix_listen,
947 	.shutdown =	unix_shutdown,
948 	.sendmsg =	unix_seqpacket_sendmsg,
949 	.recvmsg =	unix_seqpacket_recvmsg,
950 	.mmap =		sock_no_mmap,
951 	.set_peek_off =	sk_set_peek_off,
952 	.show_fdinfo =	unix_show_fdinfo,
953 };
954 
955 static void unix_close(struct sock *sk, long timeout)
956 {
957 	/* Nothing to do here, unix socket does not need a ->close().
958 	 * This is merely for sockmap.
959 	 */
960 }
961 
962 static void unix_unhash(struct sock *sk)
963 {
964 	/* Nothing to do here, unix socket does not need a ->unhash().
965 	 * This is merely for sockmap.
966 	 */
967 }
968 
969 static bool unix_bpf_bypass_getsockopt(int level, int optname)
970 {
971 	if (level == SOL_SOCKET) {
972 		switch (optname) {
973 		case SO_PEERPIDFD:
974 			return true;
975 		default:
976 			return false;
977 		}
978 	}
979 
980 	return false;
981 }
982 
983 struct proto unix_dgram_proto = {
984 	.name			= "UNIX",
985 	.owner			= THIS_MODULE,
986 	.obj_size		= sizeof(struct unix_sock),
987 	.close			= unix_close,
988 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
989 #ifdef CONFIG_BPF_SYSCALL
990 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
991 #endif
992 };
993 
994 struct proto unix_stream_proto = {
995 	.name			= "UNIX-STREAM",
996 	.owner			= THIS_MODULE,
997 	.obj_size		= sizeof(struct unix_sock),
998 	.close			= unix_close,
999 	.unhash			= unix_unhash,
1000 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1001 #ifdef CONFIG_BPF_SYSCALL
1002 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1003 #endif
1004 };
1005 
1006 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1007 {
1008 	struct unix_sock *u;
1009 	struct sock *sk;
1010 	int err;
1011 
1012 	atomic_long_inc(&unix_nr_socks);
1013 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1014 		err = -ENFILE;
1015 		goto err;
1016 	}
1017 
1018 	if (type == SOCK_STREAM)
1019 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1020 	else /* dgram and seqpacket */
1021 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1022 
1023 	if (!sk) {
1024 		err = -ENOMEM;
1025 		goto err;
1026 	}
1027 
1028 	sock_init_data(sock, sk);
1029 
1030 	sk->sk_hash		= unix_unbound_hash(sk);
1031 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1032 	sk->sk_write_space	= unix_write_space;
1033 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1034 	sk->sk_destruct		= unix_sock_destructor;
1035 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1036 
1037 	u = unix_sk(sk);
1038 	u->listener = NULL;
1039 	u->vertex = NULL;
1040 	u->path.dentry = NULL;
1041 	u->path.mnt = NULL;
1042 	spin_lock_init(&u->lock);
1043 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1044 	mutex_init(&u->iolock); /* single task reading lock */
1045 	mutex_init(&u->bindlock); /* single task binding lock */
1046 	init_waitqueue_head(&u->peer_wait);
1047 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1048 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1049 	unix_insert_unbound_socket(net, sk);
1050 
1051 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1052 
1053 	return sk;
1054 
1055 err:
1056 	atomic_long_dec(&unix_nr_socks);
1057 	return ERR_PTR(err);
1058 }
1059 
1060 static int unix_create(struct net *net, struct socket *sock, int protocol,
1061 		       int kern)
1062 {
1063 	struct sock *sk;
1064 
1065 	if (protocol && protocol != PF_UNIX)
1066 		return -EPROTONOSUPPORT;
1067 
1068 	sock->state = SS_UNCONNECTED;
1069 
1070 	switch (sock->type) {
1071 	case SOCK_STREAM:
1072 		sock->ops = &unix_stream_ops;
1073 		break;
1074 		/*
1075 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1076 		 *	nothing uses it.
1077 		 */
1078 	case SOCK_RAW:
1079 		sock->type = SOCK_DGRAM;
1080 		fallthrough;
1081 	case SOCK_DGRAM:
1082 		sock->ops = &unix_dgram_ops;
1083 		break;
1084 	case SOCK_SEQPACKET:
1085 		sock->ops = &unix_seqpacket_ops;
1086 		break;
1087 	default:
1088 		return -ESOCKTNOSUPPORT;
1089 	}
1090 
1091 	sk = unix_create1(net, sock, kern, sock->type);
1092 	if (IS_ERR(sk))
1093 		return PTR_ERR(sk);
1094 
1095 	return 0;
1096 }
1097 
1098 static int unix_release(struct socket *sock)
1099 {
1100 	struct sock *sk = sock->sk;
1101 
1102 	if (!sk)
1103 		return 0;
1104 
1105 	sk->sk_prot->close(sk, 0);
1106 	unix_release_sock(sk, 0);
1107 	sock->sk = NULL;
1108 
1109 	return 0;
1110 }
1111 
1112 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1113 				  int type)
1114 {
1115 	struct inode *inode;
1116 	struct path path;
1117 	struct sock *sk;
1118 	int err;
1119 
1120 	unix_mkname_bsd(sunaddr, addr_len);
1121 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1122 	if (err)
1123 		goto fail;
1124 
1125 	err = path_permission(&path, MAY_WRITE);
1126 	if (err)
1127 		goto path_put;
1128 
1129 	err = -ECONNREFUSED;
1130 	inode = d_backing_inode(path.dentry);
1131 	if (!S_ISSOCK(inode->i_mode))
1132 		goto path_put;
1133 
1134 	sk = unix_find_socket_byinode(inode);
1135 	if (!sk)
1136 		goto path_put;
1137 
1138 	err = -EPROTOTYPE;
1139 	if (sk->sk_type == type)
1140 		touch_atime(&path);
1141 	else
1142 		goto sock_put;
1143 
1144 	path_put(&path);
1145 
1146 	return sk;
1147 
1148 sock_put:
1149 	sock_put(sk);
1150 path_put:
1151 	path_put(&path);
1152 fail:
1153 	return ERR_PTR(err);
1154 }
1155 
1156 static struct sock *unix_find_abstract(struct net *net,
1157 				       struct sockaddr_un *sunaddr,
1158 				       int addr_len, int type)
1159 {
1160 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1161 	struct dentry *dentry;
1162 	struct sock *sk;
1163 
1164 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1165 	if (!sk)
1166 		return ERR_PTR(-ECONNREFUSED);
1167 
1168 	dentry = unix_sk(sk)->path.dentry;
1169 	if (dentry)
1170 		touch_atime(&unix_sk(sk)->path);
1171 
1172 	return sk;
1173 }
1174 
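/* Dispatch on the first byte of sun_path: a non-zero byte means a filesystem
 * (BSD) path looked up via the VFS, a zero byte means an abstract name looked
 * up in the hash table.
 */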
1175 static struct sock *unix_find_other(struct net *net,
1176 				    struct sockaddr_un *sunaddr,
1177 				    int addr_len, int type)
1178 {
1179 	struct sock *sk;
1180 
1181 	if (sunaddr->sun_path[0])
1182 		sk = unix_find_bsd(sunaddr, addr_len, type);
1183 	else
1184 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1185 
1186 	return sk;
1187 }
1188 
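/* Autobind assigns an abstract address of the form "\0XXXXX" (a leading zero
 * byte followed by five hex digits), retrying until an unused name is found
 * or the whole 0xFFFFF name space has been walked.
 */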
1189 static int unix_autobind(struct sock *sk)
1190 {
1191 	struct unix_sock *u = unix_sk(sk);
1192 	unsigned int new_hash, old_hash;
1193 	struct net *net = sock_net(sk);
1194 	struct unix_address *addr;
1195 	u32 lastnum, ordernum;
1196 	int err;
1197 
1198 	err = mutex_lock_interruptible(&u->bindlock);
1199 	if (err)
1200 		return err;
1201 
1202 	if (u->addr)
1203 		goto out;
1204 
1205 	err = -ENOMEM;
1206 	addr = kzalloc(sizeof(*addr) +
1207 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1208 	if (!addr)
1209 		goto out;
1210 
1211 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1212 	addr->name->sun_family = AF_UNIX;
1213 	refcount_set(&addr->refcnt, 1);
1214 
1215 	old_hash = sk->sk_hash;
1216 	ordernum = get_random_u32();
1217 	lastnum = ordernum & 0xFFFFF;
1218 retry:
1219 	ordernum = (ordernum + 1) & 0xFFFFF;
1220 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1221 
1222 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1223 	unix_table_double_lock(net, old_hash, new_hash);
1224 
1225 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1226 		unix_table_double_unlock(net, old_hash, new_hash);
1227 
1228 		/* __unix_find_socket_byname() may take a long time if many names
1229 		 * are already in use.
1230 		 */
1231 		cond_resched();
1232 
1233 		if (ordernum == lastnum) {
1234 			/* Give up if all names seem to be in use. */
1235 			err = -ENOSPC;
1236 			unix_release_addr(addr);
1237 			goto out;
1238 		}
1239 
1240 		goto retry;
1241 	}
1242 
1243 	__unix_set_addr_hash(net, sk, addr, new_hash);
1244 	unix_table_double_unlock(net, old_hash, new_hash);
1245 	err = 0;
1246 
1247 out:	mutex_unlock(&u->bindlock);
1248 	return err;
1249 }
1250 
1251 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1252 			 int addr_len)
1253 {
1254 	umode_t mode = S_IFSOCK |
1255 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1256 	struct unix_sock *u = unix_sk(sk);
1257 	unsigned int new_hash, old_hash;
1258 	struct net *net = sock_net(sk);
1259 	struct mnt_idmap *idmap;
1260 	struct unix_address *addr;
1261 	struct dentry *dentry;
1262 	struct path parent;
1263 	int err;
1264 
1265 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1266 	addr = unix_create_addr(sunaddr, addr_len);
1267 	if (!addr)
1268 		return -ENOMEM;
1269 
1270 	/*
1271 	 * Get the parent directory, calculate the hash for last
1272 	 * component.
1273 	 */
1274 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1275 	if (IS_ERR(dentry)) {
1276 		err = PTR_ERR(dentry);
1277 		goto out;
1278 	}
1279 
1280 	/*
1281 	 * All right, let's create it.
1282 	 */
1283 	idmap = mnt_idmap(parent.mnt);
1284 	err = security_path_mknod(&parent, dentry, mode, 0);
1285 	if (!err)
1286 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1287 	if (err)
1288 		goto out_path;
1289 	err = mutex_lock_interruptible(&u->bindlock);
1290 	if (err)
1291 		goto out_unlink;
1292 	if (u->addr)
1293 		goto out_unlock;
1294 
1295 	old_hash = sk->sk_hash;
1296 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1297 	unix_table_double_lock(net, old_hash, new_hash);
1298 	u->path.mnt = mntget(parent.mnt);
1299 	u->path.dentry = dget(dentry);
1300 	__unix_set_addr_hash(net, sk, addr, new_hash);
1301 	unix_table_double_unlock(net, old_hash, new_hash);
1302 	unix_insert_bsd_socket(sk);
1303 	mutex_unlock(&u->bindlock);
1304 	done_path_create(&parent, dentry);
1305 	return 0;
1306 
1307 out_unlock:
1308 	mutex_unlock(&u->bindlock);
1309 	err = -EINVAL;
1310 out_unlink:
1311 	/* failed after successful mknod?  unlink what we'd created... */
1312 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1313 out_path:
1314 	done_path_create(&parent, dentry);
1315 out:
1316 	unix_release_addr(addr);
1317 	return err == -EEXIST ? -EADDRINUSE : err;
1318 }
1319 
1320 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1321 			      int addr_len)
1322 {
1323 	struct unix_sock *u = unix_sk(sk);
1324 	unsigned int new_hash, old_hash;
1325 	struct net *net = sock_net(sk);
1326 	struct unix_address *addr;
1327 	int err;
1328 
1329 	addr = unix_create_addr(sunaddr, addr_len);
1330 	if (!addr)
1331 		return -ENOMEM;
1332 
1333 	err = mutex_lock_interruptible(&u->bindlock);
1334 	if (err)
1335 		goto out;
1336 
1337 	if (u->addr) {
1338 		err = -EINVAL;
1339 		goto out_mutex;
1340 	}
1341 
1342 	old_hash = sk->sk_hash;
1343 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1344 	unix_table_double_lock(net, old_hash, new_hash);
1345 
1346 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1347 		goto out_spin;
1348 
1349 	__unix_set_addr_hash(net, sk, addr, new_hash);
1350 	unix_table_double_unlock(net, old_hash, new_hash);
1351 	mutex_unlock(&u->bindlock);
1352 	return 0;
1353 
1354 out_spin:
1355 	unix_table_double_unlock(net, old_hash, new_hash);
1356 	err = -EADDRINUSE;
1357 out_mutex:
1358 	mutex_unlock(&u->bindlock);
1359 out:
1360 	unix_release_addr(addr);
1361 	return err;
1362 }
1363 
1364 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1365 {
1366 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1367 	struct sock *sk = sock->sk;
1368 	int err;
1369 
1370 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1371 	    sunaddr->sun_family == AF_UNIX)
1372 		return unix_autobind(sk);
1373 
1374 	err = unix_validate_addr(sunaddr, addr_len);
1375 	if (err)
1376 		return err;
1377 
1378 	if (sunaddr->sun_path[0])
1379 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1380 	else
1381 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1382 
1383 	return err;
1384 }
1385 
1386 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1387 {
1388 	if (unlikely(sk1 == sk2) || !sk2) {
1389 		unix_state_lock(sk1);
1390 		return;
1391 	}
1392 
1393 	if (sk1 > sk2)
1394 		swap(sk1, sk2);
1395 
1396 	unix_state_lock(sk1);
1397 	unix_state_lock(sk2);
1398 }
1399 
1400 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1401 {
1402 	if (unlikely(sk1 == sk2) || !sk2) {
1403 		unix_state_unlock(sk1);
1404 		return;
1405 	}
1406 	unix_state_unlock(sk1);
1407 	unix_state_unlock(sk2);
1408 }
1409 
1410 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1411 			      int alen, int flags)
1412 {
1413 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1414 	struct sock *sk = sock->sk;
1415 	struct sock *other;
1416 	int err;
1417 
1418 	err = -EINVAL;
1419 	if (alen < offsetofend(struct sockaddr, sa_family))
1420 		goto out;
1421 
1422 	if (addr->sa_family != AF_UNSPEC) {
1423 		err = unix_validate_addr(sunaddr, alen);
1424 		if (err)
1425 			goto out;
1426 
1427 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1428 		if (err)
1429 			goto out;
1430 
1431 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1432 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1433 		    !READ_ONCE(unix_sk(sk)->addr)) {
1434 			err = unix_autobind(sk);
1435 			if (err)
1436 				goto out;
1437 		}
1438 
1439 restart:
1440 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1441 		if (IS_ERR(other)) {
1442 			err = PTR_ERR(other);
1443 			goto out;
1444 		}
1445 
1446 		unix_state_double_lock(sk, other);
1447 
1448 		/* Apparently VFS overslept socket death. Retry. */
1449 		if (sock_flag(other, SOCK_DEAD)) {
1450 			unix_state_double_unlock(sk, other);
1451 			sock_put(other);
1452 			goto restart;
1453 		}
1454 
1455 		err = -EPERM;
1456 		if (!unix_may_send(sk, other))
1457 			goto out_unlock;
1458 
1459 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1460 		if (err)
1461 			goto out_unlock;
1462 
1463 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1464 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1465 	} else {
1466 		/*
1467 		 *	1003.1g breaking connected state with AF_UNSPEC
1468 		 */
1469 		other = NULL;
1470 		unix_state_double_lock(sk, other);
1471 	}
1472 
1473 	/*
1474 	 * If it was connected, reconnect.
1475 	 */
1476 	if (unix_peer(sk)) {
1477 		struct sock *old_peer = unix_peer(sk);
1478 
1479 		unix_peer(sk) = other;
1480 		if (!other)
1481 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1482 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1483 
1484 		unix_state_double_unlock(sk, other);
1485 
1486 		if (other != old_peer) {
1487 			unix_dgram_disconnected(sk, old_peer);
1488 
1489 			unix_state_lock(old_peer);
1490 			if (!unix_peer(old_peer))
1491 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1492 			unix_state_unlock(old_peer);
1493 		}
1494 
1495 		sock_put(old_peer);
1496 	} else {
1497 		unix_peer(sk) = other;
1498 		unix_state_double_unlock(sk, other);
1499 	}
1500 
1501 	return 0;
1502 
1503 out_unlock:
1504 	unix_state_double_unlock(sk, other);
1505 	sock_put(other);
1506 out:
1507 	return err;
1508 }
1509 
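/* Called with other's unix_state_lock held (and released here, see the
 * __releases annotation): park on other's peer_wait queue until its receive
 * queue has room again, the timeout expires, or a signal arrives.
 */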
1510 static long unix_wait_for_peer(struct sock *other, long timeo)
1511 	__releases(&unix_sk(other)->lock)
1512 {
1513 	struct unix_sock *u = unix_sk(other);
1514 	int sched;
1515 	DEFINE_WAIT(wait);
1516 
1517 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1518 
1519 	sched = !sock_flag(other, SOCK_DEAD) &&
1520 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1521 		unix_recvq_full_lockless(other);
1522 
1523 	unix_state_unlock(other);
1524 
1525 	if (sched)
1526 		timeo = schedule_timeout(timeo);
1527 
1528 	finish_wait(&u->peer_wait, &wait);
1529 	return timeo;
1530 }
1531 
1532 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1533 			       int addr_len, int flags)
1534 {
1535 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1536 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1537 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1538 	struct net *net = sock_net(sk);
1539 	struct sk_buff *skb = NULL;
1540 	unsigned char state;
1541 	long timeo;
1542 	int err;
1543 
1544 	err = unix_validate_addr(sunaddr, addr_len);
1545 	if (err)
1546 		goto out;
1547 
1548 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1549 	if (err)
1550 		goto out;
1551 
1552 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1553 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1554 	    !READ_ONCE(u->addr)) {
1555 		err = unix_autobind(sk);
1556 		if (err)
1557 			goto out;
1558 	}
1559 
1560 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1561 
1562 	/* First of all allocate resources.
1563 	 * If we do it after the state is locked,
1564 	 * we will have to recheck everything again in any case.
1565 	 */
1566 
1567 	/* create new sock for complete connection */
1568 	newsk = unix_create1(net, NULL, 0, sock->type);
1569 	if (IS_ERR(newsk)) {
1570 		err = PTR_ERR(newsk);
1571 		goto out;
1572 	}
1573 
1574 	/* Allocate skb for sending to listening sock */
1575 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1576 	if (!skb) {
1577 		err = -ENOMEM;
1578 		goto out_free_sk;
1579 	}
1580 
1581 restart:
1582 	/*  Find listening sock. */
1583 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1584 	if (IS_ERR(other)) {
1585 		err = PTR_ERR(other);
1586 		goto out_free_skb;
1587 	}
1588 
1589 	unix_state_lock(other);
1590 
1591 	/* Apparently VFS overslept socket death. Retry. */
1592 	if (sock_flag(other, SOCK_DEAD)) {
1593 		unix_state_unlock(other);
1594 		sock_put(other);
1595 		goto restart;
1596 	}
1597 
1598 	if (other->sk_state != TCP_LISTEN ||
1599 	    other->sk_shutdown & RCV_SHUTDOWN) {
1600 		err = -ECONNREFUSED;
1601 		goto out_unlock;
1602 	}
1603 
1604 	if (unix_recvq_full_lockless(other)) {
1605 		if (!timeo) {
1606 			err = -EAGAIN;
1607 			goto out_unlock;
1608 		}
1609 
1610 		timeo = unix_wait_for_peer(other, timeo);
1611 		sock_put(other);
1612 
1613 		err = sock_intr_errno(timeo);
1614 		if (signal_pending(current))
1615 			goto out_free_skb;
1616 
1617 		goto restart;
1618 	}
1619 
1620 	/* Self connect and simultaneous connect are eliminated
1621 	 * by rejecting a TCP_LISTEN socket to avoid deadlock.
1622 	 */
1623 	state = READ_ONCE(sk->sk_state);
1624 	if (unlikely(state != TCP_CLOSE)) {
1625 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1626 		goto out_unlock;
1627 	}
1628 
1629 	unix_state_lock(sk);
1630 
1631 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1632 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1633 		unix_state_unlock(sk);
1634 		goto out_unlock;
1635 	}
1636 
1637 	err = security_unix_stream_connect(sk, other, newsk);
1638 	if (err) {
1639 		unix_state_unlock(sk);
1640 		goto out_unlock;
1641 	}
1642 
1643 	/* The way is open! Quickly set all the necessary fields... */
1644 
1645 	sock_hold(sk);
1646 	unix_peer(newsk)	= sk;
1647 	newsk->sk_state		= TCP_ESTABLISHED;
1648 	newsk->sk_type		= sk->sk_type;
1649 	init_peercred(newsk);
1650 	newu = unix_sk(newsk);
1651 	newu->listener = other;
1652 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1653 	otheru = unix_sk(other);
1654 
1655 	/* copy address information from listening to new sock
1656 	 *
1657 	 * The contents of *(otheru->addr) and otheru->path
1658 	 * are seen fully set up here, since we have found
1659 	 * otheru in hash under its lock.  Insertion into the
1660 	 * hash chain we'd found it in had been done in an
1661 	 * earlier critical area protected by the chain's lock,
1662 	 * the same one where we'd set *(otheru->addr) contents,
1663 	 * as well as otheru->path and otheru->addr itself.
1664 	 *
1665 	 * Using smp_store_release() here to set newu->addr
1666 	 * is enough to make those stores, as well as stores
1667 	 * to newu->path visible to anyone who gets newu->addr
1668 	 * by smp_load_acquire().  IOW, the same guarantees
1669 	 * as for unix_sock instances bound in unix_bind() or
1670 	 * in unix_autobind().
1671 	 */
1672 	if (otheru->path.dentry) {
1673 		path_get(&otheru->path);
1674 		newu->path = otheru->path;
1675 	}
1676 	refcount_inc(&otheru->addr->refcnt);
1677 	smp_store_release(&newu->addr, otheru->addr);
1678 
1679 	/* Set credentials */
1680 	copy_peercred(sk, other);
1681 
1682 	sock->state	= SS_CONNECTED;
1683 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1684 	sock_hold(newsk);
1685 
1686 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1687 	unix_peer(sk)	= newsk;
1688 
1689 	unix_state_unlock(sk);
1690 
1691 	/* take ten and send info to listening sock */
1692 	spin_lock(&other->sk_receive_queue.lock);
1693 	__skb_queue_tail(&other->sk_receive_queue, skb);
1694 	spin_unlock(&other->sk_receive_queue.lock);
1695 	unix_state_unlock(other);
1696 	other->sk_data_ready(other);
1697 	sock_put(other);
1698 	return 0;
1699 
1700 out_unlock:
1701 	unix_state_unlock(other);
1702 	sock_put(other);
1703 out_free_skb:
1704 	consume_skb(skb);
1705 out_free_sk:
1706 	unix_release_sock(newsk, 0);
1707 out:
1708 	return err;
1709 }
1710 
1711 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1712 {
1713 	struct sock *ska = socka->sk, *skb = sockb->sk;
1714 
1715 	/* Join our sockets back to back */
1716 	sock_hold(ska);
1717 	sock_hold(skb);
1718 	unix_peer(ska) = skb;
1719 	unix_peer(skb) = ska;
1720 	init_peercred(ska);
1721 	init_peercred(skb);
1722 
1723 	ska->sk_state = TCP_ESTABLISHED;
1724 	skb->sk_state = TCP_ESTABLISHED;
1725 	socka->state  = SS_CONNECTED;
1726 	sockb->state  = SS_CONNECTED;
1727 	return 0;
1728 }
1729 
1730 static void unix_sock_inherit_flags(const struct socket *old,
1731 				    struct socket *new)
1732 {
1733 	if (test_bit(SOCK_PASSCRED, &old->flags))
1734 		set_bit(SOCK_PASSCRED, &new->flags);
1735 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1736 		set_bit(SOCK_PASSPIDFD, &new->flags);
1737 	if (test_bit(SOCK_PASSSEC, &old->flags))
1738 		set_bit(SOCK_PASSSEC, &new->flags);
1739 }
1740 
1741 static int unix_accept(struct socket *sock, struct socket *newsock,
1742 		       struct proto_accept_arg *arg)
1743 {
1744 	struct sock *sk = sock->sk;
1745 	struct sk_buff *skb;
1746 	struct sock *tsk;
1747 
1748 	arg->err = -EOPNOTSUPP;
1749 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1750 		goto out;
1751 
1752 	arg->err = -EINVAL;
1753 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1754 		goto out;
1755 
1756 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1757 	 * so that no locks are necessary.
1758 	 */
1759 
1760 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1761 				&arg->err);
1762 	if (!skb) {
1763 		/* This means receive shutdown. */
1764 		if (arg->err == 0)
1765 			arg->err = -EINVAL;
1766 		goto out;
1767 	}
1768 
1769 	tsk = skb->sk;
1770 	skb_free_datagram(sk, skb);
1771 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1772 
1773 	/* attach accepted sock to socket */
1774 	unix_state_lock(tsk);
1775 	unix_update_edges(unix_sk(tsk));
1776 	newsock->state = SS_CONNECTED;
1777 	unix_sock_inherit_flags(sock, newsock);
1778 	sock_graft(tsk, newsock);
1779 	unix_state_unlock(tsk);
1780 	return 0;
1781 
1782 out:
1783 	return arg->err;
1784 }
1785 
1786 
1787 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1788 {
1789 	struct sock *sk = sock->sk;
1790 	struct unix_address *addr;
1791 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1792 	int err = 0;
1793 
1794 	if (peer) {
1795 		sk = unix_peer_get(sk);
1796 
1797 		err = -ENOTCONN;
1798 		if (!sk)
1799 			goto out;
1800 		err = 0;
1801 	} else {
1802 		sock_hold(sk);
1803 	}
1804 
1805 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1806 	if (!addr) {
1807 		sunaddr->sun_family = AF_UNIX;
1808 		sunaddr->sun_path[0] = 0;
1809 		err = offsetof(struct sockaddr_un, sun_path);
1810 	} else {
1811 		err = addr->len;
1812 		memcpy(sunaddr, addr->name, addr->len);
1813 
1814 		if (peer)
1815 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1816 					       CGROUP_UNIX_GETPEERNAME);
1817 		else
1818 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1819 					       CGROUP_UNIX_GETSOCKNAME);
1820 	}
1821 	sock_put(sk);
1822 out:
1823 	return err;
1824 }
1825 
1826 /* The "user->unix_inflight" variable is protected by the garbage
1827  * collection lock, and we just read it locklessly here. If you go
1828  * over the limit, there might be a tiny race in actually noticing
1829  * it across threads. Tough.
1830  */
1831 static inline bool too_many_unix_fds(struct task_struct *p)
1832 {
1833 	struct user_struct *user = current_user();
1834 
1835 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1836 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1837 	return false;
1838 }
1839 
1840 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1841 {
1842 	if (too_many_unix_fds(current))
1843 		return -ETOOMANYREFS;
1844 
1845 	UNIXCB(skb).fp = scm->fp;
1846 	scm->fp = NULL;
1847 
1848 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1849 		return -ENOMEM;
1850 
1851 	return 0;
1852 }
1853 
1854 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1855 {
1856 	scm->fp = UNIXCB(skb).fp;
1857 	UNIXCB(skb).fp = NULL;
1858 
1859 	unix_destroy_fpl(scm->fp);
1860 }
1861 
1862 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1863 {
1864 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1865 }
1866 
1867 static void unix_destruct_scm(struct sk_buff *skb)
1868 {
1869 	struct scm_cookie scm;
1870 
1871 	memset(&scm, 0, sizeof(scm));
1872 	scm.pid  = UNIXCB(skb).pid;
1873 	if (UNIXCB(skb).fp)
1874 		unix_detach_fds(&scm, skb);
1875 
1876 	/* Alas, it calls VFS */
1877 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1878 	scm_destroy(&scm);
1879 	sock_wfree(skb);
1880 }
1881 
1882 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1883 {
1884 	int err = 0;
1885 
1886 	UNIXCB(skb).pid  = get_pid(scm->pid);
1887 	UNIXCB(skb).uid = scm->creds.uid;
1888 	UNIXCB(skb).gid = scm->creds.gid;
1889 	UNIXCB(skb).fp = NULL;
1890 	unix_get_secdata(scm, skb);
1891 	if (scm->fp && send_fds)
1892 		err = unix_attach_fds(scm, skb);
1893 
1894 	skb->destructor = unix_destruct_scm;
1895 	return err;
1896 }
1897 
1898 static bool unix_passcred_enabled(const struct socket *sock,
1899 				  const struct sock *other)
1900 {
1901 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1902 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1903 	       !other->sk_socket ||
1904 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1905 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1906 }
1907 
1908 /*
1909  * Some apps rely on write() giving SCM_CREDENTIALS.
1910  * We include credentials if the source or destination socket
1911  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1912  */
1913 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1914 			    const struct sock *other)
1915 {
1916 	if (UNIXCB(skb).pid)
1917 		return;
1918 	if (unix_passcred_enabled(sock, other)) {
1919 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1920 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1921 	}
1922 }
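
/* Editor's note: a hedged userspace sketch of the receiving side implied by
 * the comment above ("fd" is a placeholder; "msg" is a struct msghdr prepared
 * with a CMSG_SPACE(sizeof(struct ucred)) control buffer; struct ucred
 * typically needs _GNU_SOURCE):
 *
 *	int on = 1;
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	recvmsg(fd, &msg, 0);
 *
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_CREDENTIALS) {
 *			struct ucred cred;
 *
 *			memcpy(&cred, CMSG_DATA(cmsg), sizeof(cred));
 *			... cred.pid, cred.uid, cred.gid identify the sender ...
 *		}
 *	}
 */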
1923 
1924 static bool unix_skb_scm_eq(struct sk_buff *skb,
1925 			    struct scm_cookie *scm)
1926 {
1927 	return UNIXCB(skb).pid == scm->pid &&
1928 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1929 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1930 	       unix_secdata_eq(scm, skb);
1931 }
1932 
1933 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1934 {
1935 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1936 	struct unix_sock *u = unix_sk(sk);
1937 
1938 	if (unlikely(fp && fp->count)) {
1939 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1940 		unix_add_edges(fp, u);
1941 	}
1942 }
1943 
1944 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1945 {
1946 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1947 	struct unix_sock *u = unix_sk(sk);
1948 
1949 	if (unlikely(fp && fp->count)) {
1950 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1951 		unix_del_edges(fp);
1952 	}
1953 }
1954 
1955 /*
1956  *	Send AF_UNIX data.
1957  */
1958 
1959 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1960 			      size_t len)
1961 {
1962 	struct sock *sk = sock->sk, *other = NULL;
1963 	struct unix_sock *u = unix_sk(sk);
1964 	struct scm_cookie scm;
1965 	struct sk_buff *skb;
1966 	int data_len = 0;
1967 	int sk_locked;
1968 	long timeo;
1969 	int err;
1970 
1971 	err = scm_send(sock, msg, &scm, false);
1972 	if (err < 0)
1973 		return err;
1974 
1975 	wait_for_unix_gc(scm.fp);
1976 
1977 	if (msg->msg_flags & MSG_OOB) {
1978 		err = -EOPNOTSUPP;
1979 		goto out;
1980 	}
1981 
1982 	if (msg->msg_namelen) {
1983 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
1984 		if (err)
1985 			goto out;
1986 
1987 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1988 							    msg->msg_name,
1989 							    &msg->msg_namelen,
1990 							    NULL);
1991 		if (err)
1992 			goto out;
1993 	}
1994 
1995 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1996 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1997 	    !READ_ONCE(u->addr)) {
1998 		err = unix_autobind(sk);
1999 		if (err)
2000 			goto out;
2001 	}
2002 
2003 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
2004 		err = -EMSGSIZE;
2005 		goto out;
2006 	}
2007 
2008 	if (len > SKB_MAX_ALLOC) {
2009 		data_len = min_t(size_t,
2010 				 len - SKB_MAX_ALLOC,
2011 				 MAX_SKB_FRAGS * PAGE_SIZE);
2012 		data_len = PAGE_ALIGN(data_len);
2013 
2014 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2015 	}
2016 
2017 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2018 				   msg->msg_flags & MSG_DONTWAIT, &err,
2019 				   PAGE_ALLOC_COSTLY_ORDER);
2020 	if (!skb)
2021 		goto out;
2022 
2023 	err = unix_scm_to_skb(&scm, skb, true);
2024 	if (err < 0)
2025 		goto out_free;
2026 
2027 	skb_put(skb, len - data_len);
2028 	skb->data_len = data_len;
2029 	skb->len = len;
2030 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2031 	if (err)
2032 		goto out_free;
2033 
2034 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2035 
2036 	if (msg->msg_namelen) {
2037 lookup:
2038 		other = unix_find_other(sock_net(sk), msg->msg_name,
2039 					msg->msg_namelen, sk->sk_type);
2040 		if (IS_ERR(other)) {
2041 			err = PTR_ERR(other);
2042 			goto out_free;
2043 		}
2044 	} else {
2045 		other = unix_peer_get(sk);
2046 		if (!other) {
2047 			err = -ENOTCONN;
2048 			goto out_free;
2049 		}
2050 	}
2051 
2052 	if (sk_filter(other, skb) < 0) {
2053 		/* Toss the packet but do not return any error to the sender */
2054 		err = len;
2055 		goto out_sock_put;
2056 	}
2057 
2058 restart:
2059 	sk_locked = 0;
2060 	unix_state_lock(other);
2061 restart_locked:
2062 
2063 	if (!unix_may_send(sk, other)) {
2064 		err = -EPERM;
2065 		goto out_unlock;
2066 	}
2067 
2068 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2069 		/* Check with POSIX 1003.1g - what should a datagram send to a dead peer return? */
2070 
2071 		unix_state_unlock(other);
2072 
2073 		if (sk->sk_type == SOCK_SEQPACKET) {
2074 			/* We are here only when racing with unix_release_sock(),
2075 			 * which is clearing @other. Never change the state to
2076 			 * TCP_CLOSE here, unlike what the SOCK_DGRAM path does.
2077 			 */
2078 			err = -EPIPE;
2079 			goto out_sock_put;
2080 		}
2081 
2082 		if (!sk_locked)
2083 			unix_state_lock(sk);
2084 
2085 		if (unix_peer(sk) == other) {
2086 			unix_peer(sk) = NULL;
2087 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2088 
2089 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2090 			unix_state_unlock(sk);
2091 
2092 			unix_dgram_disconnected(sk, other);
2093 			sock_put(other);
2094 			err = -ECONNREFUSED;
2095 			goto out_sock_put;
2096 		}
2097 
2098 		unix_state_unlock(sk);
2099 
2100 		if (!msg->msg_namelen) {
2101 			err = -ECONNRESET;
2102 			goto out_sock_put;
2103 		}
2104 
2105 		goto lookup;
2106 	}
2107 
2108 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2109 		err = -EPIPE;
2110 		goto out_unlock;
2111 	}
2112 
2113 	if (sk->sk_type != SOCK_SEQPACKET) {
2114 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2115 		if (err)
2116 			goto out_unlock;
2117 	}
2118 
2119 	/* other == sk && unix_peer(other) != sk if
2120 	 * - unix_peer(sk) == NULL, and the destination address is bound to sk
2121 	 * - unix_peer(sk) == sk at lookup time, but it disconnected before the lock
2122 	 */
2123 	if (other != sk &&
2124 	    unlikely(unix_peer(other) != sk &&
2125 	    unix_recvq_full_lockless(other))) {
2126 		if (timeo) {
2127 			timeo = unix_wait_for_peer(other, timeo);
2128 
2129 			err = sock_intr_errno(timeo);
2130 			if (signal_pending(current))
2131 				goto out_sock_put;
2132 
2133 			goto restart;
2134 		}
2135 
2136 		if (!sk_locked) {
2137 			unix_state_unlock(other);
2138 			unix_state_double_lock(sk, other);
2139 		}
2140 
2141 		if (unix_peer(sk) != other ||
2142 		    unix_dgram_peer_wake_me(sk, other)) {
2143 			err = -EAGAIN;
2144 			sk_locked = 1;
2145 			goto out_unlock;
2146 		}
2147 
2148 		if (!sk_locked) {
2149 			sk_locked = 1;
2150 			goto restart_locked;
2151 		}
2152 	}
2153 
2154 	if (unlikely(sk_locked))
2155 		unix_state_unlock(sk);
2156 
2157 	if (sock_flag(other, SOCK_RCVTSTAMP))
2158 		__net_timestamp(skb);
2159 	maybe_add_creds(skb, sock, other);
2160 	scm_stat_add(other, skb);
2161 	skb_queue_tail(&other->sk_receive_queue, skb);
2162 	unix_state_unlock(other);
2163 	other->sk_data_ready(other);
2164 	sock_put(other);
2165 	scm_destroy(&scm);
2166 	return len;
2167 
2168 out_unlock:
2169 	if (sk_locked)
2170 		unix_state_unlock(sk);
2171 	unix_state_unlock(other);
2172 out_sock_put:
2173 	sock_put(other);
2174 out_free:
2175 	consume_skb(skb);
2176 out:
2177 	scm_destroy(&scm);
2178 	return err;
2179 }
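
/* Editor's note: one visible effect of the sk_sndbuf check above, as a
 * hedged userspace sketch.  An AF_UNIX datagram is never fragmented, so a
 * single send() larger than roughly SO_SNDBUF - 32 bytes fails outright
 * ("dgram_fd" is a connected SOCK_DGRAM placeholder, "big_buf" is at least
 * SO_SNDBUF bytes):
 *
 *	int sndbuf;
 *	socklen_t optlen = sizeof(sndbuf);
 *
 *	getsockopt(dgram_fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, &optlen);
 *	if (send(dgram_fd, big_buf, sndbuf, 0) < 0 && errno == EMSGSIZE)
 *		... enlarge SO_SNDBUF or split the payload ...
 */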
2180 
2181 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2182  * bytes, and a minimum of a full page.
2183  */
2184 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
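
/* Editor's note: a worked value, assuming 4 KiB pages: get_order(32768) is 3,
 * so UNIX_SKB_FRAGS_SZ is 4096 << 3 = 32768 bytes.  With 64 KiB pages,
 * get_order(32768) is 0 and the limit becomes a single 65536-byte page.
 */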
2185 
2186 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2187 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2188 		     struct scm_cookie *scm, bool fds_sent)
2189 {
2190 	struct unix_sock *ousk = unix_sk(other);
2191 	struct sk_buff *skb;
2192 	int err;
2193 
2194 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2195 
2196 	if (!skb)
2197 		return err;
2198 
2199 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2200 	if (err < 0)
2201 		goto out;
2202 
2203 	skb_put(skb, 1);
2204 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2205 
2206 	if (err)
2207 		goto out;
2208 
2209 	unix_state_lock(other);
2210 
2211 	if (sock_flag(other, SOCK_DEAD) ||
2212 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2213 		unix_state_unlock(other);
2214 		err = -EPIPE;
2215 		goto out;
2216 	}
2217 
2218 	maybe_add_creds(skb, sock, other);
2219 	scm_stat_add(other, skb);
2220 
2221 	spin_lock(&other->sk_receive_queue.lock);
2222 	WRITE_ONCE(ousk->oob_skb, skb);
2223 	__skb_queue_tail(&other->sk_receive_queue, skb);
2224 	spin_unlock(&other->sk_receive_queue.lock);
2225 
2226 	sk_send_sigurg(other);
2227 	unix_state_unlock(other);
2228 	other->sk_data_ready(other);
2229 
2230 	return 0;
2231 out:
2232 	consume_skb(skb);
2233 	return err;
2234 }
2235 #endif
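
/* Editor's note: a hedged userspace sketch of the out-of-band byte queued
 * above (CONFIG_AF_UNIX_OOB builds only).  As with TCP urgent data, the OOB
 * payload is a single byte ("a" and "b" are a connected SOCK_STREAM pair):
 *
 *	char c;
 *
 *	send(a, "ordinary", 8, 0);
 *	send(a, "!", 1, MSG_OOB);
 *
 *	recv(b, &c, 1, MSG_OOB);		reads the '!' mark byte
 *
 * Ordinary reads on "b" stop short of the mark unless SO_OOBINLINE is set,
 * in which case the byte stays in the normal data stream.
 */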
2236 
2237 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2238 			       size_t len)
2239 {
2240 	struct sock *sk = sock->sk;
2241 	struct sk_buff *skb = NULL;
2242 	struct sock *other = NULL;
2243 	struct scm_cookie scm;
2244 	bool fds_sent = false;
2245 	int err, sent = 0;
2246 
2247 	err = scm_send(sock, msg, &scm, false);
2248 	if (err < 0)
2249 		return err;
2250 
2251 	wait_for_unix_gc(scm.fp);
2252 
2253 	if (msg->msg_flags & MSG_OOB) {
2254 		err = -EOPNOTSUPP;
2255 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2256 		if (len)
2257 			len--;
2258 		else
2259 #endif
2260 			goto out_err;
2261 	}
2262 
2263 	if (msg->msg_namelen) {
2264 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2265 		goto out_err;
2266 	} else {
2267 		other = unix_peer(sk);
2268 		if (!other) {
2269 			err = -ENOTCONN;
2270 			goto out_err;
2271 		}
2272 	}
2273 
2274 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2275 		goto out_pipe;
2276 
2277 	while (sent < len) {
2278 		int size = len - sent;
2279 		int data_len;
2280 
2281 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2282 			skb = sock_alloc_send_pskb(sk, 0, 0,
2283 						   msg->msg_flags & MSG_DONTWAIT,
2284 						   &err, 0);
2285 		} else {
2286 			/* Keep two messages in the pipe so it schedules better */
2287 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2288 
2289 			/* allow fallback to order-0 allocations */
2290 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2291 
2292 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2293 
2294 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2295 
2296 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2297 						   msg->msg_flags & MSG_DONTWAIT, &err,
2298 						   get_order(UNIX_SKB_FRAGS_SZ));
2299 		}
2300 		if (!skb)
2301 			goto out_err;
2302 
2303 		/* Only send the fds in the first buffer */
2304 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2305 		if (err < 0)
2306 			goto out_free;
2307 
2308 		fds_sent = true;
2309 
2310 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2311 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2312 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2313 						   sk->sk_allocation);
2314 			if (err < 0)
2315 				goto out_free;
2316 
2317 			size = err;
2318 			refcount_add(size, &sk->sk_wmem_alloc);
2319 		} else {
2320 			skb_put(skb, size - data_len);
2321 			skb->data_len = data_len;
2322 			skb->len = size;
2323 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2324 			if (err)
2325 				goto out_free;
2326 		}
2327 
2328 		unix_state_lock(other);
2329 
2330 		if (sock_flag(other, SOCK_DEAD) ||
2331 		    (other->sk_shutdown & RCV_SHUTDOWN))
2332 			goto out_pipe_unlock;
2333 
2334 		maybe_add_creds(skb, sock, other);
2335 		scm_stat_add(other, skb);
2336 		skb_queue_tail(&other->sk_receive_queue, skb);
2337 		unix_state_unlock(other);
2338 		other->sk_data_ready(other);
2339 		sent += size;
2340 	}
2341 
2342 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2343 	if (msg->msg_flags & MSG_OOB) {
2344 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2345 		if (err)
2346 			goto out_err;
2347 		sent++;
2348 	}
2349 #endif
2350 
2351 	scm_destroy(&scm);
2352 
2353 	return sent;
2354 
2355 out_pipe_unlock:
2356 	unix_state_unlock(other);
2357 out_pipe:
2358 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2359 		send_sig(SIGPIPE, current, 0);
2360 	err = -EPIPE;
2361 out_free:
2362 	consume_skb(skb);
2363 out_err:
2364 	scm_destroy(&scm);
2365 	return sent ? : err;
2366 }
2367 
2368 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2369 				  size_t len)
2370 {
2371 	int err;
2372 	struct sock *sk = sock->sk;
2373 
2374 	err = sock_error(sk);
2375 	if (err)
2376 		return err;
2377 
2378 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2379 		return -ENOTCONN;
2380 
2381 	if (msg->msg_namelen)
2382 		msg->msg_namelen = 0;
2383 
2384 	return unix_dgram_sendmsg(sock, msg, len);
2385 }
2386 
2387 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2388 				  size_t size, int flags)
2389 {
2390 	struct sock *sk = sock->sk;
2391 
2392 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2393 		return -ENOTCONN;
2394 
2395 	return unix_dgram_recvmsg(sock, msg, size, flags);
2396 }
2397 
2398 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2399 {
2400 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2401 
2402 	if (addr) {
2403 		msg->msg_namelen = addr->len;
2404 		memcpy(msg->msg_name, addr->name, addr->len);
2405 	}
2406 }
2407 
2408 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2409 			 int flags)
2410 {
2411 	struct scm_cookie scm;
2412 	struct socket *sock = sk->sk_socket;
2413 	struct unix_sock *u = unix_sk(sk);
2414 	struct sk_buff *skb, *last;
2415 	long timeo;
2416 	int skip;
2417 	int err;
2418 
2419 	err = -EOPNOTSUPP;
2420 	if (flags&MSG_OOB)
2421 		goto out;
2422 
2423 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2424 
2425 	do {
2426 		mutex_lock(&u->iolock);
2427 
2428 		skip = sk_peek_offset(sk, flags);
2429 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2430 					      &skip, &err, &last);
2431 		if (skb) {
2432 			if (!(flags & MSG_PEEK))
2433 				scm_stat_del(sk, skb);
2434 			break;
2435 		}
2436 
2437 		mutex_unlock(&u->iolock);
2438 
2439 		if (err != -EAGAIN)
2440 			break;
2441 	} while (timeo &&
2442 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2443 					      &err, &timeo, last));
2444 
2445 	if (!skb) { /* implies iolock unlocked */
2446 		unix_state_lock(sk);
2447 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2448 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2449 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2450 			err = 0;
2451 		unix_state_unlock(sk);
2452 		goto out;
2453 	}
2454 
2455 	if (wq_has_sleeper(&u->peer_wait))
2456 		wake_up_interruptible_sync_poll(&u->peer_wait,
2457 						EPOLLOUT | EPOLLWRNORM |
2458 						EPOLLWRBAND);
2459 
2460 	if (msg->msg_name) {
2461 		unix_copy_addr(msg, skb->sk);
2462 
2463 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2464 						      msg->msg_name,
2465 						      &msg->msg_namelen);
2466 	}
2467 
2468 	if (size > skb->len - skip)
2469 		size = skb->len - skip;
2470 	else if (size < skb->len - skip)
2471 		msg->msg_flags |= MSG_TRUNC;
2472 
2473 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2474 	if (err)
2475 		goto out_free;
2476 
2477 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2478 		__sock_recv_timestamp(msg, sk, skb);
2479 
2480 	memset(&scm, 0, sizeof(scm));
2481 
2482 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2483 	unix_set_secdata(&scm, skb);
2484 
2485 	if (!(flags & MSG_PEEK)) {
2486 		if (UNIXCB(skb).fp)
2487 			unix_detach_fds(&scm, skb);
2488 
2489 		sk_peek_offset_bwd(sk, skb->len);
2490 	} else {
2491 		/* It is questionable: on PEEK we could
2492 		 *  - not return fds: good, but too simple 8)
2493 		 *  - return fds, and not return them again on read (the old
2494 		 *    strategy, apparently wrong)
2495 		 *  - clone fds (chosen for now, as it is the most universal
2496 		 *    solution)
2497 		 *
2498 		 * POSIX 1003.1g does not actually define this clearly at all.
2499 		 * POSIX 1003.1g doesn't define a lot of things clearly, however!
2500 		 */
2503 
2504 		sk_peek_offset_fwd(sk, size);
2505 
2506 		if (UNIXCB(skb).fp)
2507 			unix_peek_fds(&scm, skb);
2508 	}
2509 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2510 
2511 	scm_recv_unix(sock, msg, &scm, flags);
2512 
2513 out_free:
2514 	skb_free_datagram(sk, skb);
2515 	mutex_unlock(&u->iolock);
2516 out:
2517 	return err;
2518 }
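
/* Editor's note: a hedged userspace sketch of the truncation handling above.
 * A datagram larger than the supplied buffer is cut short and the remainder
 * discarded, with MSG_TRUNC reported in msg_flags ("fd" is a placeholder):
 *
 *	char small[16];
 *	struct iovec iov = { .iov_base = small, .iov_len = sizeof(small) };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
 *	ssize_t n = recvmsg(fd, &msg, 0);
 *
 *	if (n >= 0 && (msg.msg_flags & MSG_TRUNC))
 *		... the original datagram was larger than sizeof(small) ...
 *
 * Passing MSG_TRUNC in the flags argument instead makes the call return the
 * datagram's full length rather than the number of bytes copied.
 */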
2519 
2520 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2521 			      int flags)
2522 {
2523 	struct sock *sk = sock->sk;
2524 
2525 #ifdef CONFIG_BPF_SYSCALL
2526 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2527 
2528 	if (prot != &unix_dgram_proto)
2529 		return prot->recvmsg(sk, msg, size, flags, NULL);
2530 #endif
2531 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2532 }
2533 
2534 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2535 {
2536 	struct unix_sock *u = unix_sk(sk);
2537 	struct sk_buff *skb;
2538 	int err;
2539 
2540 	mutex_lock(&u->iolock);
2541 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2542 	mutex_unlock(&u->iolock);
2543 	if (!skb)
2544 		return err;
2545 
2546 	return recv_actor(sk, skb);
2547 }
2548 
2549 /*
2550  *	Sleep until more data has arrived. But check for races.
2551  */
2552 static long unix_stream_data_wait(struct sock *sk, long timeo,
2553 				  struct sk_buff *last, unsigned int last_len,
2554 				  bool freezable)
2555 {
2556 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2557 	struct sk_buff *tail;
2558 	DEFINE_WAIT(wait);
2559 
2560 	unix_state_lock(sk);
2561 
2562 	for (;;) {
2563 		prepare_to_wait(sk_sleep(sk), &wait, state);
2564 
2565 		tail = skb_peek_tail(&sk->sk_receive_queue);
2566 		if (tail != last ||
2567 		    (tail && tail->len != last_len) ||
2568 		    sk->sk_err ||
2569 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2570 		    signal_pending(current) ||
2571 		    !timeo)
2572 			break;
2573 
2574 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2575 		unix_state_unlock(sk);
2576 		timeo = schedule_timeout(timeo);
2577 		unix_state_lock(sk);
2578 
2579 		if (sock_flag(sk, SOCK_DEAD))
2580 			break;
2581 
2582 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2583 	}
2584 
2585 	finish_wait(sk_sleep(sk), &wait);
2586 	unix_state_unlock(sk);
2587 	return timeo;
2588 }
2589 
2590 static unsigned int unix_skb_len(const struct sk_buff *skb)
2591 {
2592 	return skb->len - UNIXCB(skb).consumed;
2593 }
2594 
2595 struct unix_stream_read_state {
2596 	int (*recv_actor)(struct sk_buff *, int, int,
2597 			  struct unix_stream_read_state *);
2598 	struct socket *socket;
2599 	struct msghdr *msg;
2600 	struct pipe_inode_info *pipe;
2601 	size_t size;
2602 	int flags;
2603 	unsigned int splice_flags;
2604 };
2605 
2606 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2607 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2608 {
2609 	struct socket *sock = state->socket;
2610 	struct sock *sk = sock->sk;
2611 	struct unix_sock *u = unix_sk(sk);
2612 	int chunk = 1;
2613 	struct sk_buff *oob_skb;
2614 
2615 	mutex_lock(&u->iolock);
2616 	unix_state_lock(sk);
2617 	spin_lock(&sk->sk_receive_queue.lock);
2618 
2619 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2620 		spin_unlock(&sk->sk_receive_queue.lock);
2621 		unix_state_unlock(sk);
2622 		mutex_unlock(&u->iolock);
2623 		return -EINVAL;
2624 	}
2625 
2626 	oob_skb = u->oob_skb;
2627 
2628 	if (!(state->flags & MSG_PEEK))
2629 		WRITE_ONCE(u->oob_skb, NULL);
2630 
2631 	spin_unlock(&sk->sk_receive_queue.lock);
2632 	unix_state_unlock(sk);
2633 
2634 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2635 
2636 	if (!(state->flags & MSG_PEEK))
2637 		UNIXCB(oob_skb).consumed += 1;
2638 
2639 	mutex_unlock(&u->iolock);
2640 
2641 	if (chunk < 0)
2642 		return -EFAULT;
2643 
2644 	state->msg->msg_flags |= MSG_OOB;
2645 	return 1;
2646 }
2647 
2648 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2649 				  int flags, int copied)
2650 {
2651 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2652 	struct unix_sock *u = unix_sk(sk);
2653 
2654 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2655 		return skb;
2656 
2657 	spin_lock(&sk->sk_receive_queue.lock);
2658 
2659 	if (!unix_skb_len(skb)) {
2660 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2661 			skb = NULL;
2662 		} else if (flags & MSG_PEEK) {
2663 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2664 		} else {
2665 			read_skb = skb;
2666 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2667 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2668 		}
2669 
2670 		if (!skb)
2671 			goto unlock;
2672 	}
2673 
2674 	if (skb != u->oob_skb)
2675 		goto unlock;
2676 
2677 	if (copied) {
2678 		skb = NULL;
2679 	} else if (!(flags & MSG_PEEK)) {
2680 		WRITE_ONCE(u->oob_skb, NULL);
2681 
2682 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2683 			__skb_unlink(skb, &sk->sk_receive_queue);
2684 			unread_skb = skb;
2685 			skb = skb_peek(&sk->sk_receive_queue);
2686 		}
2687 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2688 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2689 	}
2690 
2691 unlock:
2692 	spin_unlock(&sk->sk_receive_queue.lock);
2693 
2694 	consume_skb(read_skb);
2695 	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2696 
2697 	return skb;
2698 }
2699 #endif
2700 
2701 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2702 {
2703 	struct unix_sock *u = unix_sk(sk);
2704 	struct sk_buff *skb;
2705 	int err;
2706 
2707 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2708 		return -ENOTCONN;
2709 
2710 	mutex_lock(&u->iolock);
2711 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2712 	mutex_unlock(&u->iolock);
2713 	if (!skb)
2714 		return err;
2715 
2716 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2717 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2718 		bool drop = false;
2719 
2720 		unix_state_lock(sk);
2721 
2722 		if (sock_flag(sk, SOCK_DEAD)) {
2723 			unix_state_unlock(sk);
2724 			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2725 			return -ECONNRESET;
2726 		}
2727 
2728 		spin_lock(&sk->sk_receive_queue.lock);
2729 		if (likely(skb == u->oob_skb)) {
2730 			WRITE_ONCE(u->oob_skb, NULL);
2731 			drop = true;
2732 		}
2733 		spin_unlock(&sk->sk_receive_queue.lock);
2734 
2735 		unix_state_unlock(sk);
2736 
2737 		if (drop) {
2738 			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2739 			return -EAGAIN;
2740 		}
2741 	}
2742 #endif
2743 
2744 	return recv_actor(sk, skb);
2745 }
2746 
2747 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2748 				    bool freezable)
2749 {
2750 	struct scm_cookie scm;
2751 	struct socket *sock = state->socket;
2752 	struct sock *sk = sock->sk;
2753 	struct unix_sock *u = unix_sk(sk);
2754 	int copied = 0;
2755 	int flags = state->flags;
2756 	int noblock = flags & MSG_DONTWAIT;
2757 	bool check_creds = false;
2758 	int target;
2759 	int err = 0;
2760 	long timeo;
2761 	int skip;
2762 	size_t size = state->size;
2763 	unsigned int last_len;
2764 
2765 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2766 		err = -EINVAL;
2767 		goto out;
2768 	}
2769 
2770 	if (unlikely(flags & MSG_OOB)) {
2771 		err = -EOPNOTSUPP;
2772 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2773 		err = unix_stream_recv_urg(state);
2774 #endif
2775 		goto out;
2776 	}
2777 
2778 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2779 	timeo = sock_rcvtimeo(sk, noblock);
2780 
2781 	memset(&scm, 0, sizeof(scm));
2782 
2783 	/* Lock the socket to prevent queue disordering
2784 	 * while we sleep copying data out to the caller.
2785 	 */
2786 	mutex_lock(&u->iolock);
2787 
2788 	skip = max(sk_peek_offset(sk, flags), 0);
2789 
2790 	do {
2791 		struct sk_buff *skb, *last;
2792 		int chunk;
2793 
2794 redo:
2795 		unix_state_lock(sk);
2796 		if (sock_flag(sk, SOCK_DEAD)) {
2797 			err = -ECONNRESET;
2798 			goto unlock;
2799 		}
2800 		last = skb = skb_peek(&sk->sk_receive_queue);
2801 		last_len = last ? last->len : 0;
2802 
2803 again:
2804 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2805 		if (skb) {
2806 			skb = manage_oob(skb, sk, flags, copied);
2807 			if (!skb && copied) {
2808 				unix_state_unlock(sk);
2809 				break;
2810 			}
2811 		}
2812 #endif
2813 		if (skb == NULL) {
2814 			if (copied >= target)
2815 				goto unlock;
2816 
2817 			/*
2818 			 *	POSIX 1003.1g mandates this order.
2819 			 */
2820 
2821 			err = sock_error(sk);
2822 			if (err)
2823 				goto unlock;
2824 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2825 				goto unlock;
2826 
2827 			unix_state_unlock(sk);
2828 			if (!timeo) {
2829 				err = -EAGAIN;
2830 				break;
2831 			}
2832 
2833 			mutex_unlock(&u->iolock);
2834 
2835 			timeo = unix_stream_data_wait(sk, timeo, last,
2836 						      last_len, freezable);
2837 
2838 			if (signal_pending(current)) {
2839 				err = sock_intr_errno(timeo);
2840 				scm_destroy(&scm);
2841 				goto out;
2842 			}
2843 
2844 			mutex_lock(&u->iolock);
2845 			goto redo;
2846 unlock:
2847 			unix_state_unlock(sk);
2848 			break;
2849 		}
2850 
2851 		while (skip >= unix_skb_len(skb)) {
2852 			skip -= unix_skb_len(skb);
2853 			last = skb;
2854 			last_len = skb->len;
2855 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2856 			if (!skb)
2857 				goto again;
2858 		}
2859 
2860 		unix_state_unlock(sk);
2861 
2862 		if (check_creds) {
2863 			/* Never glue messages from different writers */
2864 			if (!unix_skb_scm_eq(skb, &scm))
2865 				break;
2866 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2867 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2868 			/* Copy credentials */
2869 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2870 			unix_set_secdata(&scm, skb);
2871 			check_creds = true;
2872 		}
2873 
2874 		/* Copy address just once */
2875 		if (state->msg && state->msg->msg_name) {
2876 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2877 					 state->msg->msg_name);
2878 			unix_copy_addr(state->msg, skb->sk);
2879 
2880 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2881 							      state->msg->msg_name,
2882 							      &state->msg->msg_namelen);
2883 
2884 			sunaddr = NULL;
2885 		}
2886 
2887 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2888 		chunk = state->recv_actor(skb, skip, chunk, state);
2889 		if (chunk < 0) {
2890 			if (copied == 0)
2891 				copied = -EFAULT;
2892 			break;
2893 		}
2894 		copied += chunk;
2895 		size -= chunk;
2896 
2897 		/* Mark read part of skb as used */
2898 		if (!(flags & MSG_PEEK)) {
2899 			UNIXCB(skb).consumed += chunk;
2900 
2901 			sk_peek_offset_bwd(sk, chunk);
2902 
2903 			if (UNIXCB(skb).fp) {
2904 				scm_stat_del(sk, skb);
2905 				unix_detach_fds(&scm, skb);
2906 			}
2907 
2908 			if (unix_skb_len(skb))
2909 				break;
2910 
2911 			skb_unlink(skb, &sk->sk_receive_queue);
2912 			consume_skb(skb);
2913 
2914 			if (scm.fp)
2915 				break;
2916 		} else {
2917 			/* It is questionable; see the note in unix_dgram_recvmsg().
2918 			 */
2919 			if (UNIXCB(skb).fp)
2920 				unix_peek_fds(&scm, skb);
2921 
2922 			sk_peek_offset_fwd(sk, chunk);
2923 
2924 			if (UNIXCB(skb).fp)
2925 				break;
2926 
2927 			skip = 0;
2928 			last = skb;
2929 			last_len = skb->len;
2930 			unix_state_lock(sk);
2931 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2932 			if (skb)
2933 				goto again;
2934 			unix_state_unlock(sk);
2935 			break;
2936 		}
2937 	} while (size);
2938 
2939 	mutex_unlock(&u->iolock);
2940 	if (state->msg)
2941 		scm_recv_unix(sock, state->msg, &scm, flags);
2942 	else
2943 		scm_destroy(&scm);
2944 out:
2945 	return copied ? : err;
2946 }
2947 
2948 static int unix_stream_read_actor(struct sk_buff *skb,
2949 				  int skip, int chunk,
2950 				  struct unix_stream_read_state *state)
2951 {
2952 	int ret;
2953 
2954 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2955 				    state->msg, chunk);
2956 	return ret ?: chunk;
2957 }
2958 
2959 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2960 			  size_t size, int flags)
2961 {
2962 	struct unix_stream_read_state state = {
2963 		.recv_actor = unix_stream_read_actor,
2964 		.socket = sk->sk_socket,
2965 		.msg = msg,
2966 		.size = size,
2967 		.flags = flags
2968 	};
2969 
2970 	return unix_stream_read_generic(&state, true);
2971 }
2972 
2973 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2974 			       size_t size, int flags)
2975 {
2976 	struct unix_stream_read_state state = {
2977 		.recv_actor = unix_stream_read_actor,
2978 		.socket = sock,
2979 		.msg = msg,
2980 		.size = size,
2981 		.flags = flags
2982 	};
2983 
2984 #ifdef CONFIG_BPF_SYSCALL
2985 	struct sock *sk = sock->sk;
2986 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2987 
2988 	if (prot != &unix_stream_proto)
2989 		return prot->recvmsg(sk, msg, size, flags, NULL);
2990 #endif
2991 	return unix_stream_read_generic(&state, true);
2992 }
2993 
2994 static int unix_stream_splice_actor(struct sk_buff *skb,
2995 				    int skip, int chunk,
2996 				    struct unix_stream_read_state *state)
2997 {
2998 	return skb_splice_bits(skb, state->socket->sk,
2999 			       UNIXCB(skb).consumed + skip,
3000 			       state->pipe, chunk, state->splice_flags);
3001 }
3002 
3003 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3004 				       struct pipe_inode_info *pipe,
3005 				       size_t size, unsigned int flags)
3006 {
3007 	struct unix_stream_read_state state = {
3008 		.recv_actor = unix_stream_splice_actor,
3009 		.socket = sock,
3010 		.pipe = pipe,
3011 		.size = size,
3012 		.splice_flags = flags,
3013 	};
3014 
3015 	if (unlikely(*ppos))
3016 		return -ESPIPE;
3017 
3018 	if (sock->file->f_flags & O_NONBLOCK ||
3019 	    flags & SPLICE_F_NONBLOCK)
3020 		state.flags = MSG_DONTWAIT;
3021 
3022 	return unix_stream_read_generic(&state, false);
3023 }
3024 
3025 static int unix_shutdown(struct socket *sock, int mode)
3026 {
3027 	struct sock *sk = sock->sk;
3028 	struct sock *other;
3029 
3030 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3031 		return -EINVAL;
3032 	/* This maps:
3033 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3034 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3035 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3036 	 */
3037 	++mode;
3038 
3039 	unix_state_lock(sk);
3040 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3041 	other = unix_peer(sk);
3042 	if (other)
3043 		sock_hold(other);
3044 	unix_state_unlock(sk);
3045 	sk->sk_state_change(sk);
3046 
3047 	if (other &&
3048 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3049 
3050 		int peer_mode = 0;
3051 		const struct proto *prot = READ_ONCE(other->sk_prot);
3052 
3053 		if (prot->unhash)
3054 			prot->unhash(other);
3055 		if (mode&RCV_SHUTDOWN)
3056 			peer_mode |= SEND_SHUTDOWN;
3057 		if (mode&SEND_SHUTDOWN)
3058 			peer_mode |= RCV_SHUTDOWN;
3059 		unix_state_lock(other);
3060 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3061 		unix_state_unlock(other);
3062 		other->sk_state_change(other);
3063 		if (peer_mode == SHUTDOWN_MASK)
3064 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3065 		else if (peer_mode & RCV_SHUTDOWN)
3066 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3067 	}
3068 	if (other)
3069 		sock_put(other);
3070 
3071 	return 0;
3072 }
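
/* Editor's note: a hedged userspace sketch of the peer notification done
 * above ("a" and "b" are the two ends of a connected SOCK_STREAM pair):
 *
 *	char buf[64];
 *
 *	shutdown(a, SHUT_WR);
 *	read(b, buf, sizeof(buf));	returns 0 (EOF) once queued data drains
 *
 * Further writes on "a" then fail with EPIPE and, unless MSG_NOSIGNAL is
 * used or SIGPIPE is ignored, raise SIGPIPE.
 */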
3073 
3074 long unix_inq_len(struct sock *sk)
3075 {
3076 	struct sk_buff *skb;
3077 	long amount = 0;
3078 
3079 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3080 		return -EINVAL;
3081 
3082 	spin_lock(&sk->sk_receive_queue.lock);
3083 	if (sk->sk_type == SOCK_STREAM ||
3084 	    sk->sk_type == SOCK_SEQPACKET) {
3085 		skb_queue_walk(&sk->sk_receive_queue, skb)
3086 			amount += unix_skb_len(skb);
3087 	} else {
3088 		skb = skb_peek(&sk->sk_receive_queue);
3089 		if (skb)
3090 			amount = skb->len;
3091 	}
3092 	spin_unlock(&sk->sk_receive_queue.lock);
3093 
3094 	return amount;
3095 }
3096 EXPORT_SYMBOL_GPL(unix_inq_len);
3097 
3098 long unix_outq_len(struct sock *sk)
3099 {
3100 	return sk_wmem_alloc_get(sk);
3101 }
3102 EXPORT_SYMBOL_GPL(unix_outq_len);
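
/* Editor's note: the two helpers above back the SIOCINQ and SIOCOUTQ ioctls.
 * A hedged userspace sketch, assuming <sys/ioctl.h> and <linux/sockios.h>
 * ("fd" is a placeholder):
 *
 *	int unread, unsent;
 *
 *	ioctl(fd, SIOCINQ, &unread);	unread bytes; for SOCK_DGRAM only the
 *					size of the next queued datagram
 *	ioctl(fd, SIOCOUTQ, &unsent);	bytes written but not yet consumed by
 *					the receiving end
 */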
3103 
3104 static int unix_open_file(struct sock *sk)
3105 {
3106 	struct path path;
3107 	struct file *f;
3108 	int fd;
3109 
3110 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3111 		return -EPERM;
3112 
3113 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3114 		return -ENOENT;
3115 
3116 	path = unix_sk(sk)->path;
3117 	if (!path.dentry)
3118 		return -ENOENT;
3119 
3120 	path_get(&path);
3121 
3122 	fd = get_unused_fd_flags(O_CLOEXEC);
3123 	if (fd < 0)
3124 		goto out;
3125 
3126 	f = dentry_open(&path, O_PATH, current_cred());
3127 	if (IS_ERR(f)) {
3128 		put_unused_fd(fd);
3129 		fd = PTR_ERR(f);
3130 		goto out;
3131 	}
3132 
3133 	fd_install(fd, f);
3134 out:
3135 	path_put(&path);
3136 
3137 	return fd;
3138 }
3139 
3140 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3141 {
3142 	struct sock *sk = sock->sk;
3143 	long amount = 0;
3144 	int err;
3145 
3146 	switch (cmd) {
3147 	case SIOCOUTQ:
3148 		amount = unix_outq_len(sk);
3149 		err = put_user(amount, (int __user *)arg);
3150 		break;
3151 	case SIOCINQ:
3152 		amount = unix_inq_len(sk);
3153 		if (amount < 0)
3154 			err = amount;
3155 		else
3156 			err = put_user(amount, (int __user *)arg);
3157 		break;
3158 	case SIOCUNIXFILE:
3159 		err = unix_open_file(sk);
3160 		break;
3161 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3162 	case SIOCATMARK:
3163 		{
3164 			struct unix_sock *u = unix_sk(sk);
3165 			struct sk_buff *skb;
3166 			int answ = 0;
3167 
3168 			mutex_lock(&u->iolock);
3169 
3170 			skb = skb_peek(&sk->sk_receive_queue);
3171 			if (skb) {
3172 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3173 				struct sk_buff *next_skb;
3174 
3175 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3176 
3177 				if (skb == oob_skb ||
3178 				    (!unix_skb_len(skb) &&
3179 				     (!oob_skb || next_skb == oob_skb)))
3180 					answ = 1;
3181 			}
3182 
3183 			mutex_unlock(&u->iolock);
3184 
3185 			err = put_user(answ, (int __user *)arg);
3186 		}
3187 		break;
3188 #endif
3189 	default:
3190 		err = -ENOIOCTLCMD;
3191 		break;
3192 	}
3193 	return err;
3194 }
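
/* Editor's note: a hedged userspace sketch of the SIOCATMARK branch above
 * (CONFIG_AF_UNIX_OOB builds only).  It answers "is the next byte the OOB
 * mark?", which lets a reader drain ordinary data up to the mark and then
 * fetch the out-of-band byte ("fd" is a placeholder):
 *
 *	int at_mark = 0;
 *	char c;
 *
 *	while (!ioctl(fd, SIOCATMARK, &at_mark) && !at_mark)
 *		read(fd, &c, 1);		consume ordinary bytes
 *
 *	recv(fd, &c, 1, MSG_OOB);		fetch the byte at the mark
 */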
3195 
3196 #ifdef CONFIG_COMPAT
3197 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3198 {
3199 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3200 }
3201 #endif
3202 
3203 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3204 {
3205 	struct sock *sk = sock->sk;
3206 	unsigned char state;
3207 	__poll_t mask;
3208 	u8 shutdown;
3209 
3210 	sock_poll_wait(file, sock, wait);
3211 	mask = 0;
3212 	shutdown = READ_ONCE(sk->sk_shutdown);
3213 	state = READ_ONCE(sk->sk_state);
3214 
3215 	/* exceptional events? */
3216 	if (READ_ONCE(sk->sk_err))
3217 		mask |= EPOLLERR;
3218 	if (shutdown == SHUTDOWN_MASK)
3219 		mask |= EPOLLHUP;
3220 	if (shutdown & RCV_SHUTDOWN)
3221 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3222 
3223 	/* readable? */
3224 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3225 		mask |= EPOLLIN | EPOLLRDNORM;
3226 	if (sk_is_readable(sk))
3227 		mask |= EPOLLIN | EPOLLRDNORM;
3228 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3229 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3230 		mask |= EPOLLPRI;
3231 #endif
3232 
3233 	/* Connection-based sockets need to check for termination and startup */
3234 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3235 	    state == TCP_CLOSE)
3236 		mask |= EPOLLHUP;
3237 
3238 	/*
3239 	 * We set writable also when the other side has shut down the
3240 	 * connection. This prevents stuck sockets.
3241 	 */
3242 	if (unix_writable(sk, state))
3243 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3244 
3245 	return mask;
3246 }
3247 
3248 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3249 				    poll_table *wait)
3250 {
3251 	struct sock *sk = sock->sk, *other;
3252 	unsigned int writable;
3253 	unsigned char state;
3254 	__poll_t mask;
3255 	u8 shutdown;
3256 
3257 	sock_poll_wait(file, sock, wait);
3258 	mask = 0;
3259 	shutdown = READ_ONCE(sk->sk_shutdown);
3260 	state = READ_ONCE(sk->sk_state);
3261 
3262 	/* exceptional events? */
3263 	if (READ_ONCE(sk->sk_err) ||
3264 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3265 		mask |= EPOLLERR |
3266 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3267 
3268 	if (shutdown & RCV_SHUTDOWN)
3269 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3270 	if (shutdown == SHUTDOWN_MASK)
3271 		mask |= EPOLLHUP;
3272 
3273 	/* readable? */
3274 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3275 		mask |= EPOLLIN | EPOLLRDNORM;
3276 	if (sk_is_readable(sk))
3277 		mask |= EPOLLIN | EPOLLRDNORM;
3278 
3279 	/* Connection-based sockets need to check for termination and startup */
3280 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3281 		mask |= EPOLLHUP;
3282 
3283 	/* No write status requested, avoid expensive OUT tests. */
3284 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3285 		return mask;
3286 
3287 	writable = unix_writable(sk, state);
3288 	if (writable) {
3289 		unix_state_lock(sk);
3290 
3291 		other = unix_peer(sk);
3292 		if (other && unix_peer(other) != sk &&
3293 		    unix_recvq_full_lockless(other) &&
3294 		    unix_dgram_peer_wake_me(sk, other))
3295 			writable = 0;
3296 
3297 		unix_state_unlock(sk);
3298 	}
3299 
3300 	if (writable)
3301 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3302 	else
3303 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3304 
3305 	return mask;
3306 }
3307 
3308 #ifdef CONFIG_PROC_FS
3309 
3310 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3311 
3312 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3313 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3314 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
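
/* Editor's note: the seq_file position packs the hash bucket into the high
 * bits and a 1-based offset within that bucket into the low BUCKET_SPACE
 * bits.  For any offset small enough to fit, the macros round-trip:
 *
 *	loff_t pos = set_bucket_offset(3, 5);
 *
 *	get_bucket(pos) == 3;
 *	get_offset(pos) == 5;
 */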
3315 
3316 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3317 {
3318 	unsigned long offset = get_offset(*pos);
3319 	unsigned long bucket = get_bucket(*pos);
3320 	unsigned long count = 0;
3321 	struct sock *sk;
3322 
3323 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3324 	     sk; sk = sk_next(sk)) {
3325 		if (++count == offset)
3326 			break;
3327 	}
3328 
3329 	return sk;
3330 }
3331 
3332 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3333 {
3334 	unsigned long bucket = get_bucket(*pos);
3335 	struct net *net = seq_file_net(seq);
3336 	struct sock *sk;
3337 
3338 	while (bucket < UNIX_HASH_SIZE) {
3339 		spin_lock(&net->unx.table.locks[bucket]);
3340 
3341 		sk = unix_from_bucket(seq, pos);
3342 		if (sk)
3343 			return sk;
3344 
3345 		spin_unlock(&net->unx.table.locks[bucket]);
3346 
3347 		*pos = set_bucket_offset(++bucket, 1);
3348 	}
3349 
3350 	return NULL;
3351 }
3352 
3353 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3354 				  loff_t *pos)
3355 {
3356 	unsigned long bucket = get_bucket(*pos);
3357 
3358 	sk = sk_next(sk);
3359 	if (sk)
3360 		return sk;
3361 
3363 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3364 
3365 	*pos = set_bucket_offset(++bucket, 1);
3366 
3367 	return unix_get_first(seq, pos);
3368 }
3369 
3370 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3371 {
3372 	if (!*pos)
3373 		return SEQ_START_TOKEN;
3374 
3375 	return unix_get_first(seq, pos);
3376 }
3377 
3378 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3379 {
3380 	++*pos;
3381 
3382 	if (v == SEQ_START_TOKEN)
3383 		return unix_get_first(seq, pos);
3384 
3385 	return unix_get_next(seq, v, pos);
3386 }
3387 
3388 static void unix_seq_stop(struct seq_file *seq, void *v)
3389 {
3390 	struct sock *sk = v;
3391 
3392 	if (sk)
3393 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3394 }
3395 
3396 static int unix_seq_show(struct seq_file *seq, void *v)
3397 {
3399 	if (v == SEQ_START_TOKEN)
3400 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3401 			 "Inode Path\n");
3402 	else {
3403 		struct sock *s = v;
3404 		struct unix_sock *u = unix_sk(s);
3405 		unix_state_lock(s);
3406 
3407 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3408 			s,
3409 			refcount_read(&s->sk_refcnt),
3410 			0,
3411 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3412 			s->sk_type,
3413 			s->sk_socket ?
3414 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3415 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3416 			sock_i_ino(s));
3417 
3418 		if (u->addr) {	/* under a hash table lock here */
3419 			int i, len;
3420 			seq_putc(seq, ' ');
3421 
3422 			i = 0;
3423 			len = u->addr->len -
3424 				offsetof(struct sockaddr_un, sun_path);
3425 			if (u->addr->name->sun_path[0]) {
3426 				len--;
3427 			} else {
3428 				seq_putc(seq, '@');
3429 				i++;
3430 			}
3431 			for ( ; i < len; i++)
3432 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3433 					 '@');
3434 		}
3435 		unix_state_unlock(s);
3436 		seq_putc(seq, '\n');
3437 	}
3438 
3439 	return 0;
3440 }
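
/* Editor's note: a hedged illustration of the format assembled above.  A
 * listening SOCK_STREAM socket bound to /run/example.sock might show up in
 * /proc/net/unix as (pointer zeroed as for unprivileged readers, refcount
 * and inode purely illustrative):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	0000000000000000: 00000002 00000000 00010000 0001 01 23456 /run/example.sock
 */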
3441 
3442 static const struct seq_operations unix_seq_ops = {
3443 	.start  = unix_seq_start,
3444 	.next   = unix_seq_next,
3445 	.stop   = unix_seq_stop,
3446 	.show   = unix_seq_show,
3447 };
3448 
3449 #ifdef CONFIG_BPF_SYSCALL
3450 struct bpf_unix_iter_state {
3451 	struct seq_net_private p;
3452 	unsigned int cur_sk;
3453 	unsigned int end_sk;
3454 	unsigned int max_sk;
3455 	struct sock **batch;
3456 	bool st_bucket_done;
3457 };
3458 
3459 struct bpf_iter__unix {
3460 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3461 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3462 	uid_t uid __aligned(8);
3463 };
3464 
3465 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3466 			      struct unix_sock *unix_sk, uid_t uid)
3467 {
3468 	struct bpf_iter__unix ctx;
3469 
3470 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3471 	ctx.meta = meta;
3472 	ctx.unix_sk = unix_sk;
3473 	ctx.uid = uid;
3474 	return bpf_iter_run_prog(prog, &ctx);
3475 }
3476 
3477 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3479 {
3480 	struct bpf_unix_iter_state *iter = seq->private;
3481 	unsigned int expected = 1;
3482 	struct sock *sk;
3483 
3484 	sock_hold(start_sk);
3485 	iter->batch[iter->end_sk++] = start_sk;
3486 
3487 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3488 		if (iter->end_sk < iter->max_sk) {
3489 			sock_hold(sk);
3490 			iter->batch[iter->end_sk++] = sk;
3491 		}
3492 
3493 		expected++;
3494 	}
3495 
3496 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3497 
3498 	return expected;
3499 }
3500 
3501 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3502 {
3503 	while (iter->cur_sk < iter->end_sk)
3504 		sock_put(iter->batch[iter->cur_sk++]);
3505 }
3506 
3507 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3508 				       unsigned int new_batch_sz)
3509 {
3510 	struct sock **new_batch;
3511 
3512 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3513 			     GFP_USER | __GFP_NOWARN);
3514 	if (!new_batch)
3515 		return -ENOMEM;
3516 
3517 	bpf_iter_unix_put_batch(iter);
3518 	kvfree(iter->batch);
3519 	iter->batch = new_batch;
3520 	iter->max_sk = new_batch_sz;
3521 
3522 	return 0;
3523 }
3524 
3525 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3526 					loff_t *pos)
3527 {
3528 	struct bpf_unix_iter_state *iter = seq->private;
3529 	unsigned int expected;
3530 	bool resized = false;
3531 	struct sock *sk;
3532 
3533 	if (iter->st_bucket_done)
3534 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3535 
3536 again:
3537 	/* Get a new batch */
3538 	iter->cur_sk = 0;
3539 	iter->end_sk = 0;
3540 
3541 	sk = unix_get_first(seq, pos);
3542 	if (!sk)
3543 		return NULL; /* Done */
3544 
3545 	expected = bpf_iter_unix_hold_batch(seq, sk);
3546 
3547 	if (iter->end_sk == expected) {
3548 		iter->st_bucket_done = true;
3549 		return sk;
3550 	}
3551 
3552 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3553 		resized = true;
3554 		goto again;
3555 	}
3556 
3557 	return sk;
3558 }
3559 
3560 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3561 {
3562 	if (!*pos)
3563 		return SEQ_START_TOKEN;
3564 
3565 	/* bpf iter does not support lseek, so it always
3566 	 * continues from where it was stop()-ped.
3567 	 */
3568 	return bpf_iter_unix_batch(seq, pos);
3569 }
3570 
3571 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3572 {
3573 	struct bpf_unix_iter_state *iter = seq->private;
3574 	struct sock *sk;
3575 
3576 	/* Whenever seq_next() is called, the iter->cur_sk is
3577 	 * done with seq_show(), so advance to the next sk in
3578 	 * the batch.
3579 	 */
3580 	if (iter->cur_sk < iter->end_sk)
3581 		sock_put(iter->batch[iter->cur_sk++]);
3582 
3583 	++*pos;
3584 
3585 	if (iter->cur_sk < iter->end_sk)
3586 		sk = iter->batch[iter->cur_sk];
3587 	else
3588 		sk = bpf_iter_unix_batch(seq, pos);
3589 
3590 	return sk;
3591 }
3592 
3593 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3594 {
3595 	struct bpf_iter_meta meta;
3596 	struct bpf_prog *prog;
3597 	struct sock *sk = v;
3598 	uid_t uid;
3599 	bool slow;
3600 	int ret;
3601 
3602 	if (v == SEQ_START_TOKEN)
3603 		return 0;
3604 
3605 	slow = lock_sock_fast(sk);
3606 
3607 	if (unlikely(sk_unhashed(sk))) {
3608 		ret = SEQ_SKIP;
3609 		goto unlock;
3610 	}
3611 
3612 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3613 	meta.seq = seq;
3614 	prog = bpf_iter_get_info(&meta, false);
3615 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3616 unlock:
3617 	unlock_sock_fast(sk, slow);
3618 	return ret;
3619 }
3620 
3621 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3622 {
3623 	struct bpf_unix_iter_state *iter = seq->private;
3624 	struct bpf_iter_meta meta;
3625 	struct bpf_prog *prog;
3626 
3627 	if (!v) {
3628 		meta.seq = seq;
3629 		prog = bpf_iter_get_info(&meta, true);
3630 		if (prog)
3631 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3632 	}
3633 
3634 	if (iter->cur_sk < iter->end_sk)
3635 		bpf_iter_unix_put_batch(iter);
3636 }
3637 
3638 static const struct seq_operations bpf_iter_unix_seq_ops = {
3639 	.start	= bpf_iter_unix_seq_start,
3640 	.next	= bpf_iter_unix_seq_next,
3641 	.stop	= bpf_iter_unix_seq_stop,
3642 	.show	= bpf_iter_unix_seq_show,
3643 };
3644 #endif
3645 #endif
3646 
3647 static const struct net_proto_family unix_family_ops = {
3648 	.family = PF_UNIX,
3649 	.create = unix_create,
3650 	.owner	= THIS_MODULE,
3651 };
3652 
3654 static int __net_init unix_net_init(struct net *net)
3655 {
3656 	int i;
3657 
3658 	net->unx.sysctl_max_dgram_qlen = 10;
3659 	if (unix_sysctl_register(net))
3660 		goto out;
3661 
3662 #ifdef CONFIG_PROC_FS
3663 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3664 			     sizeof(struct seq_net_private)))
3665 		goto err_sysctl;
3666 #endif
3667 
3668 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3669 					      sizeof(spinlock_t), GFP_KERNEL);
3670 	if (!net->unx.table.locks)
3671 		goto err_proc;
3672 
3673 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3674 						sizeof(struct hlist_head),
3675 						GFP_KERNEL);
3676 	if (!net->unx.table.buckets)
3677 		goto free_locks;
3678 
3679 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3680 		spin_lock_init(&net->unx.table.locks[i]);
3681 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3682 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3683 	}
3684 
3685 	return 0;
3686 
3687 free_locks:
3688 	kvfree(net->unx.table.locks);
3689 err_proc:
3690 #ifdef CONFIG_PROC_FS
3691 	remove_proc_entry("unix", net->proc_net);
3692 err_sysctl:
3693 #endif
3694 	unix_sysctl_unregister(net);
3695 out:
3696 	return -ENOMEM;
3697 }
3698 
3699 static void __net_exit unix_net_exit(struct net *net)
3700 {
3701 	kvfree(net->unx.table.buckets);
3702 	kvfree(net->unx.table.locks);
3703 	unix_sysctl_unregister(net);
3704 	remove_proc_entry("unix", net->proc_net);
3705 }
3706 
3707 static struct pernet_operations unix_net_ops = {
3708 	.init = unix_net_init,
3709 	.exit = unix_net_exit,
3710 };
3711 
3712 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3713 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3714 		     struct unix_sock *unix_sk, uid_t uid)
3715 
3716 #define INIT_BATCH_SZ 16
3717 
3718 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3719 {
3720 	struct bpf_unix_iter_state *iter = priv_data;
3721 	int err;
3722 
3723 	err = bpf_iter_init_seq_net(priv_data, aux);
3724 	if (err)
3725 		return err;
3726 
3727 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3728 	if (err) {
3729 		bpf_iter_fini_seq_net(priv_data);
3730 		return err;
3731 	}
3732 
3733 	return 0;
3734 }
3735 
3736 static void bpf_iter_fini_unix(void *priv_data)
3737 {
3738 	struct bpf_unix_iter_state *iter = priv_data;
3739 
3740 	bpf_iter_fini_seq_net(priv_data);
3741 	kvfree(iter->batch);
3742 }
3743 
3744 static const struct bpf_iter_seq_info unix_seq_info = {
3745 	.seq_ops		= &bpf_iter_unix_seq_ops,
3746 	.init_seq_private	= bpf_iter_init_unix,
3747 	.fini_seq_private	= bpf_iter_fini_unix,
3748 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3749 };
3750 
3751 static const struct bpf_func_proto *
3752 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3753 			     const struct bpf_prog *prog)
3754 {
3755 	switch (func_id) {
3756 	case BPF_FUNC_setsockopt:
3757 		return &bpf_sk_setsockopt_proto;
3758 	case BPF_FUNC_getsockopt:
3759 		return &bpf_sk_getsockopt_proto;
3760 	default:
3761 		return NULL;
3762 	}
3763 }
3764 
3765 static struct bpf_iter_reg unix_reg_info = {
3766 	.target			= "unix",
3767 	.ctx_arg_info_size	= 1,
3768 	.ctx_arg_info		= {
3769 		{ offsetof(struct bpf_iter__unix, unix_sk),
3770 		  PTR_TO_BTF_ID_OR_NULL },
3771 	},
3772 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3773 	.seq_info		= &unix_seq_info,
3774 };
3775 
3776 static void __init bpf_iter_register(void)
3777 {
3778 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3779 	if (bpf_iter_reg_target(&unix_reg_info))
3780 		pr_warn("Warning: could not register bpf iterator unix\n");
3781 }
3782 #endif
3783 
3784 static int __init af_unix_init(void)
3785 {
3786 	int i, rc = -1;
3787 
3788 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3789 
3790 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3791 		spin_lock_init(&bsd_socket_locks[i]);
3792 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3793 	}
3794 
3795 	rc = proto_register(&unix_dgram_proto, 1);
3796 	if (rc != 0) {
3797 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3798 		goto out;
3799 	}
3800 
3801 	rc = proto_register(&unix_stream_proto, 1);
3802 	if (rc != 0) {
3803 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3804 		proto_unregister(&unix_dgram_proto);
3805 		goto out;
3806 	}
3807 
3808 	sock_register(&unix_family_ops);
3809 	register_pernet_subsys(&unix_net_ops);
3810 	unix_bpf_build_proto();
3811 
3812 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3813 	bpf_iter_register();
3814 #endif
3815 
3816 out:
3817 	return rc;
3818 }
3819 
3820 /* Later than subsys_initcall() because we depend on stuff initialised there */
3821 fs_initcall(af_unix_init);
3822