xref: /linux/net/unix/af_unix.c (revision b01b59a4fa87831b8504f1e8fc553ce599e7362d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge number
34  *					of sockets being hashed (this is for
35  *					unix_gc() performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skbs queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120 
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124 
125 /* SMP locking strategy:
126  *    the hash table is protected by a spinlock.
127  *    each socket's state is protected by a separate spinlock.
128  */
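
/* A rough sketch of the ordering rules that the lock_cmp_fn helpers below
 * encode for lockdep (see each helper for the authoritative version):
 *    table bucket locks and socket state locks taken in pairs are
 *    acquired in ascending address order;
 *    in unix_stream_connect() a listener's state lock may be taken
 *    before a non-listener's;
 *    a listener's receive-queue lock is taken before its embryos'
 *    (unix_collect_skb()).
 */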
129 #ifdef CONFIG_PROVE_LOCKING
130 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
131 
132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133 				  const struct lockdep_map *b)
134 {
135 	return cmp_ptr(a, b);
136 }
137 
138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139 				  const struct lockdep_map *_b)
140 {
141 	const struct unix_sock *a, *b;
142 
143 	a = container_of(_a, struct unix_sock, lock.dep_map);
144 	b = container_of(_b, struct unix_sock, lock.dep_map);
145 
146 	if (a->sk.sk_state == TCP_LISTEN) {
147 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
148 		 *
149 		 *   1. a is TCP_LISTEN.
150 		 *   2. b is not a.
151 		 *   3. concurrent connect(b -> a) must fail.
152 		 *
153 		 * Except for 2. & 3., b's state can be any possible
154 		 * value due to concurrent connect() or listen().
155 		 *
156 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
157 		 * be expressed as lock_cmp_fn.
158 		 */
159 		switch (b->sk.sk_state) {
160 		case TCP_CLOSE:
161 		case TCP_ESTABLISHED:
162 		case TCP_LISTEN:
163 			return -1;
164 		default:
165 			/* Invalid case. */
166 			return 0;
167 		}
168 	}
169 
170 	/* Should never happen.  Just to be symmetric. */
171 	if (b->sk.sk_state == TCP_LISTEN) {
172 		switch (a->sk.sk_state) {
173 		case TCP_CLOSE:
174 		case TCP_ESTABLISHED:
175 			return 1;
176 		default:
177 			return 0;
178 		}
179 	}
180 
181 	/* unix_state_double_lock(): ascending address order. */
182 	return cmp_ptr(a, b);
183 }
184 
185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186 				  const struct lockdep_map *_b)
187 {
188 	const struct sock *a, *b;
189 
190 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192 
193 	/* unix_collect_skb(): listener -> embryo order. */
194 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195 		return -1;
196 
197 	/* Should never happen.  Just to be symmetric. */
198 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199 		return 1;
200 
201 	return 0;
202 }
203 #endif
204 
205 static unsigned int unix_unbound_hash(struct sock *sk)
206 {
207 	unsigned long hash = (unsigned long)sk;
208 
209 	hash ^= hash >> 16;
210 	hash ^= hash >> 8;
211 	hash ^= sk->sk_type;
212 
213 	return hash & UNIX_HASH_MOD;
214 }
215 
216 static unsigned int unix_bsd_hash(struct inode *i)
217 {
218 	return i->i_ino & UNIX_HASH_MOD;
219 }
220 
221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222 				       int addr_len, int type)
223 {
224 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
225 	unsigned int hash;
226 
227 	hash = (__force unsigned int)csum_fold(csum);
228 	hash ^= hash >> 8;
229 	hash ^= type;
230 
231 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232 }
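
/* A sketch of the resulting hash layout (derived from the helpers above):
 * unbound and pathname (BSD) sockets hash into [0, UNIX_HASH_MOD],
 * abstract sockets into [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1].
 * The two name spaces therefore never share a bucket, and, assuming
 * UNIX_HASH_SIZE == 2 * (UNIX_HASH_MOD + 1), a pathname socket's
 * sk_hash always falls in the lower half, which is why
 * bsd_socket_buckets[] above needs only UNIX_HASH_SIZE / 2 entries.
 */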
233 
234 static void unix_table_double_lock(struct net *net,
235 				   unsigned int hash1, unsigned int hash2)
236 {
237 	if (hash1 == hash2) {
238 		spin_lock(&net->unx.table.locks[hash1]);
239 		return;
240 	}
241 
242 	if (hash1 > hash2)
243 		swap(hash1, hash2);
244 
245 	spin_lock(&net->unx.table.locks[hash1]);
246 	spin_lock(&net->unx.table.locks[hash2]);
247 }
248 
249 static void unix_table_double_unlock(struct net *net,
250 				     unsigned int hash1, unsigned int hash2)
251 {
252 	if (hash1 == hash2) {
253 		spin_unlock(&net->unx.table.locks[hash1]);
254 		return;
255 	}
256 
257 	spin_unlock(&net->unx.table.locks[hash1]);
258 	spin_unlock(&net->unx.table.locks[hash2]);
259 }
260 
261 #ifdef CONFIG_SECURITY_NETWORK
262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263 {
264 	UNIXCB(skb).secid = scm->secid;
265 }
266 
267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268 {
269 	scm->secid = UNIXCB(skb).secid;
270 }
271 
272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273 {
274 	return (scm->secid == UNIXCB(skb).secid);
275 }
276 #else
277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278 { }
279 
280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281 { }
282 
283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284 {
285 	return true;
286 }
287 #endif /* CONFIG_SECURITY_NETWORK */
288 
289 static inline int unix_may_send(struct sock *sk, struct sock *osk)
290 {
291 	return !unix_peer(osk) || unix_peer(osk) == sk;
292 }
293 
294 static inline int unix_recvq_full_lockless(const struct sock *sk)
295 {
296 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
297 }
298 
299 struct sock *unix_peer_get(struct sock *s)
300 {
301 	struct sock *peer;
302 
303 	unix_state_lock(s);
304 	peer = unix_peer(s);
305 	if (peer)
306 		sock_hold(peer);
307 	unix_state_unlock(s);
308 	return peer;
309 }
310 EXPORT_SYMBOL_GPL(unix_peer_get);
311 
312 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
313 					     int addr_len)
314 {
315 	struct unix_address *addr;
316 
317 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
318 	if (!addr)
319 		return NULL;
320 
321 	refcount_set(&addr->refcnt, 1);
322 	addr->len = addr_len;
323 	memcpy(addr->name, sunaddr, addr_len);
324 
325 	return addr;
326 }
327 
328 static inline void unix_release_addr(struct unix_address *addr)
329 {
330 	if (refcount_dec_and_test(&addr->refcnt))
331 		kfree(addr);
332 }
333 
334 /*
335  *	Check unix socket name:
336  *		- it must not be zero length.
337  *		- if it does not start with a zero byte, it must be NUL-terminated (FS object)
338  *		- if it starts with a zero byte, it is an abstract name.
339  */
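
/* Userspace illustration (not kernel code; "fd" and the names are made up)
 * of the three accepted address forms:
 *
 *	struct sockaddr_un un = { .sun_family = AF_UNIX };
 *
 *	// pathname socket: NUL-terminated filesystem path
 *	strcpy(un.sun_path, "/tmp/mysock");
 *	bind(fd, (struct sockaddr *)&un, sizeof(un));
 *
 *	// abstract socket: first byte of sun_path is 0, the name is the
 *	// remaining (not necessarily NUL-terminated) bytes
 *	memcpy(un.sun_path, "\0name", 5);
 *	bind(fd, (struct sockaddr *)&un,
 *	     offsetof(struct sockaddr_un, sun_path) + 5);
 *
 *	// autobind: pass only the address family and the kernel picks an
 *	// abstract name (see unix_autobind() below)
 *	bind(fd, (struct sockaddr *)&un, sizeof(sa_family_t));
 */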
340 
341 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
342 {
343 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
344 	    addr_len > sizeof(*sunaddr))
345 		return -EINVAL;
346 
347 	if (sunaddr->sun_family != AF_UNIX)
348 		return -EINVAL;
349 
350 	return 0;
351 }
352 
353 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
354 {
355 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
356 	short offset = offsetof(struct sockaddr_storage, __data);
357 
358 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
359 
360 	/* This may look like an off by one error but it is a bit more
361 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
362 	 * sun_path[108] doesn't as such exist.  However in kernel space
363 	 * we are guaranteed that it is a valid memory location in our
364 	 * kernel address buffer because syscall functions always pass
365 	 * a pointer of struct sockaddr_storage which has a bigger buffer
366 	 * than 108.  Also, we must terminate sun_path for strlen() in
367 	 * getname_kernel().
368 	 */
369 	addr->__data[addr_len - offset] = 0;
370 
371 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
372 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
373 	 * know the actual buffer.
374 	 */
375 	return strlen(addr->__data) + offset + 1;
376 }
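
/* Worked example for unix_mkname_bsd() above (illustrative): a bind(2) call
 * passing a full, unterminated 108-byte sun_path with
 * addr_len == sizeof(struct sockaddr_un) (110 on common ABIs) gets its
 * terminating NUL written to __data[108], still inside the kernel's
 * sockaddr_storage copy, and the function returns 108 + 2 + 1 = 111.
 */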
377 
378 static void __unix_remove_socket(struct sock *sk)
379 {
380 	sk_del_node_init(sk);
381 }
382 
383 static void __unix_insert_socket(struct net *net, struct sock *sk)
384 {
385 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
386 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
387 }
388 
389 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
390 				 struct unix_address *addr, unsigned int hash)
391 {
392 	__unix_remove_socket(sk);
393 	smp_store_release(&unix_sk(sk)->addr, addr);
394 
395 	sk->sk_hash = hash;
396 	__unix_insert_socket(net, sk);
397 }
398 
399 static void unix_remove_socket(struct net *net, struct sock *sk)
400 {
401 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
402 	__unix_remove_socket(sk);
403 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
404 }
405 
406 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
407 {
408 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
409 	__unix_insert_socket(net, sk);
410 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
411 }
412 
413 static void unix_insert_bsd_socket(struct sock *sk)
414 {
415 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
416 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
417 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
418 }
419 
420 static void unix_remove_bsd_socket(struct sock *sk)
421 {
422 	if (!hlist_unhashed(&sk->sk_bind_node)) {
423 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
424 		__sk_del_bind_node(sk);
425 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
426 
427 		sk_node_init(&sk->sk_bind_node);
428 	}
429 }
430 
431 static struct sock *__unix_find_socket_byname(struct net *net,
432 					      struct sockaddr_un *sunname,
433 					      int len, unsigned int hash)
434 {
435 	struct sock *s;
436 
437 	sk_for_each(s, &net->unx.table.buckets[hash]) {
438 		struct unix_sock *u = unix_sk(s);
439 
440 		if (u->addr->len == len &&
441 		    !memcmp(u->addr->name, sunname, len))
442 			return s;
443 	}
444 	return NULL;
445 }
446 
447 static inline struct sock *unix_find_socket_byname(struct net *net,
448 						   struct sockaddr_un *sunname,
449 						   int len, unsigned int hash)
450 {
451 	struct sock *s;
452 
453 	spin_lock(&net->unx.table.locks[hash]);
454 	s = __unix_find_socket_byname(net, sunname, len, hash);
455 	if (s)
456 		sock_hold(s);
457 	spin_unlock(&net->unx.table.locks[hash]);
458 	return s;
459 }
460 
461 static struct sock *unix_find_socket_byinode(struct inode *i)
462 {
463 	unsigned int hash = unix_bsd_hash(i);
464 	struct sock *s;
465 
466 	spin_lock(&bsd_socket_locks[hash]);
467 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
468 		struct dentry *dentry = unix_sk(s)->path.dentry;
469 
470 		if (dentry && d_backing_inode(dentry) == i) {
471 			sock_hold(s);
472 			spin_unlock(&bsd_socket_locks[hash]);
473 			return s;
474 		}
475 	}
476 	spin_unlock(&bsd_socket_locks[hash]);
477 	return NULL;
478 }
479 
480 /* Support code for asymmetrically connected dgram sockets
481  *
482  * If a datagram socket is connected to a socket not itself connected
483  * to the first socket (eg, /dev/log), clients may only enqueue more
484  * messages if the present receive queue of the server socket is not
485  * "too large". This means there's a second writeability condition
486  * poll and sendmsg need to test. The dgram recv code will do a wake
487  * up on the peer_wait wait queue of a socket upon reception of a
488  * datagram which needs to be propagated to sleeping would-be writers
489  * since these might not have sent anything so far. This can't be
490  * accomplished via poll_wait because the lifetime of the server
491  * socket might be less than that of its clients if these break their
492  * association with it or if the server socket is closed while clients
493  * are still connected to it and there's no way to inform "a polling
494  * implementation" that it should let go of a certain wait queue
495  *
496  * In order to propagate a wake up, a wait_queue_entry_t of the client
497  * socket is enqueued on the peer_wait queue of the server socket
498  * whose wake function does a wake_up on the ordinary client socket
499  * wait queue. This connection is established whenever a write (or
500  * poll for write) hits the flow control condition and is broken when
501  * the association to the server socket is dissolved or after a wake up
502  * has been relayed.
503  */
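
/* A concrete scenario (illustrative): many clients connect(2) SOCK_DGRAM
 * sockets to a syslog daemon's socket (e.g. /dev/log) which is not
 * connected back to any of them.  A client whose sendmsg() or poll()
 * hits the full-queue condition gets its peer_wake entry hooked onto the
 * daemon's peer_wait queue via unix_dgram_peer_wake_connect() below.
 * When the daemon later dequeues a datagram, the dgram receive path
 * wakes peer_wait, unix_dgram_peer_wake_relay() forwards that wakeup to
 * the client's own wait queue and unhooks the entry again.
 */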
504 
505 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
506 				      void *key)
507 {
508 	struct unix_sock *u;
509 	wait_queue_head_t *u_sleep;
510 
511 	u = container_of(q, struct unix_sock, peer_wake);
512 
513 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
514 			    q);
515 	u->peer_wake.private = NULL;
516 
517 	/* relaying can only happen while the wq still exists */
518 	u_sleep = sk_sleep(&u->sk);
519 	if (u_sleep)
520 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
521 
522 	return 0;
523 }
524 
525 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
526 {
527 	struct unix_sock *u, *u_other;
528 	int rc;
529 
530 	u = unix_sk(sk);
531 	u_other = unix_sk(other);
532 	rc = 0;
533 	spin_lock(&u_other->peer_wait.lock);
534 
535 	if (!u->peer_wake.private) {
536 		u->peer_wake.private = other;
537 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
538 
539 		rc = 1;
540 	}
541 
542 	spin_unlock(&u_other->peer_wait.lock);
543 	return rc;
544 }
545 
546 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
547 					    struct sock *other)
548 {
549 	struct unix_sock *u, *u_other;
550 
551 	u = unix_sk(sk);
552 	u_other = unix_sk(other);
553 	spin_lock(&u_other->peer_wait.lock);
554 
555 	if (u->peer_wake.private == other) {
556 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
557 		u->peer_wake.private = NULL;
558 	}
559 
560 	spin_unlock(&u_other->peer_wait.lock);
561 }
562 
563 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
564 						   struct sock *other)
565 {
566 	unix_dgram_peer_wake_disconnect(sk, other);
567 	wake_up_interruptible_poll(sk_sleep(sk),
568 				   EPOLLOUT |
569 				   EPOLLWRNORM |
570 				   EPOLLWRBAND);
571 }
572 
573 /* preconditions:
574  *	- unix_peer(sk) == other
575  *	- association is stable
576  */
577 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
578 {
579 	int connected;
580 
581 	connected = unix_dgram_peer_wake_connect(sk, other);
582 
583 	/* If other is SOCK_DEAD, we want to make sure we signal
584 	 * POLLOUT, such that a subsequent write() can get a
585 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
586 	 * to other and it is full, we will hang waiting for POLLOUT.
587 	 */
588 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
589 		return 1;
590 
591 	if (connected)
592 		unix_dgram_peer_wake_disconnect(sk, other);
593 
594 	return 0;
595 }
596 
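
/* A socket counts as writable while its outstanding write memory is at
 * most a quarter of sk_sndbuf (the "<< 2" below).  For example, with a
 * 64 KiB send buffer, poll stops reporting EPOLLOUT once roughly 16 KiB
 * of skbs are queued but not yet consumed by the peer (numbers are
 * illustrative only).
 */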
597 static int unix_writable(const struct sock *sk, unsigned char state)
598 {
599 	return state != TCP_LISTEN &&
600 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
601 }
602 
603 static void unix_write_space(struct sock *sk)
604 {
605 	struct socket_wq *wq;
606 
607 	rcu_read_lock();
608 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
609 		wq = rcu_dereference(sk->sk_wq);
610 		if (skwq_has_sleeper(wq))
611 			wake_up_interruptible_sync_poll(&wq->wait,
612 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
613 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
614 	}
615 	rcu_read_unlock();
616 }
617 
618 /* When a dgram socket disconnects (or changes its peer), we clear its receive
619  * queue of packets that arrived from the previous peer. First, this allows us
620  * to do flow control based only on wmem_alloc; second, a sk connected to a peer
621  * may receive messages only from that peer. */
622 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
623 {
624 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
625 		skb_queue_purge(&sk->sk_receive_queue);
626 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
627 
628 		/* If one link of a bidirectional dgram pipe is disconnected,
629 		 * we signal an error. Messages are lost. Do not do this
630 		 * when the peer was not connected to us.
631 		 */
632 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
633 			WRITE_ONCE(other->sk_err, ECONNRESET);
634 			sk_error_report(other);
635 		}
636 	}
637 }
638 
639 static void unix_sock_destructor(struct sock *sk)
640 {
641 	struct unix_sock *u = unix_sk(sk);
642 
643 	skb_queue_purge(&sk->sk_receive_queue);
644 
645 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
646 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
647 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
648 	if (!sock_flag(sk, SOCK_DEAD)) {
649 		pr_info("Attempt to release alive unix socket: %p\n", sk);
650 		return;
651 	}
652 
653 	if (u->addr)
654 		unix_release_addr(u->addr);
655 
656 	atomic_long_dec(&unix_nr_socks);
657 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
658 #ifdef UNIX_REFCNT_DEBUG
659 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
660 		atomic_long_read(&unix_nr_socks));
661 #endif
662 }
663 
664 static void unix_release_sock(struct sock *sk, int embrion)
665 {
666 	struct unix_sock *u = unix_sk(sk);
667 	struct sock *skpair;
668 	struct sk_buff *skb;
669 	struct path path;
670 	int state;
671 
672 	unix_remove_socket(sock_net(sk), sk);
673 	unix_remove_bsd_socket(sk);
674 
675 	/* Clear state */
676 	unix_state_lock(sk);
677 	sock_orphan(sk);
678 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
679 	path	     = u->path;
680 	u->path.dentry = NULL;
681 	u->path.mnt = NULL;
682 	state = sk->sk_state;
683 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
684 
685 	skpair = unix_peer(sk);
686 	unix_peer(sk) = NULL;
687 
688 	unix_state_unlock(sk);
689 
690 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
691 	u->oob_skb = NULL;
692 #endif
693 
694 	wake_up_interruptible_all(&u->peer_wait);
695 
696 	if (skpair != NULL) {
697 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
698 			unix_state_lock(skpair);
699 			/* No more writes */
700 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
701 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
702 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
703 			unix_state_unlock(skpair);
704 			skpair->sk_state_change(skpair);
705 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
706 		}
707 
708 		unix_dgram_peer_wake_disconnect(sk, skpair);
709 		sock_put(skpair); /* It may now die */
710 	}
711 
712 	/* Try to flush out this socket. Throw out buffers at least */
713 
714 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
715 		if (state == TCP_LISTEN)
716 			unix_release_sock(skb->sk, 1);
717 
718 		/* passed fds are erased in the kfree_skb hook	      */
719 		kfree_skb(skb);
720 	}
721 
722 	if (path.dentry)
723 		path_put(&path);
724 
725 	sock_put(sk);
726 
727 	/* ---- Socket is dead now and most probably destroyed ---- */
728 
729 	/*
730 	 * Fixme: BSD difference: In BSD all sockets connected to us get
731 	 *	  ECONNRESET and we die on the spot. In Linux we behave
732 	 *	  like files and pipes do and wait for the last
733 	 *	  dereference.
734 	 *
735 	 * Can't we simply set sock->err?
736 	 *
737 	 *	  What the above comment does talk about? --ANK(980817)
738 	 */
739 
740 	if (READ_ONCE(unix_tot_inflight))
741 		unix_gc();		/* Garbage collect fds */
742 }
743 
744 static void init_peercred(struct sock *sk)
745 {
746 	sk->sk_peer_pid = get_pid(task_tgid(current));
747 	sk->sk_peer_cred = get_current_cred();
748 }
749 
750 static void update_peercred(struct sock *sk)
751 {
752 	const struct cred *old_cred;
753 	struct pid *old_pid;
754 
755 	spin_lock(&sk->sk_peer_lock);
756 	old_pid = sk->sk_peer_pid;
757 	old_cred = sk->sk_peer_cred;
758 	init_peercred(sk);
759 	spin_unlock(&sk->sk_peer_lock);
760 
761 	put_pid(old_pid);
762 	put_cred(old_cred);
763 }
764 
765 static void copy_peercred(struct sock *sk, struct sock *peersk)
766 {
767 	lockdep_assert_held(&unix_sk(peersk)->lock);
768 
769 	spin_lock(&sk->sk_peer_lock);
770 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
771 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
772 	spin_unlock(&sk->sk_peer_lock);
773 }
774 
775 static int unix_listen(struct socket *sock, int backlog)
776 {
777 	int err;
778 	struct sock *sk = sock->sk;
779 	struct unix_sock *u = unix_sk(sk);
780 
781 	err = -EOPNOTSUPP;
782 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
783 		goto out;	/* Only stream/seqpacket sockets accept */
784 	err = -EINVAL;
785 	if (!READ_ONCE(u->addr))
786 		goto out;	/* No listens on an unbound socket */
787 	unix_state_lock(sk);
788 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
789 		goto out_unlock;
790 	if (backlog > sk->sk_max_ack_backlog)
791 		wake_up_interruptible_all(&u->peer_wait);
792 	sk->sk_max_ack_backlog	= backlog;
793 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
794 
795 	/* set credentials so connect can copy them */
796 	update_peercred(sk);
797 	err = 0;
798 
799 out_unlock:
800 	unix_state_unlock(sk);
801 out:
802 	return err;
803 }
804 
805 static int unix_release(struct socket *);
806 static int unix_bind(struct socket *, struct sockaddr *, int);
807 static int unix_stream_connect(struct socket *, struct sockaddr *,
808 			       int addr_len, int flags);
809 static int unix_socketpair(struct socket *, struct socket *);
810 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
811 static int unix_getname(struct socket *, struct sockaddr *, int);
812 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
813 static __poll_t unix_dgram_poll(struct file *, struct socket *,
814 				    poll_table *);
815 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
816 #ifdef CONFIG_COMPAT
817 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
818 #endif
819 static int unix_shutdown(struct socket *, int);
820 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
821 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
822 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
823 				       struct pipe_inode_info *, size_t size,
824 				       unsigned int flags);
825 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
826 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
827 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
828 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
829 static int unix_dgram_connect(struct socket *, struct sockaddr *,
830 			      int, int);
831 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
832 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
833 				  int);
834 
835 #ifdef CONFIG_PROC_FS
836 static int unix_count_nr_fds(struct sock *sk)
837 {
838 	struct sk_buff *skb;
839 	struct unix_sock *u;
840 	int nr_fds = 0;
841 
842 	spin_lock(&sk->sk_receive_queue.lock);
843 	skb = skb_peek(&sk->sk_receive_queue);
844 	while (skb) {
845 		u = unix_sk(skb->sk);
846 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
847 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
848 	}
849 	spin_unlock(&sk->sk_receive_queue.lock);
850 
851 	return nr_fds;
852 }
853 
854 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
855 {
856 	struct sock *sk = sock->sk;
857 	unsigned char s_state;
858 	struct unix_sock *u;
859 	int nr_fds = 0;
860 
861 	if (sk) {
862 		s_state = READ_ONCE(sk->sk_state);
863 		u = unix_sk(sk);
864 
865 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
866 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
867 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
868 		 */
869 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
870 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
871 		else if (s_state == TCP_LISTEN)
872 			nr_fds = unix_count_nr_fds(sk);
873 
874 		seq_printf(m, "scm_fds: %u\n", nr_fds);
875 	}
876 }
877 #else
878 #define unix_show_fdinfo NULL
879 #endif
880 
881 static const struct proto_ops unix_stream_ops = {
882 	.family =	PF_UNIX,
883 	.owner =	THIS_MODULE,
884 	.release =	unix_release,
885 	.bind =		unix_bind,
886 	.connect =	unix_stream_connect,
887 	.socketpair =	unix_socketpair,
888 	.accept =	unix_accept,
889 	.getname =	unix_getname,
890 	.poll =		unix_poll,
891 	.ioctl =	unix_ioctl,
892 #ifdef CONFIG_COMPAT
893 	.compat_ioctl =	unix_compat_ioctl,
894 #endif
895 	.listen =	unix_listen,
896 	.shutdown =	unix_shutdown,
897 	.sendmsg =	unix_stream_sendmsg,
898 	.recvmsg =	unix_stream_recvmsg,
899 	.read_skb =	unix_stream_read_skb,
900 	.mmap =		sock_no_mmap,
901 	.splice_read =	unix_stream_splice_read,
902 	.set_peek_off =	sk_set_peek_off,
903 	.show_fdinfo =	unix_show_fdinfo,
904 };
905 
906 static const struct proto_ops unix_dgram_ops = {
907 	.family =	PF_UNIX,
908 	.owner =	THIS_MODULE,
909 	.release =	unix_release,
910 	.bind =		unix_bind,
911 	.connect =	unix_dgram_connect,
912 	.socketpair =	unix_socketpair,
913 	.accept =	sock_no_accept,
914 	.getname =	unix_getname,
915 	.poll =		unix_dgram_poll,
916 	.ioctl =	unix_ioctl,
917 #ifdef CONFIG_COMPAT
918 	.compat_ioctl =	unix_compat_ioctl,
919 #endif
920 	.listen =	sock_no_listen,
921 	.shutdown =	unix_shutdown,
922 	.sendmsg =	unix_dgram_sendmsg,
923 	.read_skb =	unix_read_skb,
924 	.recvmsg =	unix_dgram_recvmsg,
925 	.mmap =		sock_no_mmap,
926 	.set_peek_off =	sk_set_peek_off,
927 	.show_fdinfo =	unix_show_fdinfo,
928 };
929 
930 static const struct proto_ops unix_seqpacket_ops = {
931 	.family =	PF_UNIX,
932 	.owner =	THIS_MODULE,
933 	.release =	unix_release,
934 	.bind =		unix_bind,
935 	.connect =	unix_stream_connect,
936 	.socketpair =	unix_socketpair,
937 	.accept =	unix_accept,
938 	.getname =	unix_getname,
939 	.poll =		unix_dgram_poll,
940 	.ioctl =	unix_ioctl,
941 #ifdef CONFIG_COMPAT
942 	.compat_ioctl =	unix_compat_ioctl,
943 #endif
944 	.listen =	unix_listen,
945 	.shutdown =	unix_shutdown,
946 	.sendmsg =	unix_seqpacket_sendmsg,
947 	.recvmsg =	unix_seqpacket_recvmsg,
948 	.mmap =		sock_no_mmap,
949 	.set_peek_off =	sk_set_peek_off,
950 	.show_fdinfo =	unix_show_fdinfo,
951 };
952 
953 static void unix_close(struct sock *sk, long timeout)
954 {
955 	/* Nothing to do here, unix socket does not need a ->close().
956 	 * This is merely for sockmap.
957 	 */
958 }
959 
960 static void unix_unhash(struct sock *sk)
961 {
962 	/* Nothing to do here, unix socket does not need a ->unhash().
963 	 * This is merely for sockmap.
964 	 */
965 }
966 
967 static bool unix_bpf_bypass_getsockopt(int level, int optname)
968 {
969 	if (level == SOL_SOCKET) {
970 		switch (optname) {
971 		case SO_PEERPIDFD:
972 			return true;
973 		default:
974 			return false;
975 		}
976 	}
977 
978 	return false;
979 }
980 
981 struct proto unix_dgram_proto = {
982 	.name			= "UNIX",
983 	.owner			= THIS_MODULE,
984 	.obj_size		= sizeof(struct unix_sock),
985 	.close			= unix_close,
986 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
987 #ifdef CONFIG_BPF_SYSCALL
988 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
989 #endif
990 };
991 
992 struct proto unix_stream_proto = {
993 	.name			= "UNIX-STREAM",
994 	.owner			= THIS_MODULE,
995 	.obj_size		= sizeof(struct unix_sock),
996 	.close			= unix_close,
997 	.unhash			= unix_unhash,
998 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
999 #ifdef CONFIG_BPF_SYSCALL
1000 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1001 #endif
1002 };
1003 
1004 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1005 {
1006 	struct unix_sock *u;
1007 	struct sock *sk;
1008 	int err;
1009 
1010 	atomic_long_inc(&unix_nr_socks);
1011 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1012 		err = -ENFILE;
1013 		goto err;
1014 	}
1015 
1016 	if (type == SOCK_STREAM)
1017 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1018 	else /* dgram and seqpacket */
1019 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1020 
1021 	if (!sk) {
1022 		err = -ENOMEM;
1023 		goto err;
1024 	}
1025 
1026 	sock_init_data(sock, sk);
1027 
1028 	sk->sk_hash		= unix_unbound_hash(sk);
1029 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1030 	sk->sk_write_space	= unix_write_space;
1031 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1032 	sk->sk_destruct		= unix_sock_destructor;
1033 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1034 
1035 	u = unix_sk(sk);
1036 	u->listener = NULL;
1037 	u->vertex = NULL;
1038 	u->path.dentry = NULL;
1039 	u->path.mnt = NULL;
1040 	spin_lock_init(&u->lock);
1041 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1042 	mutex_init(&u->iolock); /* single task reading lock */
1043 	mutex_init(&u->bindlock); /* single task binding lock */
1044 	init_waitqueue_head(&u->peer_wait);
1045 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1046 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1047 	unix_insert_unbound_socket(net, sk);
1048 
1049 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1050 
1051 	return sk;
1052 
1053 err:
1054 	atomic_long_dec(&unix_nr_socks);
1055 	return ERR_PTR(err);
1056 }
1057 
1058 static int unix_create(struct net *net, struct socket *sock, int protocol,
1059 		       int kern)
1060 {
1061 	struct sock *sk;
1062 
1063 	if (protocol && protocol != PF_UNIX)
1064 		return -EPROTONOSUPPORT;
1065 
1066 	sock->state = SS_UNCONNECTED;
1067 
1068 	switch (sock->type) {
1069 	case SOCK_STREAM:
1070 		sock->ops = &unix_stream_ops;
1071 		break;
1072 		/*
1073 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1074 		 *	nothing uses it.
1075 		 */
1076 	case SOCK_RAW:
1077 		sock->type = SOCK_DGRAM;
1078 		fallthrough;
1079 	case SOCK_DGRAM:
1080 		sock->ops = &unix_dgram_ops;
1081 		break;
1082 	case SOCK_SEQPACKET:
1083 		sock->ops = &unix_seqpacket_ops;
1084 		break;
1085 	default:
1086 		return -ESOCKTNOSUPPORT;
1087 	}
1088 
1089 	sk = unix_create1(net, sock, kern, sock->type);
1090 	if (IS_ERR(sk))
1091 		return PTR_ERR(sk);
1092 
1093 	return 0;
1094 }
1095 
1096 static int unix_release(struct socket *sock)
1097 {
1098 	struct sock *sk = sock->sk;
1099 
1100 	if (!sk)
1101 		return 0;
1102 
1103 	sk->sk_prot->close(sk, 0);
1104 	unix_release_sock(sk, 0);
1105 	sock->sk = NULL;
1106 
1107 	return 0;
1108 }
1109 
1110 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1111 				  int type)
1112 {
1113 	struct inode *inode;
1114 	struct path path;
1115 	struct sock *sk;
1116 	int err;
1117 
1118 	unix_mkname_bsd(sunaddr, addr_len);
1119 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1120 	if (err)
1121 		goto fail;
1122 
1123 	err = path_permission(&path, MAY_WRITE);
1124 	if (err)
1125 		goto path_put;
1126 
1127 	err = -ECONNREFUSED;
1128 	inode = d_backing_inode(path.dentry);
1129 	if (!S_ISSOCK(inode->i_mode))
1130 		goto path_put;
1131 
1132 	sk = unix_find_socket_byinode(inode);
1133 	if (!sk)
1134 		goto path_put;
1135 
1136 	err = -EPROTOTYPE;
1137 	if (sk->sk_type == type)
1138 		touch_atime(&path);
1139 	else
1140 		goto sock_put;
1141 
1142 	path_put(&path);
1143 
1144 	return sk;
1145 
1146 sock_put:
1147 	sock_put(sk);
1148 path_put:
1149 	path_put(&path);
1150 fail:
1151 	return ERR_PTR(err);
1152 }
1153 
1154 static struct sock *unix_find_abstract(struct net *net,
1155 				       struct sockaddr_un *sunaddr,
1156 				       int addr_len, int type)
1157 {
1158 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1159 	struct dentry *dentry;
1160 	struct sock *sk;
1161 
1162 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1163 	if (!sk)
1164 		return ERR_PTR(-ECONNREFUSED);
1165 
1166 	dentry = unix_sk(sk)->path.dentry;
1167 	if (dentry)
1168 		touch_atime(&unix_sk(sk)->path);
1169 
1170 	return sk;
1171 }
1172 
1173 static struct sock *unix_find_other(struct net *net,
1174 				    struct sockaddr_un *sunaddr,
1175 				    int addr_len, int type)
1176 {
1177 	struct sock *sk;
1178 
1179 	if (sunaddr->sun_path[0])
1180 		sk = unix_find_bsd(sunaddr, addr_len, type);
1181 	else
1182 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1183 
1184 	return sk;
1185 }
1186 
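
/* Autobind assigns an abstract name made of a leading 0 byte followed by
 * five lowercase hex digits, rendered by tools such as ss(8) as e.g.
 * "@abcde" (example name made up).  At most 0x100000 candidate names are
 * tried before giving up with -ENOSPC.
 */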
1187 static int unix_autobind(struct sock *sk)
1188 {
1189 	struct unix_sock *u = unix_sk(sk);
1190 	unsigned int new_hash, old_hash;
1191 	struct net *net = sock_net(sk);
1192 	struct unix_address *addr;
1193 	u32 lastnum, ordernum;
1194 	int err;
1195 
1196 	err = mutex_lock_interruptible(&u->bindlock);
1197 	if (err)
1198 		return err;
1199 
1200 	if (u->addr)
1201 		goto out;
1202 
1203 	err = -ENOMEM;
1204 	addr = kzalloc(sizeof(*addr) +
1205 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1206 	if (!addr)
1207 		goto out;
1208 
1209 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1210 	addr->name->sun_family = AF_UNIX;
1211 	refcount_set(&addr->refcnt, 1);
1212 
1213 	old_hash = sk->sk_hash;
1214 	ordernum = get_random_u32();
1215 	lastnum = ordernum & 0xFFFFF;
1216 retry:
1217 	ordernum = (ordernum + 1) & 0xFFFFF;
1218 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1219 
1220 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1221 	unix_table_double_lock(net, old_hash, new_hash);
1222 
1223 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1224 		unix_table_double_unlock(net, old_hash, new_hash);
1225 
1226 		/* __unix_find_socket_byname() may take a long time if many names
1227 		 * are already in use.
1228 		 */
1229 		cond_resched();
1230 
1231 		if (ordernum == lastnum) {
1232 			/* Give up if all names seem to be in use. */
1233 			err = -ENOSPC;
1234 			unix_release_addr(addr);
1235 			goto out;
1236 		}
1237 
1238 		goto retry;
1239 	}
1240 
1241 	__unix_set_addr_hash(net, sk, addr, new_hash);
1242 	unix_table_double_unlock(net, old_hash, new_hash);
1243 	err = 0;
1244 
1245 out:	mutex_unlock(&u->bindlock);
1246 	return err;
1247 }
1248 
1249 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1250 			 int addr_len)
1251 {
1252 	umode_t mode = S_IFSOCK |
1253 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1254 	struct unix_sock *u = unix_sk(sk);
1255 	unsigned int new_hash, old_hash;
1256 	struct net *net = sock_net(sk);
1257 	struct mnt_idmap *idmap;
1258 	struct unix_address *addr;
1259 	struct dentry *dentry;
1260 	struct path parent;
1261 	int err;
1262 
1263 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1264 	addr = unix_create_addr(sunaddr, addr_len);
1265 	if (!addr)
1266 		return -ENOMEM;
1267 
1268 	/*
1269 	 * Get the parent directory, calculate the hash for last
1270 	 * component.
1271 	 */
1272 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1273 	if (IS_ERR(dentry)) {
1274 		err = PTR_ERR(dentry);
1275 		goto out;
1276 	}
1277 
1278 	/*
1279 	 * All right, let's create it.
1280 	 */
1281 	idmap = mnt_idmap(parent.mnt);
1282 	err = security_path_mknod(&parent, dentry, mode, 0);
1283 	if (!err)
1284 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1285 	if (err)
1286 		goto out_path;
1287 	err = mutex_lock_interruptible(&u->bindlock);
1288 	if (err)
1289 		goto out_unlink;
1290 	if (u->addr)
1291 		goto out_unlock;
1292 
1293 	old_hash = sk->sk_hash;
1294 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1295 	unix_table_double_lock(net, old_hash, new_hash);
1296 	u->path.mnt = mntget(parent.mnt);
1297 	u->path.dentry = dget(dentry);
1298 	__unix_set_addr_hash(net, sk, addr, new_hash);
1299 	unix_table_double_unlock(net, old_hash, new_hash);
1300 	unix_insert_bsd_socket(sk);
1301 	mutex_unlock(&u->bindlock);
1302 	done_path_create(&parent, dentry);
1303 	return 0;
1304 
1305 out_unlock:
1306 	mutex_unlock(&u->bindlock);
1307 	err = -EINVAL;
1308 out_unlink:
1309 	/* failed after successful mknod?  unlink what we'd created... */
1310 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1311 out_path:
1312 	done_path_create(&parent, dentry);
1313 out:
1314 	unix_release_addr(addr);
1315 	return err == -EEXIST ? -EADDRINUSE : err;
1316 }
1317 
1318 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1319 			      int addr_len)
1320 {
1321 	struct unix_sock *u = unix_sk(sk);
1322 	unsigned int new_hash, old_hash;
1323 	struct net *net = sock_net(sk);
1324 	struct unix_address *addr;
1325 	int err;
1326 
1327 	addr = unix_create_addr(sunaddr, addr_len);
1328 	if (!addr)
1329 		return -ENOMEM;
1330 
1331 	err = mutex_lock_interruptible(&u->bindlock);
1332 	if (err)
1333 		goto out;
1334 
1335 	if (u->addr) {
1336 		err = -EINVAL;
1337 		goto out_mutex;
1338 	}
1339 
1340 	old_hash = sk->sk_hash;
1341 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1342 	unix_table_double_lock(net, old_hash, new_hash);
1343 
1344 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1345 		goto out_spin;
1346 
1347 	__unix_set_addr_hash(net, sk, addr, new_hash);
1348 	unix_table_double_unlock(net, old_hash, new_hash);
1349 	mutex_unlock(&u->bindlock);
1350 	return 0;
1351 
1352 out_spin:
1353 	unix_table_double_unlock(net, old_hash, new_hash);
1354 	err = -EADDRINUSE;
1355 out_mutex:
1356 	mutex_unlock(&u->bindlock);
1357 out:
1358 	unix_release_addr(addr);
1359 	return err;
1360 }
1361 
1362 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1363 {
1364 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1365 	struct sock *sk = sock->sk;
1366 	int err;
1367 
1368 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1369 	    sunaddr->sun_family == AF_UNIX)
1370 		return unix_autobind(sk);
1371 
1372 	err = unix_validate_addr(sunaddr, addr_len);
1373 	if (err)
1374 		return err;
1375 
1376 	if (sunaddr->sun_path[0])
1377 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1378 	else
1379 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1380 
1381 	return err;
1382 }
1383 
1384 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1385 {
1386 	if (unlikely(sk1 == sk2) || !sk2) {
1387 		unix_state_lock(sk1);
1388 		return;
1389 	}
1390 
1391 	if (sk1 > sk2)
1392 		swap(sk1, sk2);
1393 
1394 	unix_state_lock(sk1);
1395 	unix_state_lock(sk2);
1396 }
1397 
1398 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1399 {
1400 	if (unlikely(sk1 == sk2) || !sk2) {
1401 		unix_state_unlock(sk1);
1402 		return;
1403 	}
1404 	unix_state_unlock(sk1);
1405 	unix_state_unlock(sk2);
1406 }
1407 
1408 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1409 			      int alen, int flags)
1410 {
1411 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1412 	struct sock *sk = sock->sk;
1413 	struct sock *other;
1414 	int err;
1415 
1416 	err = -EINVAL;
1417 	if (alen < offsetofend(struct sockaddr, sa_family))
1418 		goto out;
1419 
1420 	if (addr->sa_family != AF_UNSPEC) {
1421 		err = unix_validate_addr(sunaddr, alen);
1422 		if (err)
1423 			goto out;
1424 
1425 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1426 		if (err)
1427 			goto out;
1428 
1429 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1430 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1431 		    !READ_ONCE(unix_sk(sk)->addr)) {
1432 			err = unix_autobind(sk);
1433 			if (err)
1434 				goto out;
1435 		}
1436 
1437 restart:
1438 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1439 		if (IS_ERR(other)) {
1440 			err = PTR_ERR(other);
1441 			goto out;
1442 		}
1443 
1444 		unix_state_double_lock(sk, other);
1445 
1446 		/* Apparently VFS overslept socket death. Retry. */
1447 		if (sock_flag(other, SOCK_DEAD)) {
1448 			unix_state_double_unlock(sk, other);
1449 			sock_put(other);
1450 			goto restart;
1451 		}
1452 
1453 		err = -EPERM;
1454 		if (!unix_may_send(sk, other))
1455 			goto out_unlock;
1456 
1457 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1458 		if (err)
1459 			goto out_unlock;
1460 
1461 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1462 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1463 	} else {
1464 		/*
1465 		 *	1003.1g breaking connected state with AF_UNSPEC
1466 		 */
1467 		other = NULL;
1468 		unix_state_double_lock(sk, other);
1469 	}
1470 
1471 	/*
1472 	 * If it was connected, reconnect.
1473 	 */
1474 	if (unix_peer(sk)) {
1475 		struct sock *old_peer = unix_peer(sk);
1476 
1477 		unix_peer(sk) = other;
1478 		if (!other)
1479 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1480 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1481 
1482 		unix_state_double_unlock(sk, other);
1483 
1484 		if (other != old_peer) {
1485 			unix_dgram_disconnected(sk, old_peer);
1486 
1487 			unix_state_lock(old_peer);
1488 			if (!unix_peer(old_peer))
1489 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1490 			unix_state_unlock(old_peer);
1491 		}
1492 
1493 		sock_put(old_peer);
1494 	} else {
1495 		unix_peer(sk) = other;
1496 		unix_state_double_unlock(sk, other);
1497 	}
1498 
1499 	return 0;
1500 
1501 out_unlock:
1502 	unix_state_double_unlock(sk, other);
1503 	sock_put(other);
1504 out:
1505 	return err;
1506 }
1507 
1508 static long unix_wait_for_peer(struct sock *other, long timeo)
1509 	__releases(&unix_sk(other)->lock)
1510 {
1511 	struct unix_sock *u = unix_sk(other);
1512 	int sched;
1513 	DEFINE_WAIT(wait);
1514 
1515 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1516 
1517 	sched = !sock_flag(other, SOCK_DEAD) &&
1518 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1519 		unix_recvq_full_lockless(other);
1520 
1521 	unix_state_unlock(other);
1522 
1523 	if (sched)
1524 		timeo = schedule_timeout(timeo);
1525 
1526 	finish_wait(&u->peer_wait, &wait);
1527 	return timeo;
1528 }
1529 
1530 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1531 			       int addr_len, int flags)
1532 {
1533 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1534 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1535 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1536 	struct net *net = sock_net(sk);
1537 	struct sk_buff *skb = NULL;
1538 	unsigned char state;
1539 	long timeo;
1540 	int err;
1541 
1542 	err = unix_validate_addr(sunaddr, addr_len);
1543 	if (err)
1544 		goto out;
1545 
1546 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1547 	if (err)
1548 		goto out;
1549 
1550 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1551 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1552 	    !READ_ONCE(u->addr)) {
1553 		err = unix_autobind(sk);
1554 		if (err)
1555 			goto out;
1556 	}
1557 
1558 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1559 
1560 	/* First of all allocate resources.
1561 	 * If we did it after the state is locked,
1562 	 * we would have to recheck it all again in any case.
1563 	 */
1564 
1565 	/* create new sock for complete connection */
1566 	newsk = unix_create1(net, NULL, 0, sock->type);
1567 	if (IS_ERR(newsk)) {
1568 		err = PTR_ERR(newsk);
1569 		goto out;
1570 	}
1571 
1572 	/* Allocate skb for sending to listening sock */
1573 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1574 	if (!skb) {
1575 		err = -ENOMEM;
1576 		goto out_free_sk;
1577 	}
1578 
1579 restart:
1580 	/*  Find listening sock. */
1581 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1582 	if (IS_ERR(other)) {
1583 		err = PTR_ERR(other);
1584 		goto out_free_skb;
1585 	}
1586 
1587 	unix_state_lock(other);
1588 
1589 	/* Apparently VFS overslept socket death. Retry. */
1590 	if (sock_flag(other, SOCK_DEAD)) {
1591 		unix_state_unlock(other);
1592 		sock_put(other);
1593 		goto restart;
1594 	}
1595 
1596 	if (other->sk_state != TCP_LISTEN ||
1597 	    other->sk_shutdown & RCV_SHUTDOWN) {
1598 		err = -ECONNREFUSED;
1599 		goto out_unlock;
1600 	}
1601 
1602 	if (unix_recvq_full_lockless(other)) {
1603 		if (!timeo) {
1604 			err = -EAGAIN;
1605 			goto out_unlock;
1606 		}
1607 
1608 		timeo = unix_wait_for_peer(other, timeo);
1609 		sock_put(other);
1610 
1611 		err = sock_intr_errno(timeo);
1612 		if (signal_pending(current))
1613 			goto out_free_skb;
1614 
1615 		goto restart;
1616 	}
1617 
1618 	/* self connect and simultaneous connect are eliminated
1619 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1620 	 */
1621 	state = READ_ONCE(sk->sk_state);
1622 	if (unlikely(state != TCP_CLOSE)) {
1623 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1624 		goto out_unlock;
1625 	}
1626 
1627 	unix_state_lock(sk);
1628 
1629 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1630 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1631 		unix_state_unlock(sk);
1632 		goto out_unlock;
1633 	}
1634 
1635 	err = security_unix_stream_connect(sk, other, newsk);
1636 	if (err) {
1637 		unix_state_unlock(sk);
1638 		goto out_unlock;
1639 	}
1640 
1641 	/* The way is open! Quickly set all the necessary fields... */
1642 
1643 	sock_hold(sk);
1644 	unix_peer(newsk)	= sk;
1645 	newsk->sk_state		= TCP_ESTABLISHED;
1646 	newsk->sk_type		= sk->sk_type;
1647 	init_peercred(newsk);
1648 	newu = unix_sk(newsk);
1649 	newu->listener = other;
1650 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1651 	otheru = unix_sk(other);
1652 
1653 	/* copy address information from listening to new sock
1654 	 *
1655 	 * The contents of *(otheru->addr) and otheru->path
1656 	 * are seen fully set up here, since we have found
1657 	 * otheru in hash under its lock.  Insertion into the
1658 	 * hash chain we'd found it in had been done in an
1659 	 * earlier critical area protected by the chain's lock,
1660 	 * the same one where we'd set *(otheru->addr) contents,
1661 	 * as well as otheru->path and otheru->addr itself.
1662 	 *
1663 	 * Using smp_store_release() here to set newu->addr
1664 	 * is enough to make those stores, as well as stores
1665 	 * to newu->path visible to anyone who gets newu->addr
1666 	 * by smp_load_acquire().  IOW, the same guarantees
1667 	 * as for unix_sock instances bound in unix_bind() or
1668 	 * in unix_autobind().
1669 	 */
1670 	if (otheru->path.dentry) {
1671 		path_get(&otheru->path);
1672 		newu->path = otheru->path;
1673 	}
1674 	refcount_inc(&otheru->addr->refcnt);
1675 	smp_store_release(&newu->addr, otheru->addr);
1676 
1677 	/* Set credentials */
1678 	copy_peercred(sk, other);
1679 
1680 	sock->state	= SS_CONNECTED;
1681 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1682 	sock_hold(newsk);
1683 
1684 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1685 	unix_peer(sk)	= newsk;
1686 
1687 	unix_state_unlock(sk);
1688 
1689 	/* take ten and send info to listening sock */
1690 	spin_lock(&other->sk_receive_queue.lock);
1691 	__skb_queue_tail(&other->sk_receive_queue, skb);
1692 	spin_unlock(&other->sk_receive_queue.lock);
1693 	unix_state_unlock(other);
1694 	other->sk_data_ready(other);
1695 	sock_put(other);
1696 	return 0;
1697 
1698 out_unlock:
1699 	unix_state_unlock(other);
1700 	sock_put(other);
1701 out_free_skb:
1702 	kfree_skb(skb);
1703 out_free_sk:
1704 	unix_release_sock(newsk, 0);
1705 out:
1706 	return err;
1707 }
1708 
1709 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1710 {
1711 	struct sock *ska = socka->sk, *skb = sockb->sk;
1712 
1713 	/* Join our sockets back to back */
1714 	sock_hold(ska);
1715 	sock_hold(skb);
1716 	unix_peer(ska) = skb;
1717 	unix_peer(skb) = ska;
1718 	init_peercred(ska);
1719 	init_peercred(skb);
1720 
1721 	ska->sk_state = TCP_ESTABLISHED;
1722 	skb->sk_state = TCP_ESTABLISHED;
1723 	socka->state  = SS_CONNECTED;
1724 	sockb->state  = SS_CONNECTED;
1725 	return 0;
1726 }
1727 
1728 static void unix_sock_inherit_flags(const struct socket *old,
1729 				    struct socket *new)
1730 {
1731 	if (test_bit(SOCK_PASSCRED, &old->flags))
1732 		set_bit(SOCK_PASSCRED, &new->flags);
1733 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1734 		set_bit(SOCK_PASSPIDFD, &new->flags);
1735 	if (test_bit(SOCK_PASSSEC, &old->flags))
1736 		set_bit(SOCK_PASSSEC, &new->flags);
1737 }
1738 
1739 static int unix_accept(struct socket *sock, struct socket *newsock,
1740 		       struct proto_accept_arg *arg)
1741 {
1742 	struct sock *sk = sock->sk;
1743 	struct sk_buff *skb;
1744 	struct sock *tsk;
1745 
1746 	arg->err = -EOPNOTSUPP;
1747 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1748 		goto out;
1749 
1750 	arg->err = -EINVAL;
1751 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1752 		goto out;
1753 
1754 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1755 	 * so that no locks are necessary.
1756 	 */
1757 
1758 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1759 				&arg->err);
1760 	if (!skb) {
1761 		/* This means receive shutdown. */
1762 		if (arg->err == 0)
1763 			arg->err = -EINVAL;
1764 		goto out;
1765 	}
1766 
1767 	tsk = skb->sk;
1768 	skb_free_datagram(sk, skb);
1769 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1770 
1771 	/* attach accepted sock to socket */
1772 	unix_state_lock(tsk);
1773 	unix_update_edges(unix_sk(tsk));
1774 	newsock->state = SS_CONNECTED;
1775 	unix_sock_inherit_flags(sock, newsock);
1776 	sock_graft(tsk, newsock);
1777 	unix_state_unlock(tsk);
1778 	return 0;
1779 
1780 out:
1781 	return arg->err;
1782 }
1783 
1784 
1785 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1786 {
1787 	struct sock *sk = sock->sk;
1788 	struct unix_address *addr;
1789 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1790 	int err = 0;
1791 
1792 	if (peer) {
1793 		sk = unix_peer_get(sk);
1794 
1795 		err = -ENOTCONN;
1796 		if (!sk)
1797 			goto out;
1798 		err = 0;
1799 	} else {
1800 		sock_hold(sk);
1801 	}
1802 
1803 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1804 	if (!addr) {
1805 		sunaddr->sun_family = AF_UNIX;
1806 		sunaddr->sun_path[0] = 0;
1807 		err = offsetof(struct sockaddr_un, sun_path);
1808 	} else {
1809 		err = addr->len;
1810 		memcpy(sunaddr, addr->name, addr->len);
1811 
1812 		if (peer)
1813 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1814 					       CGROUP_UNIX_GETPEERNAME);
1815 		else
1816 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1817 					       CGROUP_UNIX_GETSOCKNAME);
1818 	}
1819 	sock_put(sk);
1820 out:
1821 	return err;
1822 }
1823 
1824 /* The "user->unix_inflight" variable is protected by the garbage
1825  * collection lock, and we just read it locklessly here. If you go
1826  * over the limit, there might be a tiny race in actually noticing
1827  * it across threads. Tough.
1828  */
1829 static inline bool too_many_unix_fds(struct task_struct *p)
1830 {
1831 	struct user_struct *user = current_user();
1832 
1833 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1834 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1835 	return false;
1836 }
1837 
1838 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1839 {
1840 	if (too_many_unix_fds(current))
1841 		return -ETOOMANYREFS;
1842 
1843 	UNIXCB(skb).fp = scm->fp;
1844 	scm->fp = NULL;
1845 
1846 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1847 		return -ENOMEM;
1848 
1849 	return 0;
1850 }
1851 
1852 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1853 {
1854 	scm->fp = UNIXCB(skb).fp;
1855 	UNIXCB(skb).fp = NULL;
1856 
1857 	unix_destroy_fpl(scm->fp);
1858 }
1859 
1860 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1861 {
1862 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1863 }
1864 
1865 static void unix_destruct_scm(struct sk_buff *skb)
1866 {
1867 	struct scm_cookie scm;
1868 
1869 	memset(&scm, 0, sizeof(scm));
1870 	scm.pid  = UNIXCB(skb).pid;
1871 	if (UNIXCB(skb).fp)
1872 		unix_detach_fds(&scm, skb);
1873 
1874 	/* Alas, it calls VFS */
1875 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1876 	scm_destroy(&scm);
1877 	sock_wfree(skb);
1878 }
1879 
1880 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1881 {
1882 	int err = 0;
1883 
1884 	UNIXCB(skb).pid  = get_pid(scm->pid);
1885 	UNIXCB(skb).uid = scm->creds.uid;
1886 	UNIXCB(skb).gid = scm->creds.gid;
1887 	UNIXCB(skb).fp = NULL;
1888 	unix_get_secdata(scm, skb);
1889 	if (scm->fp && send_fds)
1890 		err = unix_attach_fds(scm, skb);
1891 
1892 	skb->destructor = unix_destruct_scm;
1893 	return err;
1894 }
1895 
1896 static bool unix_passcred_enabled(const struct socket *sock,
1897 				  const struct sock *other)
1898 {
1899 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1900 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1901 	       !other->sk_socket ||
1902 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1903 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1904 }
1905 
1906 /*
1907  * Some apps rely on write() giving SCM_CREDENTIALS.
1908  * We include credentials if the source or destination socket
1909  * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1910  */
1911 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1912 			    const struct sock *other)
1913 {
1914 	if (UNIXCB(skb).pid)
1915 		return;
1916 	if (unix_passcred_enabled(sock, other)) {
1917 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1918 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1919 	}
1920 }
1921 
1922 static bool unix_skb_scm_eq(struct sk_buff *skb,
1923 			    struct scm_cookie *scm)
1924 {
1925 	return UNIXCB(skb).pid == scm->pid &&
1926 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1927 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1928 	       unix_secdata_eq(scm, skb);
1929 }
1930 
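/* Keep per-socket statistics of queued fds and maintain the inflight graph
 * used by the garbage collector whenever an skb carrying fds is queued to or
 * removed from a receive queue.
 */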
1931 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1932 {
1933 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1934 	struct unix_sock *u = unix_sk(sk);
1935 
1936 	if (unlikely(fp && fp->count)) {
1937 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1938 		unix_add_edges(fp, u);
1939 	}
1940 }
1941 
1942 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1943 {
1944 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1945 	struct unix_sock *u = unix_sk(sk);
1946 
1947 	if (unlikely(fp && fp->count)) {
1948 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1949 		unix_del_edges(fp);
1950 	}
1951 }
1952 
1953 /*
1954  *	Send AF_UNIX data.
1955  */
1956 
1957 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1958 			      size_t len)
1959 {
1960 	struct sock *sk = sock->sk, *other = NULL;
1961 	struct unix_sock *u = unix_sk(sk);
1962 	struct scm_cookie scm;
1963 	struct sk_buff *skb;
1964 	int data_len = 0;
1965 	int sk_locked;
1966 	long timeo;
1967 	int err;
1968 
1969 	err = scm_send(sock, msg, &scm, false);
1970 	if (err < 0)
1971 		return err;
1972 
1973 	wait_for_unix_gc(scm.fp);
1974 
1975 	if (msg->msg_flags & MSG_OOB) {
1976 		err = -EOPNOTSUPP;
1977 		goto out;
1978 	}
1979 
1980 	if (msg->msg_namelen) {
1981 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
1982 		if (err)
1983 			goto out;
1984 
1985 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1986 							    msg->msg_name,
1987 							    &msg->msg_namelen,
1988 							    NULL);
1989 		if (err)
1990 			goto out;
1991 	}
1992 
1993 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1994 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1995 	    !READ_ONCE(u->addr)) {
1996 		err = unix_autobind(sk);
1997 		if (err)
1998 			goto out;
1999 	}
2000 
2001 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
2002 		err = -EMSGSIZE;
2003 		goto out;
2004 	}
2005 
2006 	if (len > SKB_MAX_ALLOC) {
2007 		data_len = min_t(size_t,
2008 				 len - SKB_MAX_ALLOC,
2009 				 MAX_SKB_FRAGS * PAGE_SIZE);
2010 		data_len = PAGE_ALIGN(data_len);
2011 
2012 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2013 	}
2014 
2015 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2016 				   msg->msg_flags & MSG_DONTWAIT, &err,
2017 				   PAGE_ALLOC_COSTLY_ORDER);
2018 	if (!skb)
2019 		goto out;
2020 
2021 	err = unix_scm_to_skb(&scm, skb, true);
2022 	if (err < 0)
2023 		goto out_free;
2024 
2025 	skb_put(skb, len - data_len);
2026 	skb->data_len = data_len;
2027 	skb->len = len;
2028 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2029 	if (err)
2030 		goto out_free;
2031 
2032 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2033 
2034 	if (msg->msg_namelen) {
2035 lookup:
2036 		other = unix_find_other(sock_net(sk), msg->msg_name,
2037 					msg->msg_namelen, sk->sk_type);
2038 		if (IS_ERR(other)) {
2039 			err = PTR_ERR(other);
2040 			goto out_free;
2041 		}
2042 	} else {
2043 		other = unix_peer_get(sk);
2044 		if (!other) {
2045 			err = -ENOTCONN;
2046 			goto out_free;
2047 		}
2048 	}
2049 
2050 	if (sk_filter(other, skb) < 0) {
2051 		/* Toss the packet but do not return any error to the sender */
2052 		err = len;
2053 		goto out_sock_put;
2054 	}
2055 
2056 restart:
2057 	sk_locked = 0;
2058 	unix_state_lock(other);
2059 restart_locked:
2060 
2061 	if (!unix_may_send(sk, other)) {
2062 		err = -EPERM;
2063 		goto out_unlock;
2064 	}
2065 
2066 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2067 		/* Check with 1003.1g - what should a datagram error return? */
2068 
2069 		unix_state_unlock(other);
2070 
2071 		if (sk->sk_type == SOCK_SEQPACKET) {
2072 			/* We get here only when racing with unix_release_sock(),
2073 			 * which is clearing @other. Unlike SOCK_DGRAM, never
2074 			 * change the state to TCP_CLOSE.
2075 			 */
2076 			err = -EPIPE;
2077 			goto out_sock_put;
2078 		}
2079 
2080 		if (!sk_locked)
2081 			unix_state_lock(sk);
2082 
2083 		if (unix_peer(sk) == other) {
2084 			unix_peer(sk) = NULL;
2085 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2086 
2087 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2088 			unix_state_unlock(sk);
2089 
2090 			unix_dgram_disconnected(sk, other);
2091 			sock_put(other);
2092 			err = -ECONNREFUSED;
2093 			goto out_sock_put;
2094 		}
2095 
2096 		unix_state_unlock(sk);
2097 
2098 		if (!msg->msg_namelen) {
2099 			err = -ECONNRESET;
2100 			goto out_sock_put;
2101 		}
2102 
2103 		goto lookup;
2104 	}
2105 
2106 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2107 		err = -EPIPE;
2108 		goto out_unlock;
2109 	}
2110 
2111 	if (sk->sk_type != SOCK_SEQPACKET) {
2112 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2113 		if (err)
2114 			goto out_unlock;
2115 	}
2116 
2117 	/* other == sk && unix_peer(other) != sk can happen if
2118 	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2119 	 * - unix_peer(sk) == sk at lookup time but disconnected before locking
2120 	 */
2121 	if (other != sk &&
2122 	    unlikely(unix_peer(other) != sk &&
2123 	    unix_recvq_full_lockless(other))) {
2124 		if (timeo) {
2125 			timeo = unix_wait_for_peer(other, timeo);
2126 
2127 			err = sock_intr_errno(timeo);
2128 			if (signal_pending(current))
2129 				goto out_sock_put;
2130 
2131 			goto restart;
2132 		}
2133 
2134 		if (!sk_locked) {
2135 			unix_state_unlock(other);
2136 			unix_state_double_lock(sk, other);
2137 		}
2138 
2139 		if (unix_peer(sk) != other ||
2140 		    unix_dgram_peer_wake_me(sk, other)) {
2141 			err = -EAGAIN;
2142 			sk_locked = 1;
2143 			goto out_unlock;
2144 		}
2145 
2146 		if (!sk_locked) {
2147 			sk_locked = 1;
2148 			goto restart_locked;
2149 		}
2150 	}
2151 
2152 	if (unlikely(sk_locked))
2153 		unix_state_unlock(sk);
2154 
2155 	if (sock_flag(other, SOCK_RCVTSTAMP))
2156 		__net_timestamp(skb);
2157 	maybe_add_creds(skb, sock, other);
2158 	scm_stat_add(other, skb);
2159 	skb_queue_tail(&other->sk_receive_queue, skb);
2160 	unix_state_unlock(other);
2161 	other->sk_data_ready(other);
2162 	sock_put(other);
2163 	scm_destroy(&scm);
2164 	return len;
2165 
2166 out_unlock:
2167 	if (sk_locked)
2168 		unix_state_unlock(sk);
2169 	unix_state_unlock(other);
2170 out_sock_put:
2171 	sock_put(other);
2172 out_free:
2173 	kfree_skb(skb);
2174 out:
2175 	scm_destroy(&scm);
2176 	return err;
2177 }
2178 
2179 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2180  * bytes, with a minimum of a full page.
2181  */
2182 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2183 
2184 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
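/* Queue a single out-of-band byte on the peer's receive queue, remember it in
 * ousk->oob_skb so the reader can find it, and signal SIGURG to the peer.
 */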
2185 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2186 		     struct scm_cookie *scm, bool fds_sent)
2187 {
2188 	struct unix_sock *ousk = unix_sk(other);
2189 	struct sk_buff *skb;
2190 	int err = 0;
2191 
2192 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2193 
2194 	if (!skb)
2195 		return err;
2196 
2197 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2198 	if (err < 0) {
2199 		kfree_skb(skb);
2200 		return err;
2201 	}
2202 	skb_put(skb, 1);
2203 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2204 
2205 	if (err) {
2206 		kfree_skb(skb);
2207 		return err;
2208 	}
2209 
2210 	unix_state_lock(other);
2211 
2212 	if (sock_flag(other, SOCK_DEAD) ||
2213 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2214 		unix_state_unlock(other);
2215 		kfree_skb(skb);
2216 		return -EPIPE;
2217 	}
2218 
2219 	maybe_add_creds(skb, sock, other);
2220 	scm_stat_add(other, skb);
2221 
2222 	spin_lock(&other->sk_receive_queue.lock);
2223 	WRITE_ONCE(ousk->oob_skb, skb);
2224 	__skb_queue_tail(&other->sk_receive_queue, skb);
2225 	spin_unlock(&other->sk_receive_queue.lock);
2226 
2227 	sk_send_sigurg(other);
2228 	unix_state_unlock(other);
2229 	other->sk_data_ready(other);
2230 
2231 	return err;
2232 }
2233 #endif
2234 
2235 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2236 			       size_t len)
2237 {
2238 	struct sock *sk = sock->sk;
2239 	struct sock *other = NULL;
2240 	int err, size;
2241 	struct sk_buff *skb;
2242 	int sent = 0;
2243 	struct scm_cookie scm;
2244 	bool fds_sent = false;
2245 	int data_len;
2246 
2247 	err = scm_send(sock, msg, &scm, false);
2248 	if (err < 0)
2249 		return err;
2250 
2251 	wait_for_unix_gc(scm.fp);
2252 
2253 	if (msg->msg_flags & MSG_OOB) {
2254 		err = -EOPNOTSUPP;
2255 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2256 		if (len)
2257 			len--;
2258 		else
2259 #endif
2260 			goto out_err;
2261 	}
2262 
2263 	if (msg->msg_namelen) {
2264 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2265 		goto out_err;
2266 	} else {
2267 		other = unix_peer(sk);
2268 		if (!other) {
2269 			err = -ENOTCONN;
2270 			goto out_err;
2271 		}
2272 	}
2273 
2274 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) {
2275 		if (!(msg->msg_flags & MSG_NOSIGNAL))
2276 			send_sig(SIGPIPE, current, 0);
2277 
2278 		err = -EPIPE;
2279 		goto out_err;
2280 	}
2281 
2282 	while (sent < len) {
2283 		size = len - sent;
2284 
2285 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2286 			skb = sock_alloc_send_pskb(sk, 0, 0,
2287 						   msg->msg_flags & MSG_DONTWAIT,
2288 						   &err, 0);
2289 		} else {
2290 			/* Keep two messages in the pipe so it schedules better */
2291 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2292 
2293 			/* allow fallback to order-0 allocations */
2294 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2295 
2296 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2297 
2298 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2299 
2300 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2301 						   msg->msg_flags & MSG_DONTWAIT, &err,
2302 						   get_order(UNIX_SKB_FRAGS_SZ));
2303 		}
2304 		if (!skb)
2305 			goto out_err;
2306 
2307 		/* Only send the fds in the first buffer */
2308 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2309 		if (err < 0)
2310 			goto out_free;
2311 
2312 		fds_sent = true;
2313 
2314 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2315 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2316 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2317 						   sk->sk_allocation);
2318 			if (err < 0)
2319 				goto out_free;
2320 
2321 			size = err;
2322 			refcount_add(size, &sk->sk_wmem_alloc);
2323 		} else {
2324 			skb_put(skb, size - data_len);
2325 			skb->data_len = data_len;
2326 			skb->len = size;
2327 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2328 			if (err)
2329 				goto out_free;
2330 		}
2331 
2332 		unix_state_lock(other);
2333 
2334 		if (sock_flag(other, SOCK_DEAD) ||
2335 		    (other->sk_shutdown & RCV_SHUTDOWN))
2336 			goto out_pipe;
2337 
2338 		maybe_add_creds(skb, sock, other);
2339 		scm_stat_add(other, skb);
2340 		skb_queue_tail(&other->sk_receive_queue, skb);
2341 		unix_state_unlock(other);
2342 		other->sk_data_ready(other);
2343 		sent += size;
2344 	}
2345 
2346 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2347 	if (msg->msg_flags & MSG_OOB) {
2348 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2349 		if (err)
2350 			goto out_err;
2351 		sent++;
2352 	}
2353 #endif
2354 
2355 	scm_destroy(&scm);
2356 
2357 	return sent;
2358 
2359 out_pipe:
2360 	unix_state_unlock(other);
2361 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2362 		send_sig(SIGPIPE, current, 0);
2363 	err = -EPIPE;
2364 out_free:
2365 	kfree_skb(skb);
2366 out_err:
2367 	scm_destroy(&scm);
2368 	return sent ? : err;
2369 }
2370 
2371 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2372 				  size_t len)
2373 {
2374 	int err;
2375 	struct sock *sk = sock->sk;
2376 
2377 	err = sock_error(sk);
2378 	if (err)
2379 		return err;
2380 
2381 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2382 		return -ENOTCONN;
2383 
2384 	if (msg->msg_namelen)
2385 		msg->msg_namelen = 0;
2386 
2387 	return unix_dgram_sendmsg(sock, msg, len);
2388 }
2389 
2390 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2391 				  size_t size, int flags)
2392 {
2393 	struct sock *sk = sock->sk;
2394 
2395 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2396 		return -ENOTCONN;
2397 
2398 	return unix_dgram_recvmsg(sock, msg, size, flags);
2399 }
2400 
2401 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2402 {
2403 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2404 
2405 	if (addr) {
2406 		msg->msg_namelen = addr->len;
2407 		memcpy(msg->msg_name, addr->name, addr->len);
2408 	}
2409 }
2410 
2411 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2412 			 int flags)
2413 {
2414 	struct scm_cookie scm;
2415 	struct socket *sock = sk->sk_socket;
2416 	struct unix_sock *u = unix_sk(sk);
2417 	struct sk_buff *skb, *last;
2418 	long timeo;
2419 	int skip;
2420 	int err;
2421 
2422 	err = -EOPNOTSUPP;
2423 	if (flags&MSG_OOB)
2424 		goto out;
2425 
2426 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2427 
2428 	do {
2429 		mutex_lock(&u->iolock);
2430 
2431 		skip = sk_peek_offset(sk, flags);
2432 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2433 					      &skip, &err, &last);
2434 		if (skb) {
2435 			if (!(flags & MSG_PEEK))
2436 				scm_stat_del(sk, skb);
2437 			break;
2438 		}
2439 
2440 		mutex_unlock(&u->iolock);
2441 
2442 		if (err != -EAGAIN)
2443 			break;
2444 	} while (timeo &&
2445 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2446 					      &err, &timeo, last));
2447 
2448 	if (!skb) { /* implies iolock unlocked */
2449 		unix_state_lock(sk);
2450 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2451 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2452 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2453 			err = 0;
2454 		unix_state_unlock(sk);
2455 		goto out;
2456 	}
2457 
2458 	if (wq_has_sleeper(&u->peer_wait))
2459 		wake_up_interruptible_sync_poll(&u->peer_wait,
2460 						EPOLLOUT | EPOLLWRNORM |
2461 						EPOLLWRBAND);
2462 
2463 	if (msg->msg_name) {
2464 		unix_copy_addr(msg, skb->sk);
2465 
2466 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2467 						      msg->msg_name,
2468 						      &msg->msg_namelen);
2469 	}
2470 
2471 	if (size > skb->len - skip)
2472 		size = skb->len - skip;
2473 	else if (size < skb->len - skip)
2474 		msg->msg_flags |= MSG_TRUNC;
2475 
2476 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2477 	if (err)
2478 		goto out_free;
2479 
2480 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2481 		__sock_recv_timestamp(msg, sk, skb);
2482 
2483 	memset(&scm, 0, sizeof(scm));
2484 
2485 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2486 	unix_set_secdata(&scm, skb);
2487 
2488 	if (!(flags & MSG_PEEK)) {
2489 		if (UNIXCB(skb).fp)
2490 			unix_detach_fds(&scm, skb);
2491 
2492 		sk_peek_offset_bwd(sk, skb->len);
2493 	} else {
2494 		/* It is questionable: on PEEK we could:
2495 		   - not return fds - good, but too simple 8)
2496 		   - return fds, and not return them again on read (old strategy,
2497 		     apparently wrong)
2498 		   - clone fds (I chose this for now, as it is the most universal
2499 		     solution)
2500 
2501 		   POSIX 1003.1g does not actually define this clearly
2502 		   at all - but then, POSIX 1003.1g doesn't define a lot of
2503 		   things clearly!
2504 
2505 		*/
2506 
2507 		sk_peek_offset_fwd(sk, size);
2508 
2509 		if (UNIXCB(skb).fp)
2510 			unix_peek_fds(&scm, skb);
2511 	}
2512 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2513 
2514 	scm_recv_unix(sock, msg, &scm, flags);
2515 
2516 out_free:
2517 	skb_free_datagram(sk, skb);
2518 	mutex_unlock(&u->iolock);
2519 out:
2520 	return err;
2521 }
2522 
2523 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2524 			      int flags)
2525 {
2526 	struct sock *sk = sock->sk;
2527 
2528 #ifdef CONFIG_BPF_SYSCALL
2529 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2530 
2531 	if (prot != &unix_dgram_proto)
2532 		return prot->recvmsg(sk, msg, size, flags, NULL);
2533 #endif
2534 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2535 }
2536 
2537 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2538 {
2539 	struct unix_sock *u = unix_sk(sk);
2540 	struct sk_buff *skb;
2541 	int err;
2542 
2543 	mutex_lock(&u->iolock);
2544 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2545 	mutex_unlock(&u->iolock);
2546 	if (!skb)
2547 		return err;
2548 
2549 	return recv_actor(sk, skb);
2550 }
2551 
2552 /*
2553  *	Sleep until more data has arrived. But check for races.
2554  */
2555 static long unix_stream_data_wait(struct sock *sk, long timeo,
2556 				  struct sk_buff *last, unsigned int last_len,
2557 				  bool freezable)
2558 {
2559 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2560 	struct sk_buff *tail;
2561 	DEFINE_WAIT(wait);
2562 
2563 	unix_state_lock(sk);
2564 
2565 	for (;;) {
2566 		prepare_to_wait(sk_sleep(sk), &wait, state);
2567 
2568 		tail = skb_peek_tail(&sk->sk_receive_queue);
2569 		if (tail != last ||
2570 		    (tail && tail->len != last_len) ||
2571 		    sk->sk_err ||
2572 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2573 		    signal_pending(current) ||
2574 		    !timeo)
2575 			break;
2576 
2577 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2578 		unix_state_unlock(sk);
2579 		timeo = schedule_timeout(timeo);
2580 		unix_state_lock(sk);
2581 
2582 		if (sock_flag(sk, SOCK_DEAD))
2583 			break;
2584 
2585 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2586 	}
2587 
2588 	finish_wait(sk_sleep(sk), &wait);
2589 	unix_state_unlock(sk);
2590 	return timeo;
2591 }
2592 
2593 static unsigned int unix_skb_len(const struct sk_buff *skb)
2594 {
2595 	return skb->len - UNIXCB(skb).consumed;
2596 }
2597 
2598 struct unix_stream_read_state {
2599 	int (*recv_actor)(struct sk_buff *, int, int,
2600 			  struct unix_stream_read_state *);
2601 	struct socket *socket;
2602 	struct msghdr *msg;
2603 	struct pipe_inode_info *pipe;
2604 	size_t size;
2605 	int flags;
2606 	unsigned int splice_flags;
2607 };
2608 
2609 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2610 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2611 {
2612 	struct socket *sock = state->socket;
2613 	struct sock *sk = sock->sk;
2614 	struct unix_sock *u = unix_sk(sk);
2615 	int chunk = 1;
2616 	struct sk_buff *oob_skb;
2617 
2618 	mutex_lock(&u->iolock);
2619 	unix_state_lock(sk);
2620 	spin_lock(&sk->sk_receive_queue.lock);
2621 
2622 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2623 		spin_unlock(&sk->sk_receive_queue.lock);
2624 		unix_state_unlock(sk);
2625 		mutex_unlock(&u->iolock);
2626 		return -EINVAL;
2627 	}
2628 
2629 	oob_skb = u->oob_skb;
2630 
2631 	if (!(state->flags & MSG_PEEK))
2632 		WRITE_ONCE(u->oob_skb, NULL);
2633 
2634 	spin_unlock(&sk->sk_receive_queue.lock);
2635 	unix_state_unlock(sk);
2636 
2637 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2638 
2639 	if (!(state->flags & MSG_PEEK))
2640 		UNIXCB(oob_skb).consumed += 1;
2641 
2642 	mutex_unlock(&u->iolock);
2643 
2644 	if (chunk < 0)
2645 		return -EFAULT;
2646 
2647 	state->msg->msg_flags |= MSG_OOB;
2648 	return 1;
2649 }
2650 
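/* Decide how the next skb relates to the pending out-of-band byte: depending
 * on MSG_PEEK, SO_OOBINLINE and whether data has already been copied, return
 * the skb unchanged, skip past the OOB skb, or consume/drop it.
 */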
2651 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2652 				  int flags, int copied)
2653 {
2654 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2655 	struct unix_sock *u = unix_sk(sk);
2656 
2657 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2658 		return skb;
2659 
2660 	spin_lock(&sk->sk_receive_queue.lock);
2661 
2662 	if (!unix_skb_len(skb)) {
2663 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2664 			skb = NULL;
2665 		} else if (flags & MSG_PEEK) {
2666 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2667 		} else {
2668 			read_skb = skb;
2669 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2670 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2671 		}
2672 
2673 		if (!skb)
2674 			goto unlock;
2675 	}
2676 
2677 	if (skb != u->oob_skb)
2678 		goto unlock;
2679 
2680 	if (copied) {
2681 		skb = NULL;
2682 	} else if (!(flags & MSG_PEEK)) {
2683 		WRITE_ONCE(u->oob_skb, NULL);
2684 
2685 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2686 			__skb_unlink(skb, &sk->sk_receive_queue);
2687 			unread_skb = skb;
2688 			skb = skb_peek(&sk->sk_receive_queue);
2689 		}
2690 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2691 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2692 	}
2693 
2694 unlock:
2695 	spin_unlock(&sk->sk_receive_queue.lock);
2696 
2697 	consume_skb(read_skb);
2698 	kfree_skb(unread_skb);
2699 
2700 	return skb;
2701 }
2702 #endif
2703 
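/* Non-blocking single-skb read for the ->read_skb() interface (used e.g. by
 * BPF sockmap).  If the skb turns out to be the pending out-of-band skb,
 * detach and drop it and report -EAGAIN instead.
 */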
2704 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2705 {
2706 	struct unix_sock *u = unix_sk(sk);
2707 	struct sk_buff *skb;
2708 	int err;
2709 
2710 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2711 		return -ENOTCONN;
2712 
2713 	mutex_lock(&u->iolock);
2714 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2715 	mutex_unlock(&u->iolock);
2716 	if (!skb)
2717 		return err;
2718 
2719 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2720 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2721 		bool drop = false;
2722 
2723 		unix_state_lock(sk);
2724 
2725 		if (sock_flag(sk, SOCK_DEAD)) {
2726 			unix_state_unlock(sk);
2727 			kfree_skb(skb);
2728 			return -ECONNRESET;
2729 		}
2730 
2731 		spin_lock(&sk->sk_receive_queue.lock);
2732 		if (likely(skb == u->oob_skb)) {
2733 			WRITE_ONCE(u->oob_skb, NULL);
2734 			drop = true;
2735 		}
2736 		spin_unlock(&sk->sk_receive_queue.lock);
2737 
2738 		unix_state_unlock(sk);
2739 
2740 		if (drop) {
2741 			kfree_skb(skb);
2742 			return -EAGAIN;
2743 		}
2744 	}
2745 #endif
2746 
2747 	return recv_actor(sk, skb);
2748 }
2749 
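/* Common receive path for stream and seqpacket sockets.  The caller describes
 * what to do with each chunk (copy to a msghdr, splice to a pipe, ...) via
 * state->recv_actor; recvmsg(), splice_read() and the BPF variant all funnel
 * through here.
 */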
2750 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2751 				    bool freezable)
2752 {
2753 	struct scm_cookie scm;
2754 	struct socket *sock = state->socket;
2755 	struct sock *sk = sock->sk;
2756 	struct unix_sock *u = unix_sk(sk);
2757 	int copied = 0;
2758 	int flags = state->flags;
2759 	int noblock = flags & MSG_DONTWAIT;
2760 	bool check_creds = false;
2761 	int target;
2762 	int err = 0;
2763 	long timeo;
2764 	int skip;
2765 	size_t size = state->size;
2766 	unsigned int last_len;
2767 
2768 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2769 		err = -EINVAL;
2770 		goto out;
2771 	}
2772 
2773 	if (unlikely(flags & MSG_OOB)) {
2774 		err = -EOPNOTSUPP;
2775 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2776 		err = unix_stream_recv_urg(state);
2777 #endif
2778 		goto out;
2779 	}
2780 
2781 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2782 	timeo = sock_rcvtimeo(sk, noblock);
2783 
2784 	memset(&scm, 0, sizeof(scm));
2785 
2786 	/* Lock the socket to prevent the queue from being reordered
2787 	 * while we sleep in memcpy_to_msg().
2788 	 */
2789 	mutex_lock(&u->iolock);
2790 
2791 	skip = max(sk_peek_offset(sk, flags), 0);
2792 
2793 	do {
2794 		struct sk_buff *skb, *last;
2795 		int chunk;
2796 
2797 redo:
2798 		unix_state_lock(sk);
2799 		if (sock_flag(sk, SOCK_DEAD)) {
2800 			err = -ECONNRESET;
2801 			goto unlock;
2802 		}
2803 		last = skb = skb_peek(&sk->sk_receive_queue);
2804 		last_len = last ? last->len : 0;
2805 
2806 again:
2807 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2808 		if (skb) {
2809 			skb = manage_oob(skb, sk, flags, copied);
2810 			if (!skb && copied) {
2811 				unix_state_unlock(sk);
2812 				break;
2813 			}
2814 		}
2815 #endif
2816 		if (skb == NULL) {
2817 			if (copied >= target)
2818 				goto unlock;
2819 
2820 			/*
2821 			 *	POSIX 1003.1g mandates this order.
2822 			 */
2823 
2824 			err = sock_error(sk);
2825 			if (err)
2826 				goto unlock;
2827 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2828 				goto unlock;
2829 
2830 			unix_state_unlock(sk);
2831 			if (!timeo) {
2832 				err = -EAGAIN;
2833 				break;
2834 			}
2835 
2836 			mutex_unlock(&u->iolock);
2837 
2838 			timeo = unix_stream_data_wait(sk, timeo, last,
2839 						      last_len, freezable);
2840 
2841 			if (signal_pending(current)) {
2842 				err = sock_intr_errno(timeo);
2843 				scm_destroy(&scm);
2844 				goto out;
2845 			}
2846 
2847 			mutex_lock(&u->iolock);
2848 			goto redo;
2849 unlock:
2850 			unix_state_unlock(sk);
2851 			break;
2852 		}
2853 
2854 		while (skip >= unix_skb_len(skb)) {
2855 			skip -= unix_skb_len(skb);
2856 			last = skb;
2857 			last_len = skb->len;
2858 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2859 			if (!skb)
2860 				goto again;
2861 		}
2862 
2863 		unix_state_unlock(sk);
2864 
2865 		if (check_creds) {
2866 			/* Never glue messages from different writers */
2867 			if (!unix_skb_scm_eq(skb, &scm))
2868 				break;
2869 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2870 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2871 			/* Copy credentials */
2872 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2873 			unix_set_secdata(&scm, skb);
2874 			check_creds = true;
2875 		}
2876 
2877 		/* Copy address just once */
2878 		if (state->msg && state->msg->msg_name) {
2879 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2880 					 state->msg->msg_name);
2881 			unix_copy_addr(state->msg, skb->sk);
2882 
2883 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2884 							      state->msg->msg_name,
2885 							      &state->msg->msg_namelen);
2886 
2887 			sunaddr = NULL;
2888 		}
2889 
2890 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2891 		chunk = state->recv_actor(skb, skip, chunk, state);
2892 		if (chunk < 0) {
2893 			if (copied == 0)
2894 				copied = -EFAULT;
2895 			break;
2896 		}
2897 		copied += chunk;
2898 		size -= chunk;
2899 
2900 		/* Mark read part of skb as used */
2901 		if (!(flags & MSG_PEEK)) {
2902 			UNIXCB(skb).consumed += chunk;
2903 
2904 			sk_peek_offset_bwd(sk, chunk);
2905 
2906 			if (UNIXCB(skb).fp) {
2907 				scm_stat_del(sk, skb);
2908 				unix_detach_fds(&scm, skb);
2909 			}
2910 
2911 			if (unix_skb_len(skb))
2912 				break;
2913 
2914 			skb_unlink(skb, &sk->sk_receive_queue);
2915 			consume_skb(skb);
2916 
2917 			if (scm.fp)
2918 				break;
2919 		} else {
2920 			/* It is questionable, see note in unix_dgram_recvmsg.
2921 			 */
2922 			if (UNIXCB(skb).fp)
2923 				unix_peek_fds(&scm, skb);
2924 
2925 			sk_peek_offset_fwd(sk, chunk);
2926 
2927 			if (UNIXCB(skb).fp)
2928 				break;
2929 
2930 			skip = 0;
2931 			last = skb;
2932 			last_len = skb->len;
2933 			unix_state_lock(sk);
2934 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2935 			if (skb)
2936 				goto again;
2937 			unix_state_unlock(sk);
2938 			break;
2939 		}
2940 	} while (size);
2941 
2942 	mutex_unlock(&u->iolock);
2943 	if (state->msg)
2944 		scm_recv_unix(sock, state->msg, &scm, flags);
2945 	else
2946 		scm_destroy(&scm);
2947 out:
2948 	return copied ? : err;
2949 }
2950 
2951 static int unix_stream_read_actor(struct sk_buff *skb,
2952 				  int skip, int chunk,
2953 				  struct unix_stream_read_state *state)
2954 {
2955 	int ret;
2956 
2957 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2958 				    state->msg, chunk);
2959 	return ret ?: chunk;
2960 }
2961 
2962 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2963 			  size_t size, int flags)
2964 {
2965 	struct unix_stream_read_state state = {
2966 		.recv_actor = unix_stream_read_actor,
2967 		.socket = sk->sk_socket,
2968 		.msg = msg,
2969 		.size = size,
2970 		.flags = flags
2971 	};
2972 
2973 	return unix_stream_read_generic(&state, true);
2974 }
2975 
2976 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2977 			       size_t size, int flags)
2978 {
2979 	struct unix_stream_read_state state = {
2980 		.recv_actor = unix_stream_read_actor,
2981 		.socket = sock,
2982 		.msg = msg,
2983 		.size = size,
2984 		.flags = flags
2985 	};
2986 
2987 #ifdef CONFIG_BPF_SYSCALL
2988 	struct sock *sk = sock->sk;
2989 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2990 
2991 	if (prot != &unix_stream_proto)
2992 		return prot->recvmsg(sk, msg, size, flags, NULL);
2993 #endif
2994 	return unix_stream_read_generic(&state, true);
2995 }
2996 
2997 static int unix_stream_splice_actor(struct sk_buff *skb,
2998 				    int skip, int chunk,
2999 				    struct unix_stream_read_state *state)
3000 {
3001 	return skb_splice_bits(skb, state->socket->sk,
3002 			       UNIXCB(skb).consumed + skip,
3003 			       state->pipe, chunk, state->splice_flags);
3004 }
3005 
3006 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3007 				       struct pipe_inode_info *pipe,
3008 				       size_t size, unsigned int flags)
3009 {
3010 	struct unix_stream_read_state state = {
3011 		.recv_actor = unix_stream_splice_actor,
3012 		.socket = sock,
3013 		.pipe = pipe,
3014 		.size = size,
3015 		.splice_flags = flags,
3016 	};
3017 
3018 	if (unlikely(*ppos))
3019 		return -ESPIPE;
3020 
3021 	if (sock->file->f_flags & O_NONBLOCK ||
3022 	    flags & SPLICE_F_NONBLOCK)
3023 		state.flags = MSG_DONTWAIT;
3024 
3025 	return unix_stream_read_generic(&state, false);
3026 }
3027 
3028 static int unix_shutdown(struct socket *sock, int mode)
3029 {
3030 	struct sock *sk = sock->sk;
3031 	struct sock *other;
3032 
3033 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3034 		return -EINVAL;
3035 	/* This maps:
3036 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3037 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3038 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3039 	 */
3040 	++mode;
3041 
3042 	unix_state_lock(sk);
3043 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3044 	other = unix_peer(sk);
3045 	if (other)
3046 		sock_hold(other);
3047 	unix_state_unlock(sk);
3048 	sk->sk_state_change(sk);
3049 
3050 	if (other &&
3051 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3052 
3053 		int peer_mode = 0;
3054 		const struct proto *prot = READ_ONCE(other->sk_prot);
3055 
3056 		if (prot->unhash)
3057 			prot->unhash(other);
3058 		if (mode&RCV_SHUTDOWN)
3059 			peer_mode |= SEND_SHUTDOWN;
3060 		if (mode&SEND_SHUTDOWN)
3061 			peer_mode |= RCV_SHUTDOWN;
3062 		unix_state_lock(other);
3063 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3064 		unix_state_unlock(other);
3065 		other->sk_state_change(other);
3066 		if (peer_mode == SHUTDOWN_MASK)
3067 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3068 		else if (peer_mode & RCV_SHUTDOWN)
3069 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3070 	}
3071 	if (other)
3072 		sock_put(other);
3073 
3074 	return 0;
3075 }
3076 
3077 long unix_inq_len(struct sock *sk)
3078 {
3079 	struct sk_buff *skb;
3080 	long amount = 0;
3081 
3082 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3083 		return -EINVAL;
3084 
3085 	spin_lock(&sk->sk_receive_queue.lock);
3086 	if (sk->sk_type == SOCK_STREAM ||
3087 	    sk->sk_type == SOCK_SEQPACKET) {
3088 		skb_queue_walk(&sk->sk_receive_queue, skb)
3089 			amount += unix_skb_len(skb);
3090 	} else {
3091 		skb = skb_peek(&sk->sk_receive_queue);
3092 		if (skb)
3093 			amount = skb->len;
3094 	}
3095 	spin_unlock(&sk->sk_receive_queue.lock);
3096 
3097 	return amount;
3098 }
3099 EXPORT_SYMBOL_GPL(unix_inq_len);
3100 
3101 long unix_outq_len(struct sock *sk)
3102 {
3103 	return sk_wmem_alloc_get(sk);
3104 }
3105 EXPORT_SYMBOL_GPL(unix_outq_len);
3106 
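/* SIOCUNIXFILE: open the inode the socket is bound to as an O_PATH file and
 * install it as a new descriptor.  Restricted to CAP_NET_ADMIN.
 */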
3107 static int unix_open_file(struct sock *sk)
3108 {
3109 	struct path path;
3110 	struct file *f;
3111 	int fd;
3112 
3113 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3114 		return -EPERM;
3115 
3116 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3117 		return -ENOENT;
3118 
3119 	path = unix_sk(sk)->path;
3120 	if (!path.dentry)
3121 		return -ENOENT;
3122 
3123 	path_get(&path);
3124 
3125 	fd = get_unused_fd_flags(O_CLOEXEC);
3126 	if (fd < 0)
3127 		goto out;
3128 
3129 	f = dentry_open(&path, O_PATH, current_cred());
3130 	if (IS_ERR(f)) {
3131 		put_unused_fd(fd);
3132 		fd = PTR_ERR(f);
3133 		goto out;
3134 	}
3135 
3136 	fd_install(fd, f);
3137 out:
3138 	path_put(&path);
3139 
3140 	return fd;
3141 }
3142 
3143 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3144 {
3145 	struct sock *sk = sock->sk;
3146 	long amount = 0;
3147 	int err;
3148 
3149 	switch (cmd) {
3150 	case SIOCOUTQ:
3151 		amount = unix_outq_len(sk);
3152 		err = put_user(amount, (int __user *)arg);
3153 		break;
3154 	case SIOCINQ:
3155 		amount = unix_inq_len(sk);
3156 		if (amount < 0)
3157 			err = amount;
3158 		else
3159 			err = put_user(amount, (int __user *)arg);
3160 		break;
3161 	case SIOCUNIXFILE:
3162 		err = unix_open_file(sk);
3163 		break;
3164 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3165 	case SIOCATMARK:
3166 		{
3167 			struct unix_sock *u = unix_sk(sk);
3168 			struct sk_buff *skb;
3169 			int answ = 0;
3170 
3171 			mutex_lock(&u->iolock);
3172 
3173 			skb = skb_peek(&sk->sk_receive_queue);
3174 			if (skb) {
3175 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3176 				struct sk_buff *next_skb;
3177 
3178 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3179 
3180 				if (skb == oob_skb ||
3181 				    (!unix_skb_len(skb) &&
3182 				     (!oob_skb || next_skb == oob_skb)))
3183 					answ = 1;
3184 			}
3185 
3186 			mutex_unlock(&u->iolock);
3187 
3188 			err = put_user(answ, (int __user *)arg);
3189 		}
3190 		break;
3191 #endif
3192 	default:
3193 		err = -ENOIOCTLCMD;
3194 		break;
3195 	}
3196 	return err;
3197 }
3198 
3199 #ifdef CONFIG_COMPAT
3200 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3201 {
3202 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3203 }
3204 #endif
3205 
3206 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3207 {
3208 	struct sock *sk = sock->sk;
3209 	unsigned char state;
3210 	__poll_t mask;
3211 	u8 shutdown;
3212 
3213 	sock_poll_wait(file, sock, wait);
3214 	mask = 0;
3215 	shutdown = READ_ONCE(sk->sk_shutdown);
3216 	state = READ_ONCE(sk->sk_state);
3217 
3218 	/* exceptional events? */
3219 	if (READ_ONCE(sk->sk_err))
3220 		mask |= EPOLLERR;
3221 	if (shutdown == SHUTDOWN_MASK)
3222 		mask |= EPOLLHUP;
3223 	if (shutdown & RCV_SHUTDOWN)
3224 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3225 
3226 	/* readable? */
3227 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3228 		mask |= EPOLLIN | EPOLLRDNORM;
3229 	if (sk_is_readable(sk))
3230 		mask |= EPOLLIN | EPOLLRDNORM;
3231 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3232 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3233 		mask |= EPOLLPRI;
3234 #endif
3235 
3236 	/* Connection-based need to check for termination and startup */
3237 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3238 	    state == TCP_CLOSE)
3239 		mask |= EPOLLHUP;
3240 
3241 	/*
3242 	 * We also set writable when the other side has shut down the
3243 	 * connection. This prevents stuck sockets.
3244 	 */
3245 	if (unix_writable(sk, state))
3246 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3247 
3248 	return mask;
3249 }
3250 
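/* Datagram/seqpacket poll.  Unlike unix_poll(), writability also depends on
 * whether the peer's receive queue is full; if it is, register on the peer's
 * wait queue so we are woken up once space becomes available.
 */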
3251 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3252 				    poll_table *wait)
3253 {
3254 	struct sock *sk = sock->sk, *other;
3255 	unsigned int writable;
3256 	unsigned char state;
3257 	__poll_t mask;
3258 	u8 shutdown;
3259 
3260 	sock_poll_wait(file, sock, wait);
3261 	mask = 0;
3262 	shutdown = READ_ONCE(sk->sk_shutdown);
3263 	state = READ_ONCE(sk->sk_state);
3264 
3265 	/* exceptional events? */
3266 	if (READ_ONCE(sk->sk_err) ||
3267 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3268 		mask |= EPOLLERR |
3269 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3270 
3271 	if (shutdown & RCV_SHUTDOWN)
3272 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3273 	if (shutdown == SHUTDOWN_MASK)
3274 		mask |= EPOLLHUP;
3275 
3276 	/* readable? */
3277 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3278 		mask |= EPOLLIN | EPOLLRDNORM;
3279 	if (sk_is_readable(sk))
3280 		mask |= EPOLLIN | EPOLLRDNORM;
3281 
3282 	/* Connection-based need to check for termination and startup */
3283 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3284 		mask |= EPOLLHUP;
3285 
3286 	/* No write status requested, avoid expensive OUT tests. */
3287 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3288 		return mask;
3289 
3290 	writable = unix_writable(sk, state);
3291 	if (writable) {
3292 		unix_state_lock(sk);
3293 
3294 		other = unix_peer(sk);
3295 		if (other && unix_peer(other) != sk &&
3296 		    unix_recvq_full_lockless(other) &&
3297 		    unix_dgram_peer_wake_me(sk, other))
3298 			writable = 0;
3299 
3300 		unix_state_unlock(sk);
3301 	}
3302 
3303 	if (writable)
3304 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3305 	else
3306 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3307 
3308 	return mask;
3309 }
3310 
3311 #ifdef CONFIG_PROC_FS
3312 
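/* The seq_file position encodes both the hash bucket and a 1-based offset
 * within that bucket; BUCKET_SPACE is how many bits remain for the offset.
 */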
3313 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3314 
3315 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3316 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3317 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3318 
3319 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3320 {
3321 	unsigned long offset = get_offset(*pos);
3322 	unsigned long bucket = get_bucket(*pos);
3323 	unsigned long count = 0;
3324 	struct sock *sk;
3325 
3326 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3327 	     sk; sk = sk_next(sk)) {
3328 		if (++count == offset)
3329 			break;
3330 	}
3331 
3332 	return sk;
3333 }
3334 
3335 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3336 {
3337 	unsigned long bucket = get_bucket(*pos);
3338 	struct net *net = seq_file_net(seq);
3339 	struct sock *sk;
3340 
3341 	while (bucket < UNIX_HASH_SIZE) {
3342 		spin_lock(&net->unx.table.locks[bucket]);
3343 
3344 		sk = unix_from_bucket(seq, pos);
3345 		if (sk)
3346 			return sk;
3347 
3348 		spin_unlock(&net->unx.table.locks[bucket]);
3349 
3350 		*pos = set_bucket_offset(++bucket, 1);
3351 	}
3352 
3353 	return NULL;
3354 }
3355 
3356 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3357 				  loff_t *pos)
3358 {
3359 	unsigned long bucket = get_bucket(*pos);
3360 
3361 	sk = sk_next(sk);
3362 	if (sk)
3363 		return sk;
3364 
3365 
3366 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3367 
3368 	*pos = set_bucket_offset(++bucket, 1);
3369 
3370 	return unix_get_first(seq, pos);
3371 }
3372 
3373 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3374 {
3375 	if (!*pos)
3376 		return SEQ_START_TOKEN;
3377 
3378 	return unix_get_first(seq, pos);
3379 }
3380 
3381 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3382 {
3383 	++*pos;
3384 
3385 	if (v == SEQ_START_TOKEN)
3386 		return unix_get_first(seq, pos);
3387 
3388 	return unix_get_next(seq, v, pos);
3389 }
3390 
3391 static void unix_seq_stop(struct seq_file *seq, void *v)
3392 {
3393 	struct sock *sk = v;
3394 
3395 	if (sk)
3396 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3397 }
3398 
3399 static int unix_seq_show(struct seq_file *seq, void *v)
3400 {
3401 
3402 	if (v == SEQ_START_TOKEN)
3403 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3404 			 "Inode Path\n");
3405 	else {
3406 		struct sock *s = v;
3407 		struct unix_sock *u = unix_sk(s);
3408 		unix_state_lock(s);
3409 
3410 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3411 			s,
3412 			refcount_read(&s->sk_refcnt),
3413 			0,
3414 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3415 			s->sk_type,
3416 			s->sk_socket ?
3417 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3418 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3419 			sock_i_ino(s));
3420 
3421 		if (u->addr) {	/* under a hash table lock here */
3422 			int i, len;
3423 			seq_putc(seq, ' ');
3424 
3425 			i = 0;
3426 			len = u->addr->len -
3427 				offsetof(struct sockaddr_un, sun_path);
3428 			if (u->addr->name->sun_path[0]) {
3429 				len--;
3430 			} else {
3431 				seq_putc(seq, '@');
3432 				i++;
3433 			}
3434 			for ( ; i < len; i++)
3435 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3436 					 '@');
3437 		}
3438 		unix_state_unlock(s);
3439 		seq_putc(seq, '\n');
3440 	}
3441 
3442 	return 0;
3443 }
3444 
3445 static const struct seq_operations unix_seq_ops = {
3446 	.start  = unix_seq_start,
3447 	.next   = unix_seq_next,
3448 	.stop   = unix_seq_stop,
3449 	.show   = unix_seq_show,
3450 };
3451 
3452 #ifdef CONFIG_BPF_SYSCALL
3453 struct bpf_unix_iter_state {
3454 	struct seq_net_private p;
3455 	unsigned int cur_sk;
3456 	unsigned int end_sk;
3457 	unsigned int max_sk;
3458 	struct sock **batch;
3459 	bool st_bucket_done;
3460 };
3461 
3462 struct bpf_iter__unix {
3463 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3464 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3465 	uid_t uid __aligned(8);
3466 };
3467 
3468 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3469 			      struct unix_sock *unix_sk, uid_t uid)
3470 {
3471 	struct bpf_iter__unix ctx;
3472 
3473 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3474 	ctx.meta = meta;
3475 	ctx.unix_sk = unix_sk;
3476 	ctx.uid = uid;
3477 	return bpf_iter_run_prog(prog, &ctx);
3478 }
3479 
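/* Take a reference on every remaining socket in the current hash bucket so
 * the BPF program can run without the bucket lock held.  Returns how many
 * sockets the bucket actually contained, so the caller can grow the batch
 * and retry if not all of them fit.
 */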
3480 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3481 
3482 {
3483 	struct bpf_unix_iter_state *iter = seq->private;
3484 	unsigned int expected = 1;
3485 	struct sock *sk;
3486 
3487 	sock_hold(start_sk);
3488 	iter->batch[iter->end_sk++] = start_sk;
3489 
3490 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3491 		if (iter->end_sk < iter->max_sk) {
3492 			sock_hold(sk);
3493 			iter->batch[iter->end_sk++] = sk;
3494 		}
3495 
3496 		expected++;
3497 	}
3498 
3499 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3500 
3501 	return expected;
3502 }
3503 
3504 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3505 {
3506 	while (iter->cur_sk < iter->end_sk)
3507 		sock_put(iter->batch[iter->cur_sk++]);
3508 }
3509 
3510 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3511 				       unsigned int new_batch_sz)
3512 {
3513 	struct sock **new_batch;
3514 
3515 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3516 			     GFP_USER | __GFP_NOWARN);
3517 	if (!new_batch)
3518 		return -ENOMEM;
3519 
3520 	bpf_iter_unix_put_batch(iter);
3521 	kvfree(iter->batch);
3522 	iter->batch = new_batch;
3523 	iter->max_sk = new_batch_sz;
3524 
3525 	return 0;
3526 }
3527 
3528 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3529 					loff_t *pos)
3530 {
3531 	struct bpf_unix_iter_state *iter = seq->private;
3532 	unsigned int expected;
3533 	bool resized = false;
3534 	struct sock *sk;
3535 
3536 	if (iter->st_bucket_done)
3537 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3538 
3539 again:
3540 	/* Get a new batch */
3541 	iter->cur_sk = 0;
3542 	iter->end_sk = 0;
3543 
3544 	sk = unix_get_first(seq, pos);
3545 	if (!sk)
3546 		return NULL; /* Done */
3547 
3548 	expected = bpf_iter_unix_hold_batch(seq, sk);
3549 
3550 	if (iter->end_sk == expected) {
3551 		iter->st_bucket_done = true;
3552 		return sk;
3553 	}
3554 
3555 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3556 		resized = true;
3557 		goto again;
3558 	}
3559 
3560 	return sk;
3561 }
3562 
3563 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3564 {
3565 	if (!*pos)
3566 		return SEQ_START_TOKEN;
3567 
3568 	/* bpf iter does not support lseek, so it always
3569 	 * continues from where it was stop()-ped.
3570 	 */
3571 	return bpf_iter_unix_batch(seq, pos);
3572 }
3573 
3574 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3575 {
3576 	struct bpf_unix_iter_state *iter = seq->private;
3577 	struct sock *sk;
3578 
3579 	/* Whenever seq_next() is called, the iter->cur_sk is
3580 	 * done with seq_show(), so advance to the next sk in
3581 	 * the batch.
3582 	 */
3583 	if (iter->cur_sk < iter->end_sk)
3584 		sock_put(iter->batch[iter->cur_sk++]);
3585 
3586 	++*pos;
3587 
3588 	if (iter->cur_sk < iter->end_sk)
3589 		sk = iter->batch[iter->cur_sk];
3590 	else
3591 		sk = bpf_iter_unix_batch(seq, pos);
3592 
3593 	return sk;
3594 }
3595 
3596 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3597 {
3598 	struct bpf_iter_meta meta;
3599 	struct bpf_prog *prog;
3600 	struct sock *sk = v;
3601 	uid_t uid;
3602 	bool slow;
3603 	int ret;
3604 
3605 	if (v == SEQ_START_TOKEN)
3606 		return 0;
3607 
3608 	slow = lock_sock_fast(sk);
3609 
3610 	if (unlikely(sk_unhashed(sk))) {
3611 		ret = SEQ_SKIP;
3612 		goto unlock;
3613 	}
3614 
3615 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3616 	meta.seq = seq;
3617 	prog = bpf_iter_get_info(&meta, false);
3618 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3619 unlock:
3620 	unlock_sock_fast(sk, slow);
3621 	return ret;
3622 }
3623 
3624 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3625 {
3626 	struct bpf_unix_iter_state *iter = seq->private;
3627 	struct bpf_iter_meta meta;
3628 	struct bpf_prog *prog;
3629 
3630 	if (!v) {
3631 		meta.seq = seq;
3632 		prog = bpf_iter_get_info(&meta, true);
3633 		if (prog)
3634 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3635 	}
3636 
3637 	if (iter->cur_sk < iter->end_sk)
3638 		bpf_iter_unix_put_batch(iter);
3639 }
3640 
3641 static const struct seq_operations bpf_iter_unix_seq_ops = {
3642 	.start	= bpf_iter_unix_seq_start,
3643 	.next	= bpf_iter_unix_seq_next,
3644 	.stop	= bpf_iter_unix_seq_stop,
3645 	.show	= bpf_iter_unix_seq_show,
3646 };
3647 #endif
3648 #endif
3649 
3650 static const struct net_proto_family unix_family_ops = {
3651 	.family = PF_UNIX,
3652 	.create = unix_create,
3653 	.owner	= THIS_MODULE,
3654 };
3655 
3656 
3657 static int __net_init unix_net_init(struct net *net)
3658 {
3659 	int i;
3660 
3661 	net->unx.sysctl_max_dgram_qlen = 10;
3662 	if (unix_sysctl_register(net))
3663 		goto out;
3664 
3665 #ifdef CONFIG_PROC_FS
3666 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3667 			     sizeof(struct seq_net_private)))
3668 		goto err_sysctl;
3669 #endif
3670 
3671 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3672 					      sizeof(spinlock_t), GFP_KERNEL);
3673 	if (!net->unx.table.locks)
3674 		goto err_proc;
3675 
3676 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3677 						sizeof(struct hlist_head),
3678 						GFP_KERNEL);
3679 	if (!net->unx.table.buckets)
3680 		goto free_locks;
3681 
3682 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3683 		spin_lock_init(&net->unx.table.locks[i]);
3684 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3685 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3686 	}
3687 
3688 	return 0;
3689 
3690 free_locks:
3691 	kvfree(net->unx.table.locks);
3692 err_proc:
3693 #ifdef CONFIG_PROC_FS
3694 	remove_proc_entry("unix", net->proc_net);
3695 err_sysctl:
3696 #endif
3697 	unix_sysctl_unregister(net);
3698 out:
3699 	return -ENOMEM;
3700 }
3701 
3702 static void __net_exit unix_net_exit(struct net *net)
3703 {
3704 	kvfree(net->unx.table.buckets);
3705 	kvfree(net->unx.table.locks);
3706 	unix_sysctl_unregister(net);
3707 	remove_proc_entry("unix", net->proc_net);
3708 }
3709 
3710 static struct pernet_operations unix_net_ops = {
3711 	.init = unix_net_init,
3712 	.exit = unix_net_exit,
3713 };
3714 
3715 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3716 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3717 		     struct unix_sock *unix_sk, uid_t uid)
3718 
3719 #define INIT_BATCH_SZ 16
3720 
3721 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3722 {
3723 	struct bpf_unix_iter_state *iter = priv_data;
3724 	int err;
3725 
3726 	err = bpf_iter_init_seq_net(priv_data, aux);
3727 	if (err)
3728 		return err;
3729 
3730 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3731 	if (err) {
3732 		bpf_iter_fini_seq_net(priv_data);
3733 		return err;
3734 	}
3735 
3736 	return 0;
3737 }
3738 
3739 static void bpf_iter_fini_unix(void *priv_data)
3740 {
3741 	struct bpf_unix_iter_state *iter = priv_data;
3742 
3743 	bpf_iter_fini_seq_net(priv_data);
3744 	kvfree(iter->batch);
3745 }
3746 
3747 static const struct bpf_iter_seq_info unix_seq_info = {
3748 	.seq_ops		= &bpf_iter_unix_seq_ops,
3749 	.init_seq_private	= bpf_iter_init_unix,
3750 	.fini_seq_private	= bpf_iter_fini_unix,
3751 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3752 };
3753 
3754 static const struct bpf_func_proto *
3755 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3756 			     const struct bpf_prog *prog)
3757 {
3758 	switch (func_id) {
3759 	case BPF_FUNC_setsockopt:
3760 		return &bpf_sk_setsockopt_proto;
3761 	case BPF_FUNC_getsockopt:
3762 		return &bpf_sk_getsockopt_proto;
3763 	default:
3764 		return NULL;
3765 	}
3766 }
3767 
3768 static struct bpf_iter_reg unix_reg_info = {
3769 	.target			= "unix",
3770 	.ctx_arg_info_size	= 1,
3771 	.ctx_arg_info		= {
3772 		{ offsetof(struct bpf_iter__unix, unix_sk),
3773 		  PTR_TO_BTF_ID_OR_NULL },
3774 	},
3775 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3776 	.seq_info		= &unix_seq_info,
3777 };
3778 
3779 static void __init bpf_iter_register(void)
3780 {
3781 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3782 	if (bpf_iter_reg_target(&unix_reg_info))
3783 		pr_warn("Warning: could not register bpf iterator unix\n");
3784 }
3785 #endif
3786 
3787 static int __init af_unix_init(void)
3788 {
3789 	int i, rc = -1;
3790 
3791 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3792 
3793 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3794 		spin_lock_init(&bsd_socket_locks[i]);
3795 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3796 	}
3797 
3798 	rc = proto_register(&unix_dgram_proto, 1);
3799 	if (rc != 0) {
3800 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3801 		goto out;
3802 	}
3803 
3804 	rc = proto_register(&unix_stream_proto, 1);
3805 	if (rc != 0) {
3806 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3807 		proto_unregister(&unix_dgram_proto);
3808 		goto out;
3809 	}
3810 
3811 	sock_register(&unix_family_ops);
3812 	register_pernet_subsys(&unix_net_ops);
3813 	unix_bpf_build_proto();
3814 
3815 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3816 	bpf_iter_register();
3817 #endif
3818 
3819 out:
3820 	return rc;
3821 }
3822 
3823 /* Later than subsys_initcall() because we depend on stuff initialised there */
3824 fs_initcall(af_unix_init);
3825