xref: /linux/net/unix/af_unix.c (revision d0f93ac2c384c40202cf393fa7e8a2cac7004ba1)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * NET4:	Implementation of BSD Unix domain sockets.
4   *
5   * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6   *
7   * Fixes:
8   *		Linus Torvalds	:	Assorted bug cures.
9   *		Niibe Yutaka	:	async I/O support.
10   *		Carsten Paeth	:	PF_UNIX check, address fixes.
11   *		Alan Cox	:	Limit size of allocated blocks.
12   *		Alan Cox	:	Fixed the stupid socketpair bug.
13   *		Alan Cox	:	BSD compatibility fine tuning.
14   *		Alan Cox	:	Fixed a bug in connect when interrupted.
15   *		Alan Cox	:	Sorted out a proper draft version of
16   *					file descriptor passing hacked up from
17   *					Mike Shaver's work.
18   *		Marty Leisner	:	Fixes to fd passing
19   *		Nick Nevin	:	recvmsg bugfix.
20   *		Alan Cox	:	Started proper garbage collector
21   *		Heiko EiBfeldt	:	Missing verify_area check
22   *		Alan Cox	:	Started POSIXisms
23   *		Andreas Schwab	:	Replace inode by dentry for proper
24   *					reference counting
25   *		Kirk Petersen	:	Made this a module
26   *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27   *					Lots of bug fixes.
28   *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29   *					by above two patches.
30   *	     Andrea Arcangeli	:	If possible we block in connect(2)
31   *					if the max backlog of the listen socket
32   *					has been reached. This won't break
33   *					old apps and it will avoid a huge
34   *					amount of socks hashed (for unix_gc()
35   *					performance reasons).
36   *					Security fix that limits the max
37   *					number of socks to 2*max_files and
38   *					the number of skb queueable in the
39   *					dgram receiver.
40   *		Artur Skawina   :	Hash function optimizations
41   *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
42   *	      Malcolm Beattie   :	Set peercred for socketpair
43   *	     Michal Ostrowski   :       Module initialization cleanup.
44   *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45   *	     				the core infrastructure is doing that
46   *	     				for all net proto families now (2.5.69+)
47   *
48   * Known differences from reference BSD that was tested:
49   *
50   *	[TO FIX]
51   *	ECONNREFUSED is not returned from one end of a connected() socket to the
52   *		other the moment one end closes.
53   *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54   *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55   *	[NOT TO FIX]
56   *	accept() returns a path name even if the connecting socket has closed
57   *		in the meantime (BSD loses the path and gives up).
58   *	accept() returns 0 length path for an unbound connector. BSD returns 16
59   *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60   *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61   *	BSD af_unix apparently has connect forgetting to block properly.
62   *		(need to check this with the POSIX spec in detail)
63   *
64   * Differences from 2.0.0-11-... (ANK)
65   *	Bug fixes and improvements.
66   *		- client shutdown killed server socket.
67   *		- removed all useless cli/sti pairs.
68   *
69   *	Semantic changes/extensions.
70   *		- generic control message passing.
71   *		- SCM_CREDENTIALS control message.
72   *		- "Abstract" (not FS based) socket bindings.
73   *		  Abstract names are sequences of bytes (not zero terminated)
74   *		  starting with 0, so that this name space does not intersect
75   *		  with BSD names.
76   */
77  
78  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79  
80  #include <linux/module.h>
81  #include <linux/kernel.h>
82  #include <linux/signal.h>
83  #include <linux/sched/signal.h>
84  #include <linux/errno.h>
85  #include <linux/string.h>
86  #include <linux/stat.h>
87  #include <linux/dcache.h>
88  #include <linux/namei.h>
89  #include <linux/socket.h>
90  #include <linux/un.h>
91  #include <linux/fcntl.h>
92  #include <linux/filter.h>
93  #include <linux/termios.h>
94  #include <linux/sockios.h>
95  #include <linux/net.h>
96  #include <linux/in.h>
97  #include <linux/fs.h>
98  #include <linux/slab.h>
99  #include <linux/uaccess.h>
100  #include <linux/skbuff.h>
101  #include <linux/netdevice.h>
102  #include <net/net_namespace.h>
103  #include <net/sock.h>
104  #include <net/tcp_states.h>
105  #include <net/af_unix.h>
106  #include <linux/proc_fs.h>
107  #include <linux/seq_file.h>
108  #include <net/scm.h>
109  #include <linux/init.h>
110  #include <linux/poll.h>
111  #include <linux/rtnetlink.h>
112  #include <linux/mount.h>
113  #include <net/checksum.h>
114  #include <linux/security.h>
115  #include <linux/splice.h>
116  #include <linux/freezer.h>
117  #include <linux/file.h>
118  #include <linux/btf_ids.h>
119  #include <linux/bpf-cgroup.h>
120  
121  static atomic_long_t unix_nr_socks;
122  static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123  static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124  
125  /* SMP locking strategy:
126   *    hash table is protected with spinlock.
127   *    each socket state is protected by separate spinlock.
128   */
129  #ifdef CONFIG_PROVE_LOCKING
130  #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
131  
132  static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133  				  const struct lockdep_map *b)
134  {
135  	return cmp_ptr(a, b);
136  }
137  
138  static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139  				  const struct lockdep_map *_b)
140  {
141  	const struct unix_sock *a, *b;
142  
143  	a = container_of(_a, struct unix_sock, lock.dep_map);
144  	b = container_of(_b, struct unix_sock, lock.dep_map);
145  
146  	if (a->sk.sk_state == TCP_LISTEN) {
147  		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
148  		 *
149  		 *   1. a is TCP_LISTEN.
150  		 *   2. b is not a.
151  		 *   3. concurrent connect(b -> a) must fail.
152  		 *
153  		 * Except for 2. & 3., the b's state can be any possible
154  		 * value due to concurrent connect() or listen().
155  		 *
156  		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
157  		 * be expressed as lock_cmp_fn.
158  		 */
159  		switch (b->sk.sk_state) {
160  		case TCP_CLOSE:
161  		case TCP_ESTABLISHED:
162  		case TCP_LISTEN:
163  			return -1;
164  		default:
165  			/* Invalid case. */
166  			return 0;
167  		}
168  	}
169  
170  	/* Should never happen.  Just to be symmetric. */
171  	if (b->sk.sk_state == TCP_LISTEN) {
172  		switch (a->sk.sk_state) {
173  		case TCP_CLOSE:
174  		case TCP_ESTABLISHED:
175  			return 1;
176  		default:
177  			return 0;
178  		}
179  	}
180  
181  	/* unix_state_double_lock(): ascending address order. */
182  	return cmp_ptr(a, b);
183  }
184  
185  static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186  				  const struct lockdep_map *_b)
187  {
188  	const struct sock *a, *b;
189  
190  	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191  	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192  
193  	/* unix_collect_skb(): listener -> embryo order. */
194  	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195  		return -1;
196  
197  	/* Should never happen.  Just to be symmetric. */
198  	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199  		return 1;
200  
201  	return 0;
202  }
203  #endif
204  
205  static unsigned int unix_unbound_hash(struct sock *sk)
206  {
207  	unsigned long hash = (unsigned long)sk;
208  
209  	hash ^= hash >> 16;
210  	hash ^= hash >> 8;
211  	hash ^= sk->sk_type;
212  
213  	return hash & UNIX_HASH_MOD;
214  }
215  
216  static unsigned int unix_bsd_hash(struct inode *i)
217  {
218  	return i->i_ino & UNIX_HASH_MOD;
219  }
220  
221  static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222  				       int addr_len, int type)
223  {
224  	__wsum csum = csum_partial(sunaddr, addr_len, 0);
225  	unsigned int hash;
226  
227  	hash = (__force unsigned int)csum_fold(csum);
228  	hash ^= hash >> 8;
229  	hash ^= type;
230  
231  	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232  }
233  
234  static void unix_table_double_lock(struct net *net,
235  				   unsigned int hash1, unsigned int hash2)
236  {
237  	if (hash1 == hash2) {
238  		spin_lock(&net->unx.table.locks[hash1]);
239  		return;
240  	}
241  
242  	if (hash1 > hash2)
243  		swap(hash1, hash2);
244  
245  	spin_lock(&net->unx.table.locks[hash1]);
246  	spin_lock(&net->unx.table.locks[hash2]);
247  }
248  
249  static void unix_table_double_unlock(struct net *net,
250  				     unsigned int hash1, unsigned int hash2)
251  {
252  	if (hash1 == hash2) {
253  		spin_unlock(&net->unx.table.locks[hash1]);
254  		return;
255  	}
256  
257  	spin_unlock(&net->unx.table.locks[hash1]);
258  	spin_unlock(&net->unx.table.locks[hash2]);
259  }
260  
261  #ifdef CONFIG_SECURITY_NETWORK
262  static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263  {
264  	UNIXCB(skb).secid = scm->secid;
265  }
266  
267  static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268  {
269  	scm->secid = UNIXCB(skb).secid;
270  }
271  
272  static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273  {
274  	return (scm->secid == UNIXCB(skb).secid);
275  }
276  #else
277  static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278  { }
279  
280  static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281  { }
282  
283  static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284  {
285  	return true;
286  }
287  #endif /* CONFIG_SECURITY_NETWORK */
288  
289  static inline int unix_our_peer(struct sock *sk, struct sock *osk)
290  {
291  	return unix_peer(osk) == sk;
292  }
293  
294  static inline int unix_may_send(struct sock *sk, struct sock *osk)
295  {
296  	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
297  }
298  
299  static inline int unix_recvq_full_lockless(const struct sock *sk)
300  {
301  	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
302  }
303  
304  struct sock *unix_peer_get(struct sock *s)
305  {
306  	struct sock *peer;
307  
308  	unix_state_lock(s);
309  	peer = unix_peer(s);
310  	if (peer)
311  		sock_hold(peer);
312  	unix_state_unlock(s);
313  	return peer;
314  }
315  EXPORT_SYMBOL_GPL(unix_peer_get);
316  
317  static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
318  					     int addr_len)
319  {
320  	struct unix_address *addr;
321  
322  	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
323  	if (!addr)
324  		return NULL;
325  
326  	refcount_set(&addr->refcnt, 1);
327  	addr->len = addr_len;
328  	memcpy(addr->name, sunaddr, addr_len);
329  
330  	return addr;
331  }
332  
333  static inline void unix_release_addr(struct unix_address *addr)
334  {
335  	if (refcount_dec_and_test(&addr->refcnt))
336  		kfree(addr);
337  }
338  
339  /*
340   *	Check unix socket name:
341   *		- should not be zero length.
342   *	        - if it does not start with a zero byte, it must be NUL terminated (FS object)
343   *		- if it starts with a zero byte, it is an abstract name.
344   */
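/* Illustrative userspace sketch (not part of the kernel build; fd is assumed
 * to be an AF_UNIX socket): the three address forms distinguished below.
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	// 1. Filesystem name: sun_path[0] != 0, NUL terminated.
 *	strcpy(a.sun_path, "/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	// 2. Abstract name: sun_path[0] == 0, addr_len bounds the name.
 *	memset(&a, 0, sizeof(a));
 *	a.sun_family = AF_UNIX;
 *	memcpy(a.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 *
 *	// 3. Zero-length name (sun_family only): triggers autobind, see
 *	//    unix_autobind() further down.
 *	bind(fd, (struct sockaddr *)&a, sizeof(sa_family_t));
 */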
345  
346  static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
347  {
348  	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
349  	    addr_len > sizeof(*sunaddr))
350  		return -EINVAL;
351  
352  	if (sunaddr->sun_family != AF_UNIX)
353  		return -EINVAL;
354  
355  	return 0;
356  }
357  
358  static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
359  {
360  	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
361  	short offset = offsetof(struct sockaddr_storage, __data);
362  
363  	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
364  
365  	/* This may look like an off by one error but it is a bit more
366  	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
367  	 * sun_path[108] doesn't as such exist.  However in kernel space
368  	 * we are guaranteed that it is a valid memory location in our
369  	 * kernel address buffer because syscall functions always pass
370  	 * a pointer of struct sockaddr_storage which has a bigger buffer
371  	 * than 108.  Also, we must terminate sun_path for strlen() in
372  	 * getname_kernel().
373  	 */
374  	addr->__data[addr_len - offset] = 0;
375  
376  	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
377  	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
378  	 * know the actual buffer.
379  	 */
380  	return strlen(addr->__data) + offset + 1;
381  }
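/* Illustrative userspace sketch (assumption: fd is an AF_UNIX socket): a
 * caller may legally fill all 108 bytes of sun_path with no trailing NUL;
 * the termination added above happens on the kernel-side copy, which lives
 * in a larger sockaddr_storage buffer.
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	memset(a.sun_path, 'x', sizeof(a.sun_path));	// 108 bytes, no NUL
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));	// still valid
 */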
382  
383  static void __unix_remove_socket(struct sock *sk)
384  {
385  	sk_del_node_init(sk);
386  }
387  
388  static void __unix_insert_socket(struct net *net, struct sock *sk)
389  {
390  	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
391  	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
392  }
393  
394  static void __unix_set_addr_hash(struct net *net, struct sock *sk,
395  				 struct unix_address *addr, unsigned int hash)
396  {
397  	__unix_remove_socket(sk);
398  	smp_store_release(&unix_sk(sk)->addr, addr);
399  
400  	sk->sk_hash = hash;
401  	__unix_insert_socket(net, sk);
402  }
403  
404  static void unix_remove_socket(struct net *net, struct sock *sk)
405  {
406  	spin_lock(&net->unx.table.locks[sk->sk_hash]);
407  	__unix_remove_socket(sk);
408  	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
409  }
410  
411  static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
412  {
413  	spin_lock(&net->unx.table.locks[sk->sk_hash]);
414  	__unix_insert_socket(net, sk);
415  	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
416  }
417  
418  static void unix_insert_bsd_socket(struct sock *sk)
419  {
420  	spin_lock(&bsd_socket_locks[sk->sk_hash]);
421  	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
422  	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
423  }
424  
425  static void unix_remove_bsd_socket(struct sock *sk)
426  {
427  	if (!hlist_unhashed(&sk->sk_bind_node)) {
428  		spin_lock(&bsd_socket_locks[sk->sk_hash]);
429  		__sk_del_bind_node(sk);
430  		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
431  
432  		sk_node_init(&sk->sk_bind_node);
433  	}
434  }
435  
436  static struct sock *__unix_find_socket_byname(struct net *net,
437  					      struct sockaddr_un *sunname,
438  					      int len, unsigned int hash)
439  {
440  	struct sock *s;
441  
442  	sk_for_each(s, &net->unx.table.buckets[hash]) {
443  		struct unix_sock *u = unix_sk(s);
444  
445  		if (u->addr->len == len &&
446  		    !memcmp(u->addr->name, sunname, len))
447  			return s;
448  	}
449  	return NULL;
450  }
451  
452  static inline struct sock *unix_find_socket_byname(struct net *net,
453  						   struct sockaddr_un *sunname,
454  						   int len, unsigned int hash)
455  {
456  	struct sock *s;
457  
458  	spin_lock(&net->unx.table.locks[hash]);
459  	s = __unix_find_socket_byname(net, sunname, len, hash);
460  	if (s)
461  		sock_hold(s);
462  	spin_unlock(&net->unx.table.locks[hash]);
463  	return s;
464  }
465  
466  static struct sock *unix_find_socket_byinode(struct inode *i)
467  {
468  	unsigned int hash = unix_bsd_hash(i);
469  	struct sock *s;
470  
471  	spin_lock(&bsd_socket_locks[hash]);
472  	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
473  		struct dentry *dentry = unix_sk(s)->path.dentry;
474  
475  		if (dentry && d_backing_inode(dentry) == i) {
476  			sock_hold(s);
477  			spin_unlock(&bsd_socket_locks[hash]);
478  			return s;
479  		}
480  	}
481  	spin_unlock(&bsd_socket_locks[hash]);
482  	return NULL;
483  }
484  
485  /* Support code for asymmetrically connected dgram sockets
486   *
487   * If a datagram socket is connected to a socket not itself connected
488   * to the first socket (eg, /dev/log), clients may only enqueue more
489   * messages if the present receive queue of the server socket is not
490   * "too large". This means there's a second writeability condition
491   * poll and sendmsg need to test. The dgram recv code will do a wake
492   * up on the peer_wait wait queue of a socket upon reception of a
493   * datagram which needs to be propagated to sleeping would-be writers
494   * since these might not have sent anything so far. This can't be
495   * accomplished via poll_wait because the lifetime of the server
496   * socket might be less than that of its clients if these break their
497   * association with it or if the server socket is closed while clients
498   * are still connected to it and there's no way to inform "a polling
499   * implementation" that it should let go of a certain wait queue
500   *
501   * In order to propagate a wake up, a wait_queue_entry_t of the client
502   * socket is enqueued on the peer_wait queue of the server socket
503   * whose wake function does a wake_up on the ordinary client socket
504   * wait queue. This connection is established whenever a write (or
505   * poll for write) hits the flow control condition and is broken when the
506   * association to the server socket is dissolved or after a wake up
507   * was relayed.
508   */
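/* Illustrative userspace sketch of the scenario above (hypothetical server
 * path, not part of the kernel build): a connected datagram client blocks
 * its writes on the server's receive queue; the peer_wait relay below is
 * what turns the server's reads into POLLOUT wakeups for the client.
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct sockaddr_un srv = { .sun_family = AF_UNIX };
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	strcpy(srv.sun_path, "/run/example-log.sock");
 *	connect(fd, (struct sockaddr *)&srv, sizeof(srv));
 *
 *	while (send(fd, "msg", 3, MSG_DONTWAIT) < 0 && errno == EAGAIN)
 *		poll(&pfd, 1, -1);	// woken via unix_dgram_peer_wake_relay()
 */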
509  
510  static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
511  				      void *key)
512  {
513  	struct unix_sock *u;
514  	wait_queue_head_t *u_sleep;
515  
516  	u = container_of(q, struct unix_sock, peer_wake);
517  
518  	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
519  			    q);
520  	u->peer_wake.private = NULL;
521  
522  	/* relaying can only happen while the wq still exists */
523  	u_sleep = sk_sleep(&u->sk);
524  	if (u_sleep)
525  		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
526  
527  	return 0;
528  }
529  
530  static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
531  {
532  	struct unix_sock *u, *u_other;
533  	int rc;
534  
535  	u = unix_sk(sk);
536  	u_other = unix_sk(other);
537  	rc = 0;
538  	spin_lock(&u_other->peer_wait.lock);
539  
540  	if (!u->peer_wake.private) {
541  		u->peer_wake.private = other;
542  		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
543  
544  		rc = 1;
545  	}
546  
547  	spin_unlock(&u_other->peer_wait.lock);
548  	return rc;
549  }
550  
551  static void unix_dgram_peer_wake_disconnect(struct sock *sk,
552  					    struct sock *other)
553  {
554  	struct unix_sock *u, *u_other;
555  
556  	u = unix_sk(sk);
557  	u_other = unix_sk(other);
558  	spin_lock(&u_other->peer_wait.lock);
559  
560  	if (u->peer_wake.private == other) {
561  		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
562  		u->peer_wake.private = NULL;
563  	}
564  
565  	spin_unlock(&u_other->peer_wait.lock);
566  }
567  
568  static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
569  						   struct sock *other)
570  {
571  	unix_dgram_peer_wake_disconnect(sk, other);
572  	wake_up_interruptible_poll(sk_sleep(sk),
573  				   EPOLLOUT |
574  				   EPOLLWRNORM |
575  				   EPOLLWRBAND);
576  }
577  
578  /* preconditions:
579   *	- unix_peer(sk) == other
580   *	- association is stable
581   */
582  static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
583  {
584  	int connected;
585  
586  	connected = unix_dgram_peer_wake_connect(sk, other);
587  
588  	/* If other is SOCK_DEAD, we want to make sure we signal
589  	 * POLLOUT, such that a subsequent write() can get a
590  	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
591  	 * to other and it's full, we will hang waiting for POLLOUT.
592  	 */
593  	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
594  		return 1;
595  
596  	if (connected)
597  		unix_dgram_peer_wake_disconnect(sk, other);
598  
599  	return 0;
600  }
601  
602  static int unix_writable(const struct sock *sk, unsigned char state)
603  {
604  	return state != TCP_LISTEN &&
605  		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
606  }
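/* Worked example (illustrative numbers, assuming the common 212992-byte
 * default send buffer): unix_writable() keeps reporting writability while
 * wmem_alloc * 4 <= sk_sndbuf, i.e. until roughly 52 KiB of skb memory is
 * charged to the sending socket.
 */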
607  
608  static void unix_write_space(struct sock *sk)
609  {
610  	struct socket_wq *wq;
611  
612  	rcu_read_lock();
613  	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
614  		wq = rcu_dereference(sk->sk_wq);
615  		if (skwq_has_sleeper(wq))
616  			wake_up_interruptible_sync_poll(&wq->wait,
617  				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
618  		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
619  	}
620  	rcu_read_unlock();
621  }
622  
623  /* When a dgram socket disconnects (or changes its peer), we clear its receive
624   * queue of packets that arrived from the previous peer. First, this allows
625   * flow control based only on wmem_alloc; second, an sk connected to a peer
626   * may receive messages only from that peer. */
627  static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
628  {
629  	if (!skb_queue_empty(&sk->sk_receive_queue)) {
630  		skb_queue_purge(&sk->sk_receive_queue);
631  		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
632  
633  		/* If one link of a bidirectional dgram pipe is disconnected,
634  		 * we signal an error. Messages are lost. Do not do this
635  		 * when the peer was not connected to us.
636  		 */
637  		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
638  			WRITE_ONCE(other->sk_err, ECONNRESET);
639  			sk_error_report(other);
640  		}
641  	}
642  }
643  
644  static void unix_sock_destructor(struct sock *sk)
645  {
646  	struct unix_sock *u = unix_sk(sk);
647  
648  	skb_queue_purge(&sk->sk_receive_queue);
649  
650  	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
651  	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
652  	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
653  	if (!sock_flag(sk, SOCK_DEAD)) {
654  		pr_info("Attempt to release alive unix socket: %p\n", sk);
655  		return;
656  	}
657  
658  	if (u->addr)
659  		unix_release_addr(u->addr);
660  
661  	atomic_long_dec(&unix_nr_socks);
662  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
663  #ifdef UNIX_REFCNT_DEBUG
664  	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
665  		atomic_long_read(&unix_nr_socks));
666  #endif
667  }
668  
669  static void unix_release_sock(struct sock *sk, int embrion)
670  {
671  	struct unix_sock *u = unix_sk(sk);
672  	struct sock *skpair;
673  	struct sk_buff *skb;
674  	struct path path;
675  	int state;
676  
677  	unix_remove_socket(sock_net(sk), sk);
678  	unix_remove_bsd_socket(sk);
679  
680  	/* Clear state */
681  	unix_state_lock(sk);
682  	sock_orphan(sk);
683  	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
684  	path	     = u->path;
685  	u->path.dentry = NULL;
686  	u->path.mnt = NULL;
687  	state = sk->sk_state;
688  	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
689  
690  	skpair = unix_peer(sk);
691  	unix_peer(sk) = NULL;
692  
693  	unix_state_unlock(sk);
694  
695  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
696  	u->oob_skb = NULL;
697  #endif
698  
699  	wake_up_interruptible_all(&u->peer_wait);
700  
701  	if (skpair != NULL) {
702  		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
703  			unix_state_lock(skpair);
704  			/* No more writes */
705  			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
706  			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
707  				WRITE_ONCE(skpair->sk_err, ECONNRESET);
708  			unix_state_unlock(skpair);
709  			skpair->sk_state_change(skpair);
710  			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
711  		}
712  
713  		unix_dgram_peer_wake_disconnect(sk, skpair);
714  		sock_put(skpair); /* It may now die */
715  	}
716  
717  	/* Try to flush out this socket. Throw out buffers at least */
718  
719  	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
720  		if (state == TCP_LISTEN)
721  			unix_release_sock(skb->sk, 1);
722  
723  		/* passed fds are erased in the kfree_skb hook	      */
724  		kfree_skb(skb);
725  	}
726  
727  	if (path.dentry)
728  		path_put(&path);
729  
730  	sock_put(sk);
731  
732  	/* ---- Socket is dead now and most probably destroyed ---- */
733  
734  	/*
735  	 * Fixme: BSD difference: In BSD all sockets connected to us get
736  	 *	  ECONNRESET and we die on the spot. In Linux we behave
737  	 *	  like files and pipes do and wait for the last
738  	 *	  dereference.
739  	 *
740  	 * Can't we simply set sock->err?
741  	 *
742  	 *	  What the above comment does talk about? --ANK(980817)
743  	 */
744  
745  	if (READ_ONCE(unix_tot_inflight))
746  		unix_gc();		/* Garbage collect fds */
747  }
748  
749  static void init_peercred(struct sock *sk)
750  {
751  	sk->sk_peer_pid = get_pid(task_tgid(current));
752  	sk->sk_peer_cred = get_current_cred();
753  }
754  
755  static void update_peercred(struct sock *sk)
756  {
757  	const struct cred *old_cred;
758  	struct pid *old_pid;
759  
760  	spin_lock(&sk->sk_peer_lock);
761  	old_pid = sk->sk_peer_pid;
762  	old_cred = sk->sk_peer_cred;
763  	init_peercred(sk);
764  	spin_unlock(&sk->sk_peer_lock);
765  
766  	put_pid(old_pid);
767  	put_cred(old_cred);
768  }
769  
770  static void copy_peercred(struct sock *sk, struct sock *peersk)
771  {
772  	lockdep_assert_held(&unix_sk(peersk)->lock);
773  
774  	spin_lock(&sk->sk_peer_lock);
775  	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
776  	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
777  	spin_unlock(&sk->sk_peer_lock);
778  }
779  
780  static int unix_listen(struct socket *sock, int backlog)
781  {
782  	int err;
783  	struct sock *sk = sock->sk;
784  	struct unix_sock *u = unix_sk(sk);
785  
786  	err = -EOPNOTSUPP;
787  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
788  		goto out;	/* Only stream/seqpacket sockets accept */
789  	err = -EINVAL;
790  	if (!READ_ONCE(u->addr))
791  		goto out;	/* No listens on an unbound socket */
792  	unix_state_lock(sk);
793  	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
794  		goto out_unlock;
795  	if (backlog > sk->sk_max_ack_backlog)
796  		wake_up_interruptible_all(&u->peer_wait);
797  	sk->sk_max_ack_backlog	= backlog;
798  	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
799  
800  	/* set credentials so connect can copy them */
801  	update_peercred(sk);
802  	err = 0;
803  
804  out_unlock:
805  	unix_state_unlock(sk);
806  out:
807  	return err;
808  }
809  
810  static int unix_release(struct socket *);
811  static int unix_bind(struct socket *, struct sockaddr *, int);
812  static int unix_stream_connect(struct socket *, struct sockaddr *,
813  			       int addr_len, int flags);
814  static int unix_socketpair(struct socket *, struct socket *);
815  static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
816  static int unix_getname(struct socket *, struct sockaddr *, int);
817  static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
818  static __poll_t unix_dgram_poll(struct file *, struct socket *,
819  				    poll_table *);
820  static int unix_ioctl(struct socket *, unsigned int, unsigned long);
821  #ifdef CONFIG_COMPAT
822  static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
823  #endif
824  static int unix_shutdown(struct socket *, int);
825  static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
826  static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
827  static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
828  				       struct pipe_inode_info *, size_t size,
829  				       unsigned int flags);
830  static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
831  static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
832  static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
833  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
834  static int unix_dgram_connect(struct socket *, struct sockaddr *,
835  			      int, int);
836  static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
837  static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
838  				  int);
839  
840  #ifdef CONFIG_PROC_FS
841  static int unix_count_nr_fds(struct sock *sk)
842  {
843  	struct sk_buff *skb;
844  	struct unix_sock *u;
845  	int nr_fds = 0;
846  
847  	spin_lock(&sk->sk_receive_queue.lock);
848  	skb = skb_peek(&sk->sk_receive_queue);
849  	while (skb) {
850  		u = unix_sk(skb->sk);
851  		nr_fds += atomic_read(&u->scm_stat.nr_fds);
852  		skb = skb_peek_next(skb, &sk->sk_receive_queue);
853  	}
854  	spin_unlock(&sk->sk_receive_queue.lock);
855  
856  	return nr_fds;
857  }
858  
859  static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
860  {
861  	struct sock *sk = sock->sk;
862  	unsigned char s_state;
863  	struct unix_sock *u;
864  	int nr_fds = 0;
865  
866  	if (sk) {
867  		s_state = READ_ONCE(sk->sk_state);
868  		u = unix_sk(sk);
869  
870  		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
871  		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
872  		 * SOCK_DGRAM is ordinary. So, no lock is needed.
873  		 */
874  		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
875  			nr_fds = atomic_read(&u->scm_stat.nr_fds);
876  		else if (s_state == TCP_LISTEN)
877  			nr_fds = unix_count_nr_fds(sk);
878  
879  		seq_printf(m, "scm_fds: %u\n", nr_fds);
880  	}
881  }
882  #else
883  #define unix_show_fdinfo NULL
884  #endif
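/* Illustrative usage (hypothetical pid/fd values): the counter printed above
 * is visible from userspace, e.g. "cat /proc/<pid>/fdinfo/<fd>" shows a line
 * such as "scm_fds: 2" when two SCM_RIGHTS descriptors are still queued on
 * the socket's receive path.
 */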
885  
886  static const struct proto_ops unix_stream_ops = {
887  	.family =	PF_UNIX,
888  	.owner =	THIS_MODULE,
889  	.release =	unix_release,
890  	.bind =		unix_bind,
891  	.connect =	unix_stream_connect,
892  	.socketpair =	unix_socketpair,
893  	.accept =	unix_accept,
894  	.getname =	unix_getname,
895  	.poll =		unix_poll,
896  	.ioctl =	unix_ioctl,
897  #ifdef CONFIG_COMPAT
898  	.compat_ioctl =	unix_compat_ioctl,
899  #endif
900  	.listen =	unix_listen,
901  	.shutdown =	unix_shutdown,
902  	.sendmsg =	unix_stream_sendmsg,
903  	.recvmsg =	unix_stream_recvmsg,
904  	.read_skb =	unix_stream_read_skb,
905  	.mmap =		sock_no_mmap,
906  	.splice_read =	unix_stream_splice_read,
907  	.set_peek_off =	sk_set_peek_off,
908  	.show_fdinfo =	unix_show_fdinfo,
909  };
910  
911  static const struct proto_ops unix_dgram_ops = {
912  	.family =	PF_UNIX,
913  	.owner =	THIS_MODULE,
914  	.release =	unix_release,
915  	.bind =		unix_bind,
916  	.connect =	unix_dgram_connect,
917  	.socketpair =	unix_socketpair,
918  	.accept =	sock_no_accept,
919  	.getname =	unix_getname,
920  	.poll =		unix_dgram_poll,
921  	.ioctl =	unix_ioctl,
922  #ifdef CONFIG_COMPAT
923  	.compat_ioctl =	unix_compat_ioctl,
924  #endif
925  	.listen =	sock_no_listen,
926  	.shutdown =	unix_shutdown,
927  	.sendmsg =	unix_dgram_sendmsg,
928  	.read_skb =	unix_read_skb,
929  	.recvmsg =	unix_dgram_recvmsg,
930  	.mmap =		sock_no_mmap,
931  	.set_peek_off =	sk_set_peek_off,
932  	.show_fdinfo =	unix_show_fdinfo,
933  };
934  
935  static const struct proto_ops unix_seqpacket_ops = {
936  	.family =	PF_UNIX,
937  	.owner =	THIS_MODULE,
938  	.release =	unix_release,
939  	.bind =		unix_bind,
940  	.connect =	unix_stream_connect,
941  	.socketpair =	unix_socketpair,
942  	.accept =	unix_accept,
943  	.getname =	unix_getname,
944  	.poll =		unix_dgram_poll,
945  	.ioctl =	unix_ioctl,
946  #ifdef CONFIG_COMPAT
947  	.compat_ioctl =	unix_compat_ioctl,
948  #endif
949  	.listen =	unix_listen,
950  	.shutdown =	unix_shutdown,
951  	.sendmsg =	unix_seqpacket_sendmsg,
952  	.recvmsg =	unix_seqpacket_recvmsg,
953  	.mmap =		sock_no_mmap,
954  	.set_peek_off =	sk_set_peek_off,
955  	.show_fdinfo =	unix_show_fdinfo,
956  };
957  
958  static void unix_close(struct sock *sk, long timeout)
959  {
960  	/* Nothing to do here, unix socket does not need a ->close().
961  	 * This is merely for sockmap.
962  	 */
963  }
964  
965  static void unix_unhash(struct sock *sk)
966  {
967  	/* Nothing to do here, unix socket does not need a ->unhash().
968  	 * This is merely for sockmap.
969  	 */
970  }
971  
972  static bool unix_bpf_bypass_getsockopt(int level, int optname)
973  {
974  	if (level == SOL_SOCKET) {
975  		switch (optname) {
976  		case SO_PEERPIDFD:
977  			return true;
978  		default:
979  			return false;
980  		}
981  	}
982  
983  	return false;
984  }
985  
986  struct proto unix_dgram_proto = {
987  	.name			= "UNIX",
988  	.owner			= THIS_MODULE,
989  	.obj_size		= sizeof(struct unix_sock),
990  	.close			= unix_close,
991  	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
992  #ifdef CONFIG_BPF_SYSCALL
993  	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
994  #endif
995  };
996  
997  struct proto unix_stream_proto = {
998  	.name			= "UNIX-STREAM",
999  	.owner			= THIS_MODULE,
1000  	.obj_size		= sizeof(struct unix_sock),
1001  	.close			= unix_close,
1002  	.unhash			= unix_unhash,
1003  	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1004  #ifdef CONFIG_BPF_SYSCALL
1005  	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1006  #endif
1007  };
1008  
1009  static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1010  {
1011  	struct unix_sock *u;
1012  	struct sock *sk;
1013  	int err;
1014  
1015  	atomic_long_inc(&unix_nr_socks);
1016  	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1017  		err = -ENFILE;
1018  		goto err;
1019  	}
1020  
1021  	if (type == SOCK_STREAM)
1022  		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1023  	else /*dgram and  seqpacket */
1024  		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1025  
1026  	if (!sk) {
1027  		err = -ENOMEM;
1028  		goto err;
1029  	}
1030  
1031  	sock_init_data(sock, sk);
1032  
1033  	sk->sk_hash		= unix_unbound_hash(sk);
1034  	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1035  	sk->sk_write_space	= unix_write_space;
1036  	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1037  	sk->sk_destruct		= unix_sock_destructor;
1038  	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1039  
1040  	u = unix_sk(sk);
1041  	u->listener = NULL;
1042  	u->vertex = NULL;
1043  	u->path.dentry = NULL;
1044  	u->path.mnt = NULL;
1045  	spin_lock_init(&u->lock);
1046  	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1047  	mutex_init(&u->iolock); /* single task reading lock */
1048  	mutex_init(&u->bindlock); /* single task binding lock */
1049  	init_waitqueue_head(&u->peer_wait);
1050  	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1051  	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1052  	unix_insert_unbound_socket(net, sk);
1053  
1054  	sock_prot_inuse_add(net, sk->sk_prot, 1);
1055  
1056  	return sk;
1057  
1058  err:
1059  	atomic_long_dec(&unix_nr_socks);
1060  	return ERR_PTR(err);
1061  }
1062  
1063  static int unix_create(struct net *net, struct socket *sock, int protocol,
1064  		       int kern)
1065  {
1066  	struct sock *sk;
1067  
1068  	if (protocol && protocol != PF_UNIX)
1069  		return -EPROTONOSUPPORT;
1070  
1071  	sock->state = SS_UNCONNECTED;
1072  
1073  	switch (sock->type) {
1074  	case SOCK_STREAM:
1075  		sock->ops = &unix_stream_ops;
1076  		break;
1077  		/*
1078  		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1079  		 *	nothing uses it.
1080  		 */
1081  	case SOCK_RAW:
1082  		sock->type = SOCK_DGRAM;
1083  		fallthrough;
1084  	case SOCK_DGRAM:
1085  		sock->ops = &unix_dgram_ops;
1086  		break;
1087  	case SOCK_SEQPACKET:
1088  		sock->ops = &unix_seqpacket_ops;
1089  		break;
1090  	default:
1091  		return -ESOCKTNOSUPPORT;
1092  	}
1093  
1094  	sk = unix_create1(net, sock, kern, sock->type);
1095  	if (IS_ERR(sk))
1096  		return PTR_ERR(sk);
1097  
1098  	return 0;
1099  }
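/* Illustrative userspace sketch (not part of the kernel build): the SOCK_RAW
 * fallthrough above means the call below succeeds and yields an ordinary
 * datagram socket.
 *
 *	int fd = socket(AF_UNIX, SOCK_RAW, 0);
 *	int type;
 *	socklen_t len = sizeof(type);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_TYPE, &type, &len);
 *	// type == SOCK_DGRAM
 */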
1100  
1101  static int unix_release(struct socket *sock)
1102  {
1103  	struct sock *sk = sock->sk;
1104  
1105  	if (!sk)
1106  		return 0;
1107  
1108  	sk->sk_prot->close(sk, 0);
1109  	unix_release_sock(sk, 0);
1110  	sock->sk = NULL;
1111  
1112  	return 0;
1113  }
1114  
1115  static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1116  				  int type)
1117  {
1118  	struct inode *inode;
1119  	struct path path;
1120  	struct sock *sk;
1121  	int err;
1122  
1123  	unix_mkname_bsd(sunaddr, addr_len);
1124  	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1125  	if (err)
1126  		goto fail;
1127  
1128  	err = path_permission(&path, MAY_WRITE);
1129  	if (err)
1130  		goto path_put;
1131  
1132  	err = -ECONNREFUSED;
1133  	inode = d_backing_inode(path.dentry);
1134  	if (!S_ISSOCK(inode->i_mode))
1135  		goto path_put;
1136  
1137  	sk = unix_find_socket_byinode(inode);
1138  	if (!sk)
1139  		goto path_put;
1140  
1141  	err = -EPROTOTYPE;
1142  	if (sk->sk_type == type)
1143  		touch_atime(&path);
1144  	else
1145  		goto sock_put;
1146  
1147  	path_put(&path);
1148  
1149  	return sk;
1150  
1151  sock_put:
1152  	sock_put(sk);
1153  path_put:
1154  	path_put(&path);
1155  fail:
1156  	return ERR_PTR(err);
1157  }
1158  
1159  static struct sock *unix_find_abstract(struct net *net,
1160  				       struct sockaddr_un *sunaddr,
1161  				       int addr_len, int type)
1162  {
1163  	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1164  	struct dentry *dentry;
1165  	struct sock *sk;
1166  
1167  	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1168  	if (!sk)
1169  		return ERR_PTR(-ECONNREFUSED);
1170  
1171  	dentry = unix_sk(sk)->path.dentry;
1172  	if (dentry)
1173  		touch_atime(&unix_sk(sk)->path);
1174  
1175  	return sk;
1176  }
1177  
1178  static struct sock *unix_find_other(struct net *net,
1179  				    struct sockaddr_un *sunaddr,
1180  				    int addr_len, int type)
1181  {
1182  	struct sock *sk;
1183  
1184  	if (sunaddr->sun_path[0])
1185  		sk = unix_find_bsd(sunaddr, addr_len, type);
1186  	else
1187  		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1188  
1189  	return sk;
1190  }
1191  
1192  static int unix_autobind(struct sock *sk)
1193  {
1194  	struct unix_sock *u = unix_sk(sk);
1195  	unsigned int new_hash, old_hash;
1196  	struct net *net = sock_net(sk);
1197  	struct unix_address *addr;
1198  	u32 lastnum, ordernum;
1199  	int err;
1200  
1201  	err = mutex_lock_interruptible(&u->bindlock);
1202  	if (err)
1203  		return err;
1204  
1205  	if (u->addr)
1206  		goto out;
1207  
1208  	err = -ENOMEM;
1209  	addr = kzalloc(sizeof(*addr) +
1210  		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1211  	if (!addr)
1212  		goto out;
1213  
1214  	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1215  	addr->name->sun_family = AF_UNIX;
1216  	refcount_set(&addr->refcnt, 1);
1217  
1218  	old_hash = sk->sk_hash;
1219  	ordernum = get_random_u32();
1220  	lastnum = ordernum & 0xFFFFF;
1221  retry:
1222  	ordernum = (ordernum + 1) & 0xFFFFF;
1223  	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1224  
1225  	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1226  	unix_table_double_lock(net, old_hash, new_hash);
1227  
1228  	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1229  		unix_table_double_unlock(net, old_hash, new_hash);
1230  
1231  		/* __unix_find_socket_byname() may take a long time if many names
1232  		 * are already in use.
1233  		 */
1234  		cond_resched();
1235  
1236  		if (ordernum == lastnum) {
1237  			/* Give up if all names seem to be in use. */
1238  			err = -ENOSPC;
1239  			unix_release_addr(addr);
1240  			goto out;
1241  		}
1242  
1243  		goto retry;
1244  	}
1245  
1246  	__unix_set_addr_hash(net, sk, addr, new_hash);
1247  	unix_table_double_unlock(net, old_hash, new_hash);
1248  	err = 0;
1249  
1250  out:	mutex_unlock(&u->bindlock);
1251  	return err;
1252  }
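/* Illustrative userspace sketch (the "4f2a1" value is hypothetical): after
 * autobind, getsockname() reports an abstract name made of a leading NUL
 * byte plus the five hex digits chosen by the loop above.
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct sockaddr_un a;
 *	socklen_t len = sizeof(a);
 *
 *	bind(fd, &(struct sockaddr){ .sa_family = AF_UNIX },
 *	     sizeof(sa_family_t));			// zero-length name
 *	getsockname(fd, (struct sockaddr *)&a, &len);
 *	// len == offsetof(struct sockaddr_un, sun_path) + 6
 *	// a.sun_path[0] == '\0', a.sun_path[1..5] e.g. "4f2a1"
 */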
1253  
1254  static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1255  			 int addr_len)
1256  {
1257  	umode_t mode = S_IFSOCK |
1258  	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1259  	struct unix_sock *u = unix_sk(sk);
1260  	unsigned int new_hash, old_hash;
1261  	struct net *net = sock_net(sk);
1262  	struct mnt_idmap *idmap;
1263  	struct unix_address *addr;
1264  	struct dentry *dentry;
1265  	struct path parent;
1266  	int err;
1267  
1268  	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1269  	addr = unix_create_addr(sunaddr, addr_len);
1270  	if (!addr)
1271  		return -ENOMEM;
1272  
1273  	/*
1274  	 * Get the parent directory, calculate the hash for last
1275  	 * component.
1276  	 */
1277  	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1278  	if (IS_ERR(dentry)) {
1279  		err = PTR_ERR(dentry);
1280  		goto out;
1281  	}
1282  
1283  	/*
1284  	 * All right, let's create it.
1285  	 */
1286  	idmap = mnt_idmap(parent.mnt);
1287  	err = security_path_mknod(&parent, dentry, mode, 0);
1288  	if (!err)
1289  		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1290  	if (err)
1291  		goto out_path;
1292  	err = mutex_lock_interruptible(&u->bindlock);
1293  	if (err)
1294  		goto out_unlink;
1295  	if (u->addr)
1296  		goto out_unlock;
1297  
1298  	old_hash = sk->sk_hash;
1299  	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1300  	unix_table_double_lock(net, old_hash, new_hash);
1301  	u->path.mnt = mntget(parent.mnt);
1302  	u->path.dentry = dget(dentry);
1303  	__unix_set_addr_hash(net, sk, addr, new_hash);
1304  	unix_table_double_unlock(net, old_hash, new_hash);
1305  	unix_insert_bsd_socket(sk);
1306  	mutex_unlock(&u->bindlock);
1307  	done_path_create(&parent, dentry);
1308  	return 0;
1309  
1310  out_unlock:
1311  	mutex_unlock(&u->bindlock);
1312  	err = -EINVAL;
1313  out_unlink:
1314  	/* failed after successful mknod?  unlink what we'd created... */
1315  	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1316  out_path:
1317  	done_path_create(&parent, dentry);
1318  out:
1319  	unix_release_addr(addr);
1320  	return err == -EEXIST ? -EADDRINUSE : err;
1321  }
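/* Illustrative userspace sketch (hypothetical path): binding to a pathname
 * creates a socket inode via vfs_mknod() above, so a stale path has to be
 * unlinked before it can be reused; otherwise bind() fails with EADDRINUSE
 * (mapped from -EEXIST at the end of unix_bind_bsd()).
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	strcpy(a.sun_path, "/run/example.sock");
 *	unlink(a.sun_path);				// drop any stale inode
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));	// mknod S_IFSOCK
 */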
1322  
1323  static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1324  			      int addr_len)
1325  {
1326  	struct unix_sock *u = unix_sk(sk);
1327  	unsigned int new_hash, old_hash;
1328  	struct net *net = sock_net(sk);
1329  	struct unix_address *addr;
1330  	int err;
1331  
1332  	addr = unix_create_addr(sunaddr, addr_len);
1333  	if (!addr)
1334  		return -ENOMEM;
1335  
1336  	err = mutex_lock_interruptible(&u->bindlock);
1337  	if (err)
1338  		goto out;
1339  
1340  	if (u->addr) {
1341  		err = -EINVAL;
1342  		goto out_mutex;
1343  	}
1344  
1345  	old_hash = sk->sk_hash;
1346  	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1347  	unix_table_double_lock(net, old_hash, new_hash);
1348  
1349  	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1350  		goto out_spin;
1351  
1352  	__unix_set_addr_hash(net, sk, addr, new_hash);
1353  	unix_table_double_unlock(net, old_hash, new_hash);
1354  	mutex_unlock(&u->bindlock);
1355  	return 0;
1356  
1357  out_spin:
1358  	unix_table_double_unlock(net, old_hash, new_hash);
1359  	err = -EADDRINUSE;
1360  out_mutex:
1361  	mutex_unlock(&u->bindlock);
1362  out:
1363  	unix_release_addr(addr);
1364  	return err;
1365  }
1366  
1367  static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1368  {
1369  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1370  	struct sock *sk = sock->sk;
1371  	int err;
1372  
1373  	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1374  	    sunaddr->sun_family == AF_UNIX)
1375  		return unix_autobind(sk);
1376  
1377  	err = unix_validate_addr(sunaddr, addr_len);
1378  	if (err)
1379  		return err;
1380  
1381  	if (sunaddr->sun_path[0])
1382  		err = unix_bind_bsd(sk, sunaddr, addr_len);
1383  	else
1384  		err = unix_bind_abstract(sk, sunaddr, addr_len);
1385  
1386  	return err;
1387  }
1388  
1389  static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1390  {
1391  	if (unlikely(sk1 == sk2) || !sk2) {
1392  		unix_state_lock(sk1);
1393  		return;
1394  	}
1395  
1396  	if (sk1 > sk2)
1397  		swap(sk1, sk2);
1398  
1399  	unix_state_lock(sk1);
1400  	unix_state_lock(sk2);
1401  }
1402  
1403  static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1404  {
1405  	if (unlikely(sk1 == sk2) || !sk2) {
1406  		unix_state_unlock(sk1);
1407  		return;
1408  	}
1409  	unix_state_unlock(sk1);
1410  	unix_state_unlock(sk2);
1411  }
1412  
1413  static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1414  			      int alen, int flags)
1415  {
1416  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1417  	struct sock *sk = sock->sk;
1418  	struct sock *other;
1419  	int err;
1420  
1421  	err = -EINVAL;
1422  	if (alen < offsetofend(struct sockaddr, sa_family))
1423  		goto out;
1424  
1425  	if (addr->sa_family != AF_UNSPEC) {
1426  		err = unix_validate_addr(sunaddr, alen);
1427  		if (err)
1428  			goto out;
1429  
1430  		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1431  		if (err)
1432  			goto out;
1433  
1434  		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1435  		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1436  		    !READ_ONCE(unix_sk(sk)->addr)) {
1437  			err = unix_autobind(sk);
1438  			if (err)
1439  				goto out;
1440  		}
1441  
1442  restart:
1443  		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1444  		if (IS_ERR(other)) {
1445  			err = PTR_ERR(other);
1446  			goto out;
1447  		}
1448  
1449  		unix_state_double_lock(sk, other);
1450  
1451  		/* Apparently VFS overslept socket death. Retry. */
1452  		if (sock_flag(other, SOCK_DEAD)) {
1453  			unix_state_double_unlock(sk, other);
1454  			sock_put(other);
1455  			goto restart;
1456  		}
1457  
1458  		err = -EPERM;
1459  		if (!unix_may_send(sk, other))
1460  			goto out_unlock;
1461  
1462  		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1463  		if (err)
1464  			goto out_unlock;
1465  
1466  		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1467  		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1468  	} else {
1469  		/*
1470  		 *	1003.1g breaking connected state with AF_UNSPEC
1471  		 */
1472  		other = NULL;
1473  		unix_state_double_lock(sk, other);
1474  	}
1475  
1476  	/*
1477  	 * If it was connected, reconnect.
1478  	 */
1479  	if (unix_peer(sk)) {
1480  		struct sock *old_peer = unix_peer(sk);
1481  
1482  		unix_peer(sk) = other;
1483  		if (!other)
1484  			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1485  		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1486  
1487  		unix_state_double_unlock(sk, other);
1488  
1489  		if (other != old_peer) {
1490  			unix_dgram_disconnected(sk, old_peer);
1491  
1492  			unix_state_lock(old_peer);
1493  			if (!unix_peer(old_peer))
1494  				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1495  			unix_state_unlock(old_peer);
1496  		}
1497  
1498  		sock_put(old_peer);
1499  	} else {
1500  		unix_peer(sk) = other;
1501  		unix_state_double_unlock(sk, other);
1502  	}
1503  
1504  	return 0;
1505  
1506  out_unlock:
1507  	unix_state_double_unlock(sk, other);
1508  	sock_put(other);
1509  out:
1510  	return err;
1511  }
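/* Illustrative userspace sketch: the AF_UNSPEC branch above implements the
 * POSIX way of dissolving a datagram association.
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));	// drops the current peer; the socket
 *					// goes back to TCP_CLOSE
 */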
1512  
1513  static long unix_wait_for_peer(struct sock *other, long timeo)
1514  	__releases(&unix_sk(other)->lock)
1515  {
1516  	struct unix_sock *u = unix_sk(other);
1517  	int sched;
1518  	DEFINE_WAIT(wait);
1519  
1520  	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1521  
1522  	sched = !sock_flag(other, SOCK_DEAD) &&
1523  		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1524  		unix_recvq_full_lockless(other);
1525  
1526  	unix_state_unlock(other);
1527  
1528  	if (sched)
1529  		timeo = schedule_timeout(timeo);
1530  
1531  	finish_wait(&u->peer_wait, &wait);
1532  	return timeo;
1533  }
1534  
1535  static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1536  			       int addr_len, int flags)
1537  {
1538  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1539  	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1540  	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1541  	struct net *net = sock_net(sk);
1542  	struct sk_buff *skb = NULL;
1543  	unsigned char state;
1544  	long timeo;
1545  	int err;
1546  
1547  	err = unix_validate_addr(sunaddr, addr_len);
1548  	if (err)
1549  		goto out;
1550  
1551  	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1552  	if (err)
1553  		goto out;
1554  
1555  	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1556  	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1557  	    !READ_ONCE(u->addr)) {
1558  		err = unix_autobind(sk);
1559  		if (err)
1560  			goto out;
1561  	}
1562  
1563  	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1564  
1565  	/* First of all allocate resources.
1566  	   If we do it after the state is locked,
1567  	   we will have to recheck everything again in any case.
1568  	 */
1569  
1570  	/* create new sock for complete connection */
1571  	newsk = unix_create1(net, NULL, 0, sock->type);
1572  	if (IS_ERR(newsk)) {
1573  		err = PTR_ERR(newsk);
1574  		newsk = NULL;
1575  		goto out;
1576  	}
1577  
1578  	err = -ENOMEM;
1579  
1580  	/* Allocate skb for sending to listening sock */
1581  	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1582  	if (skb == NULL)
1583  		goto out;
1584  
1585  restart:
1586  	/*  Find listening sock. */
1587  	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1588  	if (IS_ERR(other)) {
1589  		err = PTR_ERR(other);
1590  		other = NULL;
1591  		goto out;
1592  	}
1593  
1594  	unix_state_lock(other);
1595  
1596  	/* Apparently VFS overslept socket death. Retry. */
1597  	if (sock_flag(other, SOCK_DEAD)) {
1598  		unix_state_unlock(other);
1599  		sock_put(other);
1600  		goto restart;
1601  	}
1602  
1603  	err = -ECONNREFUSED;
1604  	if (other->sk_state != TCP_LISTEN)
1605  		goto out_unlock;
1606  	if (other->sk_shutdown & RCV_SHUTDOWN)
1607  		goto out_unlock;
1608  
1609  	if (unix_recvq_full_lockless(other)) {
1610  		err = -EAGAIN;
1611  		if (!timeo)
1612  			goto out_unlock;
1613  
1614  		timeo = unix_wait_for_peer(other, timeo);
1615  
1616  		err = sock_intr_errno(timeo);
1617  		if (signal_pending(current))
1618  			goto out;
1619  		sock_put(other);
1620  		goto restart;
1621  	}
1622  
1623  	/* self connect and simultaneous connect are eliminated
1624  	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1625  	 */
1626  	state = READ_ONCE(sk->sk_state);
1627  	if (unlikely(state != TCP_CLOSE)) {
1628  		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1629  		goto out_unlock;
1630  	}
1631  
1632  	unix_state_lock(sk);
1633  
1634  	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1635  		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1636  		unix_state_unlock(sk);
1637  		goto out_unlock;
1638  	}
1639  
1640  	err = security_unix_stream_connect(sk, other, newsk);
1641  	if (err) {
1642  		unix_state_unlock(sk);
1643  		goto out_unlock;
1644  	}
1645  
1646  	/* The way is open! Fastly set all the necessary fields... */
1647  
1648  	sock_hold(sk);
1649  	unix_peer(newsk)	= sk;
1650  	newsk->sk_state		= TCP_ESTABLISHED;
1651  	newsk->sk_type		= sk->sk_type;
1652  	init_peercred(newsk);
1653  	newu = unix_sk(newsk);
1654  	newu->listener = other;
1655  	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1656  	otheru = unix_sk(other);
1657  
1658  	/* copy address information from listening to new sock
1659  	 *
1660  	 * The contents of *(otheru->addr) and otheru->path
1661  	 * are seen fully set up here, since we have found
1662  	 * otheru in hash under its lock.  Insertion into the
1663  	 * hash chain we'd found it in had been done in an
1664  	 * earlier critical area protected by the chain's lock,
1665  	 * the same one where we'd set *(otheru->addr) contents,
1666  	 * as well as otheru->path and otheru->addr itself.
1667  	 *
1668  	 * Using smp_store_release() here to set newu->addr
1669  	 * is enough to make those stores, as well as stores
1670  	 * to newu->path visible to anyone who gets newu->addr
1671  	 * by smp_load_acquire().  IOW, the same warranties
1672  	 * as for unix_sock instances bound in unix_bind() or
1673  	 * in unix_autobind().
1674  	 */
1675  	if (otheru->path.dentry) {
1676  		path_get(&otheru->path);
1677  		newu->path = otheru->path;
1678  	}
1679  	refcount_inc(&otheru->addr->refcnt);
1680  	smp_store_release(&newu->addr, otheru->addr);
1681  
1682  	/* Set credentials */
1683  	copy_peercred(sk, other);
1684  
1685  	sock->state	= SS_CONNECTED;
1686  	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1687  	sock_hold(newsk);
1688  
1689  	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1690  	unix_peer(sk)	= newsk;
1691  
1692  	unix_state_unlock(sk);
1693  
1694  	/* take ten and send info to listening sock */
1695  	spin_lock(&other->sk_receive_queue.lock);
1696  	__skb_queue_tail(&other->sk_receive_queue, skb);
1697  	spin_unlock(&other->sk_receive_queue.lock);
1698  	unix_state_unlock(other);
1699  	other->sk_data_ready(other);
1700  	sock_put(other);
1701  	return 0;
1702  
1703  out_unlock:
1704  	if (other)
1705  		unix_state_unlock(other);
1706  
1707  out:
1708  	kfree_skb(skb);
1709  	if (newsk)
1710  		unix_release_sock(newsk, 0);
1711  	if (other)
1712  		sock_put(other);
1713  	return err;
1714  }
1715  
1716  static int unix_socketpair(struct socket *socka, struct socket *sockb)
1717  {
1718  	struct sock *ska = socka->sk, *skb = sockb->sk;
1719  
1720  	/* Join our sockets back to back */
1721  	sock_hold(ska);
1722  	sock_hold(skb);
1723  	unix_peer(ska) = skb;
1724  	unix_peer(skb) = ska;
1725  	init_peercred(ska);
1726  	init_peercred(skb);
1727  
1728  	ska->sk_state = TCP_ESTABLISHED;
1729  	skb->sk_state = TCP_ESTABLISHED;
1730  	socka->state  = SS_CONNECTED;
1731  	sockb->state  = SS_CONNECTED;
1732  	return 0;
1733  }
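/* Illustrative userspace sketch: socketpair() ends up here and simply wires
 * the two sockets back to back, so both ends are connected immediately.
 *
 *	int sv[2];
 *	char buf[8];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	write(sv[0], "ping", 4);
 *	read(sv[1], buf, sizeof(buf));	// reads "ping"
 */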
1734  
1735  static void unix_sock_inherit_flags(const struct socket *old,
1736  				    struct socket *new)
1737  {
1738  	if (test_bit(SOCK_PASSCRED, &old->flags))
1739  		set_bit(SOCK_PASSCRED, &new->flags);
1740  	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1741  		set_bit(SOCK_PASSPIDFD, &new->flags);
1742  	if (test_bit(SOCK_PASSSEC, &old->flags))
1743  		set_bit(SOCK_PASSSEC, &new->flags);
1744  }
1745  
1746  static int unix_accept(struct socket *sock, struct socket *newsock,
1747  		       struct proto_accept_arg *arg)
1748  {
1749  	struct sock *sk = sock->sk;
1750  	struct sk_buff *skb;
1751  	struct sock *tsk;
1752  
1753  	arg->err = -EOPNOTSUPP;
1754  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1755  		goto out;
1756  
1757  	arg->err = -EINVAL;
1758  	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1759  		goto out;
1760  
1761  	/* If socket state is TCP_LISTEN it cannot change (for now...),
1762  	 * so that no locks are necessary.
1763  	 */
1764  
1765  	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1766  				&arg->err);
1767  	if (!skb) {
1768  		/* This means receive shutdown. */
1769  		if (arg->err == 0)
1770  			arg->err = -EINVAL;
1771  		goto out;
1772  	}
1773  
1774  	tsk = skb->sk;
1775  	skb_free_datagram(sk, skb);
1776  	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1777  
1778  	/* attach accepted sock to socket */
1779  	unix_state_lock(tsk);
1780  	unix_update_edges(unix_sk(tsk));
1781  	newsock->state = SS_CONNECTED;
1782  	unix_sock_inherit_flags(sock, newsock);
1783  	sock_graft(tsk, newsock);
1784  	unix_state_unlock(tsk);
1785  	return 0;
1786  
1787  out:
1788  	return arg->err;
1789  }
1790  
1791  
1792  static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1793  {
1794  	struct sock *sk = sock->sk;
1795  	struct unix_address *addr;
1796  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1797  	int err = 0;
1798  
1799  	if (peer) {
1800  		sk = unix_peer_get(sk);
1801  
1802  		err = -ENOTCONN;
1803  		if (!sk)
1804  			goto out;
1805  		err = 0;
1806  	} else {
1807  		sock_hold(sk);
1808  	}
1809  
1810  	addr = smp_load_acquire(&unix_sk(sk)->addr);
1811  	if (!addr) {
1812  		sunaddr->sun_family = AF_UNIX;
1813  		sunaddr->sun_path[0] = 0;
1814  		err = offsetof(struct sockaddr_un, sun_path);
1815  	} else {
1816  		err = addr->len;
1817  		memcpy(sunaddr, addr->name, addr->len);
1818  
1819  		if (peer)
1820  			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1821  					       CGROUP_UNIX_GETPEERNAME);
1822  		else
1823  			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1824  					       CGROUP_UNIX_GETSOCKNAME);
1825  	}
1826  	sock_put(sk);
1827  out:
1828  	return err;
1829  }
1830  
1831  /* The "user->unix_inflight" variable is protected by the garbage
1832   * collection lock, and we just read it locklessly here. If you go
1833   * over the limit, there might be a tiny race in actually noticing
1834   * it across threads. Tough.
1835   */
1836  static inline bool too_many_unix_fds(struct task_struct *p)
1837  {
1838  	struct user_struct *user = current_user();
1839  
1840  	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1841  		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1842  	return false;
1843  }
1844  
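/* Move the passed file references from the scm cookie into the skb's
 * control block so that they travel with the message.  Fails if the
 * sending user already has too many descriptors in flight.
 */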
1845  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1846  {
1847  	if (too_many_unix_fds(current))
1848  		return -ETOOMANYREFS;
1849  
1850  	UNIXCB(skb).fp = scm->fp;
1851  	scm->fp = NULL;
1852  
1853  	if (unix_prepare_fpl(UNIXCB(skb).fp))
1854  		return -ENOMEM;
1855  
1856  	return 0;
1857  }
1858  
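/* Reverse of unix_attach_fds(): hand the file references attached to
 * @skb back to the scm cookie and drop the garbage-collector state
 * that was set up for them.
 */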
1859  static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1860  {
1861  	scm->fp = UNIXCB(skb).fp;
1862  	UNIXCB(skb).fp = NULL;
1863  
1864  	unix_destroy_fpl(scm->fp);
1865  }
1866  
1867  static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1868  {
1869  	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1870  }
1871  
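/* skb destructor for AF_UNIX messages: release the attached pid, file
 * references and security data, then drop the send-buffer accounting.
 */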
1872  static void unix_destruct_scm(struct sk_buff *skb)
1873  {
1874  	struct scm_cookie scm;
1875  
1876  	memset(&scm, 0, sizeof(scm));
1877  	scm.pid  = UNIXCB(skb).pid;
1878  	if (UNIXCB(skb).fp)
1879  		unix_detach_fds(&scm, skb);
1880  
1881  	/* Alas, this ends up calling into the VFS. */
1882  	/* So what? fput() has been SMP-safe for a very long time now. */
1883  	scm_destroy(&scm);
1884  	sock_wfree(skb);
1885  }
1886  
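/* Stash the sender's credentials, security data and (optionally) the
 * passed file descriptors in the skb's control block, and install the
 * destructor that will release them.
 */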
1887  static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1888  {
1889  	int err = 0;
1890  
1891  	UNIXCB(skb).pid  = get_pid(scm->pid);
1892  	UNIXCB(skb).uid = scm->creds.uid;
1893  	UNIXCB(skb).gid = scm->creds.gid;
1894  	UNIXCB(skb).fp = NULL;
1895  	unix_get_secdata(scm, skb);
1896  	if (scm->fp && send_fds)
1897  		err = unix_attach_fds(scm, skb);
1898  
1899  	skb->destructor = unix_destruct_scm;
1900  	return err;
1901  }
1902  
1903  static bool unix_passcred_enabled(const struct socket *sock,
1904  				  const struct sock *other)
1905  {
1906  	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1907  	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1908  	       !other->sk_socket ||
1909  	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1910  	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1911  }
1912  
1913  /*
1914   * Some apps rely on write() giving SCM_CREDENTIALS.
1915   * We include credentials if the source or destination socket
1916   * asserted SOCK_PASSCRED.
1917   */
1918  static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1919  			    const struct sock *other)
1920  {
1921  	if (UNIXCB(skb).pid)
1922  		return;
1923  	if (unix_passcred_enabled(sock, other)) {
1924  		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1925  		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1926  	}
1927  }
1928  
1929  static bool unix_skb_scm_eq(struct sk_buff *skb,
1930  			    struct scm_cookie *scm)
1931  {
1932  	return UNIXCB(skb).pid == scm->pid &&
1933  	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1934  	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1935  	       unix_secdata_eq(scm, skb);
1936  }
1937  
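/* scm_stat_add()/scm_stat_del() keep the per-socket count of queued
 * fds (u->scm_stat.nr_fds) and the garbage-collector edges in sync as
 * skbs carrying file descriptors are queued and dequeued.
 */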
1938  static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1939  {
1940  	struct scm_fp_list *fp = UNIXCB(skb).fp;
1941  	struct unix_sock *u = unix_sk(sk);
1942  
1943  	if (unlikely(fp && fp->count)) {
1944  		atomic_add(fp->count, &u->scm_stat.nr_fds);
1945  		unix_add_edges(fp, u);
1946  	}
1947  }
1948  
1949  static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1950  {
1951  	struct scm_fp_list *fp = UNIXCB(skb).fp;
1952  	struct unix_sock *u = unix_sk(sk);
1953  
1954  	if (unlikely(fp && fp->count)) {
1955  		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1956  		unix_del_edges(fp);
1957  	}
1958  }
1959  
1960  /*
1961   *	Send AF_UNIX data.
1962   */
1963  
1964  static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1965  			      size_t len)
1966  {
1967  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1968  	struct sock *sk = sock->sk, *other = NULL;
1969  	struct unix_sock *u = unix_sk(sk);
1970  	struct scm_cookie scm;
1971  	struct sk_buff *skb;
1972  	int data_len = 0;
1973  	int sk_locked;
1974  	long timeo;
1975  	int err;
1976  
1977  	err = scm_send(sock, msg, &scm, false);
1978  	if (err < 0)
1979  		return err;
1980  
1981  	wait_for_unix_gc(scm.fp);
1982  
1983  	err = -EOPNOTSUPP;
1984  	if (msg->msg_flags&MSG_OOB)
1985  		goto out;
1986  
1987  	if (msg->msg_namelen) {
1988  		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1989  		if (err)
1990  			goto out;
1991  
1992  		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1993  							    msg->msg_name,
1994  							    &msg->msg_namelen,
1995  							    NULL);
1996  		if (err)
1997  			goto out;
1998  	} else {
1999  		sunaddr = NULL;
2000  		err = -ENOTCONN;
2001  		other = unix_peer_get(sk);
2002  		if (!other)
2003  			goto out;
2004  	}
2005  
2006  	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2007  	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
2008  	    !READ_ONCE(u->addr)) {
2009  		err = unix_autobind(sk);
2010  		if (err)
2011  			goto out;
2012  	}
2013  
2014  	err = -EMSGSIZE;
2015  	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
2016  		goto out;
2017  
2018  	if (len > SKB_MAX_ALLOC) {
2019  		data_len = min_t(size_t,
2020  				 len - SKB_MAX_ALLOC,
2021  				 MAX_SKB_FRAGS * PAGE_SIZE);
2022  		data_len = PAGE_ALIGN(data_len);
2023  
2024  		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2025  	}
2026  
2027  	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2028  				   msg->msg_flags & MSG_DONTWAIT, &err,
2029  				   PAGE_ALLOC_COSTLY_ORDER);
2030  	if (skb == NULL)
2031  		goto out;
2032  
2033  	err = unix_scm_to_skb(&scm, skb, true);
2034  	if (err < 0)
2035  		goto out_free;
2036  
2037  	skb_put(skb, len - data_len);
2038  	skb->data_len = data_len;
2039  	skb->len = len;
2040  	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2041  	if (err)
2042  		goto out_free;
2043  
2044  	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2045  
2046  restart:
2047  	if (!other) {
2048  		err = -ECONNRESET;
2049  		if (sunaddr == NULL)
2050  			goto out_free;
2051  
2052  		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2053  					sk->sk_type);
2054  		if (IS_ERR(other)) {
2055  			err = PTR_ERR(other);
2056  			other = NULL;
2057  			goto out_free;
2058  		}
2059  	}
2060  
2061  	if (sk_filter(other, skb) < 0) {
2062  		/* Toss the packet but do not return any error to the sender */
2063  		err = len;
2064  		goto out_free;
2065  	}
2066  
2067  	sk_locked = 0;
2068  	unix_state_lock(other);
2069  restart_locked:
2070  	err = -EPERM;
2071  	if (!unix_may_send(sk, other))
2072  		goto out_unlock;
2073  
2074  	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2075  		/*
2076  		 *	Check with POSIX 1003.1g - what should a
2077  		 *	datagram error return here?
2078  		 */
2079  		unix_state_unlock(other);
2080  		sock_put(other);
2081  
2082  		if (!sk_locked)
2083  			unix_state_lock(sk);
2084  
2085  		err = 0;
2086  		if (sk->sk_type == SOCK_SEQPACKET) {
2087  			/* We get here only when racing with unix_release_sock(),
2088  			 * which is clearing @other. Never change the state to
2089  			 * TCP_CLOSE here, unlike the SOCK_DGRAM case below.
2090  			 */
2091  			unix_state_unlock(sk);
2092  			err = -EPIPE;
2093  		} else if (unix_peer(sk) == other) {
2094  			unix_peer(sk) = NULL;
2095  			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2096  
2097  			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2098  			unix_state_unlock(sk);
2099  
2100  			unix_dgram_disconnected(sk, other);
2101  			sock_put(other);
2102  			err = -ECONNREFUSED;
2103  		} else {
2104  			unix_state_unlock(sk);
2105  		}
2106  
2107  		other = NULL;
2108  		if (err)
2109  			goto out_free;
2110  		goto restart;
2111  	}
2112  
2113  	err = -EPIPE;
2114  	if (other->sk_shutdown & RCV_SHUTDOWN)
2115  		goto out_unlock;
2116  
2117  	if (sk->sk_type != SOCK_SEQPACKET) {
2118  		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2119  		if (err)
2120  			goto out_unlock;
2121  	}
2122  
2123  	/* other == sk && unix_peer(other) != sk if
2124  	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2125  	 * - unix_peer(sk) == sk at lookup time, but it disconnected before the lock
2126  	 */
2127  	if (other != sk &&
2128  	    unlikely(unix_peer(other) != sk &&
2129  	    unix_recvq_full_lockless(other))) {
2130  		if (timeo) {
2131  			timeo = unix_wait_for_peer(other, timeo);
2132  
2133  			err = sock_intr_errno(timeo);
2134  			if (signal_pending(current))
2135  				goto out_free;
2136  
2137  			goto restart;
2138  		}
2139  
2140  		if (!sk_locked) {
2141  			unix_state_unlock(other);
2142  			unix_state_double_lock(sk, other);
2143  		}
2144  
2145  		if (unix_peer(sk) != other ||
2146  		    unix_dgram_peer_wake_me(sk, other)) {
2147  			err = -EAGAIN;
2148  			sk_locked = 1;
2149  			goto out_unlock;
2150  		}
2151  
2152  		if (!sk_locked) {
2153  			sk_locked = 1;
2154  			goto restart_locked;
2155  		}
2156  	}
2157  
2158  	if (unlikely(sk_locked))
2159  		unix_state_unlock(sk);
2160  
2161  	if (sock_flag(other, SOCK_RCVTSTAMP))
2162  		__net_timestamp(skb);
2163  	maybe_add_creds(skb, sock, other);
2164  	scm_stat_add(other, skb);
2165  	skb_queue_tail(&other->sk_receive_queue, skb);
2166  	unix_state_unlock(other);
2167  	other->sk_data_ready(other);
2168  	sock_put(other);
2169  	scm_destroy(&scm);
2170  	return len;
2171  
2172  out_unlock:
2173  	if (sk_locked)
2174  		unix_state_unlock(sk);
2175  	unix_state_unlock(other);
2176  out_free:
2177  	kfree_skb(skb);
2178  out:
2179  	if (other)
2180  		sock_put(other);
2181  	scm_destroy(&scm);
2182  	return err;
2183  }
2184  
2185  /* We use paged skbs for stream sockets, and limit occupancy to 32768
2186   * bytes, with a minimum of a full page.
2187   */
2188  #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2189  
2190  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
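/* Send the single out-of-band byte: allocate a one-byte skb, queue it
 * on the peer's receive queue and remember it as the peer's oob_skb so
 * the receiver can locate the mark.
 */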
2191  static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2192  		     struct scm_cookie *scm, bool fds_sent)
2193  {
2194  	struct unix_sock *ousk = unix_sk(other);
2195  	struct sk_buff *skb;
2196  	int err = 0;
2197  
2198  	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2199  
2200  	if (!skb)
2201  		return err;
2202  
2203  	err = unix_scm_to_skb(scm, skb, !fds_sent);
2204  	if (err < 0) {
2205  		kfree_skb(skb);
2206  		return err;
2207  	}
2208  	skb_put(skb, 1);
2209  	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2210  
2211  	if (err) {
2212  		kfree_skb(skb);
2213  		return err;
2214  	}
2215  
2216  	unix_state_lock(other);
2217  
2218  	if (sock_flag(other, SOCK_DEAD) ||
2219  	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2220  		unix_state_unlock(other);
2221  		kfree_skb(skb);
2222  		return -EPIPE;
2223  	}
2224  
2225  	maybe_add_creds(skb, sock, other);
2226  	scm_stat_add(other, skb);
2227  
2228  	spin_lock(&other->sk_receive_queue.lock);
2229  	WRITE_ONCE(ousk->oob_skb, skb);
2230  	__skb_queue_tail(&other->sk_receive_queue, skb);
2231  	spin_unlock(&other->sk_receive_queue.lock);
2232  
2233  	sk_send_sigurg(other);
2234  	unix_state_unlock(other);
2235  	other->sk_data_ready(other);
2236  
2237  	return err;
2238  }
2239  #endif
2240  
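/* Stream send: chop the data into skbs, attach any passed fds to the
 * first one only, and queue the skbs directly on the peer's receive
 * queue.
 */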
2241  static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2242  			       size_t len)
2243  {
2244  	struct sock *sk = sock->sk;
2245  	struct sock *other = NULL;
2246  	int err, size;
2247  	struct sk_buff *skb;
2248  	int sent = 0;
2249  	struct scm_cookie scm;
2250  	bool fds_sent = false;
2251  	int data_len;
2252  
2253  	err = scm_send(sock, msg, &scm, false);
2254  	if (err < 0)
2255  		return err;
2256  
2257  	wait_for_unix_gc(scm.fp);
2258  
2259  	err = -EOPNOTSUPP;
2260  	if (msg->msg_flags & MSG_OOB) {
2261  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2262  		if (len)
2263  			len--;
2264  		else
2265  #endif
2266  			goto out_err;
2267  	}
2268  
2269  	if (msg->msg_namelen) {
2270  		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2271  		goto out_err;
2272  	} else {
2273  		err = -ENOTCONN;
2274  		other = unix_peer(sk);
2275  		if (!other)
2276  			goto out_err;
2277  	}
2278  
2279  	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2280  		goto pipe_err;
2281  
2282  	while (sent < len) {
2283  		size = len - sent;
2284  
2285  		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2286  			skb = sock_alloc_send_pskb(sk, 0, 0,
2287  						   msg->msg_flags & MSG_DONTWAIT,
2288  						   &err, 0);
2289  		} else {
2290  			/* Keep two messages in the pipe so it schedules better */
2291  			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2292  
2293  			/* allow fallback to order-0 allocations */
2294  			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2295  
2296  			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2297  
2298  			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2299  
2300  			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2301  						   msg->msg_flags & MSG_DONTWAIT, &err,
2302  						   get_order(UNIX_SKB_FRAGS_SZ));
2303  		}
2304  		if (!skb)
2305  			goto out_err;
2306  
2307  		/* Only send the fds in the first buffer */
2308  		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2309  		if (err < 0) {
2310  			kfree_skb(skb);
2311  			goto out_err;
2312  		}
2313  		fds_sent = true;
2314  
2315  		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2316  			skb->ip_summed = CHECKSUM_UNNECESSARY;
2317  			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2318  						   sk->sk_allocation);
2319  			if (err < 0) {
2320  				kfree_skb(skb);
2321  				goto out_err;
2322  			}
2323  			size = err;
2324  			refcount_add(size, &sk->sk_wmem_alloc);
2325  		} else {
2326  			skb_put(skb, size - data_len);
2327  			skb->data_len = data_len;
2328  			skb->len = size;
2329  			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2330  			if (err) {
2331  				kfree_skb(skb);
2332  				goto out_err;
2333  			}
2334  		}
2335  
2336  		unix_state_lock(other);
2337  
2338  		if (sock_flag(other, SOCK_DEAD) ||
2339  		    (other->sk_shutdown & RCV_SHUTDOWN))
2340  			goto pipe_err_free;
2341  
2342  		maybe_add_creds(skb, sock, other);
2343  		scm_stat_add(other, skb);
2344  		skb_queue_tail(&other->sk_receive_queue, skb);
2345  		unix_state_unlock(other);
2346  		other->sk_data_ready(other);
2347  		sent += size;
2348  	}
2349  
2350  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2351  	if (msg->msg_flags & MSG_OOB) {
2352  		err = queue_oob(sock, msg, other, &scm, fds_sent);
2353  		if (err)
2354  			goto out_err;
2355  		sent++;
2356  	}
2357  #endif
2358  
2359  	scm_destroy(&scm);
2360  
2361  	return sent;
2362  
2363  pipe_err_free:
2364  	unix_state_unlock(other);
2365  	kfree_skb(skb);
2366  pipe_err:
2367  	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2368  		send_sig(SIGPIPE, current, 0);
2369  	err = -EPIPE;
2370  out_err:
2371  	scm_destroy(&scm);
2372  	return sent ? : err;
2373  }
2374  
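/* SOCK_SEQPACKET send: the socket must already be connected; any
 * destination address is ignored and the datagram path does the rest.
 */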
2375  static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2376  				  size_t len)
2377  {
2378  	int err;
2379  	struct sock *sk = sock->sk;
2380  
2381  	err = sock_error(sk);
2382  	if (err)
2383  		return err;
2384  
2385  	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2386  		return -ENOTCONN;
2387  
2388  	if (msg->msg_namelen)
2389  		msg->msg_namelen = 0;
2390  
2391  	return unix_dgram_sendmsg(sock, msg, len);
2392  }
2393  
2394  static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2395  				  size_t size, int flags)
2396  {
2397  	struct sock *sk = sock->sk;
2398  
2399  	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2400  		return -ENOTCONN;
2401  
2402  	return unix_dgram_recvmsg(sock, msg, size, flags);
2403  }
2404  
2405  static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2406  {
2407  	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2408  
2409  	if (addr) {
2410  		msg->msg_namelen = addr->len;
2411  		memcpy(msg->msg_name, addr->name, addr->len);
2412  	}
2413  }
2414  
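/* Datagram receive: dequeue (or peek at) one message and copy its data
 * and ancillary information (credentials, fds, security label) to the
 * caller.
 */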
2415  int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2416  			 int flags)
2417  {
2418  	struct scm_cookie scm;
2419  	struct socket *sock = sk->sk_socket;
2420  	struct unix_sock *u = unix_sk(sk);
2421  	struct sk_buff *skb, *last;
2422  	long timeo;
2423  	int skip;
2424  	int err;
2425  
2426  	err = -EOPNOTSUPP;
2427  	if (flags&MSG_OOB)
2428  		goto out;
2429  
2430  	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2431  
2432  	do {
2433  		mutex_lock(&u->iolock);
2434  
2435  		skip = sk_peek_offset(sk, flags);
2436  		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2437  					      &skip, &err, &last);
2438  		if (skb) {
2439  			if (!(flags & MSG_PEEK))
2440  				scm_stat_del(sk, skb);
2441  			break;
2442  		}
2443  
2444  		mutex_unlock(&u->iolock);
2445  
2446  		if (err != -EAGAIN)
2447  			break;
2448  	} while (timeo &&
2449  		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2450  					      &err, &timeo, last));
2451  
2452  	if (!skb) { /* implies iolock unlocked */
2453  		unix_state_lock(sk);
2454  		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2455  		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2456  		    (sk->sk_shutdown & RCV_SHUTDOWN))
2457  			err = 0;
2458  		unix_state_unlock(sk);
2459  		goto out;
2460  	}
2461  
2462  	if (wq_has_sleeper(&u->peer_wait))
2463  		wake_up_interruptible_sync_poll(&u->peer_wait,
2464  						EPOLLOUT | EPOLLWRNORM |
2465  						EPOLLWRBAND);
2466  
2467  	if (msg->msg_name) {
2468  		unix_copy_addr(msg, skb->sk);
2469  
2470  		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2471  						      msg->msg_name,
2472  						      &msg->msg_namelen);
2473  	}
2474  
2475  	if (size > skb->len - skip)
2476  		size = skb->len - skip;
2477  	else if (size < skb->len - skip)
2478  		msg->msg_flags |= MSG_TRUNC;
2479  
2480  	err = skb_copy_datagram_msg(skb, skip, msg, size);
2481  	if (err)
2482  		goto out_free;
2483  
2484  	if (sock_flag(sk, SOCK_RCVTSTAMP))
2485  		__sock_recv_timestamp(msg, sk, skb);
2486  
2487  	memset(&scm, 0, sizeof(scm));
2488  
2489  	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2490  	unix_set_secdata(&scm, skb);
2491  
2492  	if (!(flags & MSG_PEEK)) {
2493  		if (UNIXCB(skb).fp)
2494  			unix_detach_fds(&scm, skb);
2495  
2496  		sk_peek_offset_bwd(sk, skb->len);
2497  	} else {
2498  		/* It is questionable: on PEEK we could:
2499  		   - not return fds - good, but too simple 8)
2500  		   - return fds, and not return them on read (the old strategy,
2501  		     apparently wrong)
2502  		   - clone fds (chosen for now, as it is the most universal
2503  		     solution)
2504  
2505  		   POSIX 1003.1g does not actually define this clearly
2506  		   at all - but then POSIX 1003.1g doesn't define a lot of
2507  		   things clearly!
2508  
2509  		*/
2510  
2511  		sk_peek_offset_fwd(sk, size);
2512  
2513  		if (UNIXCB(skb).fp)
2514  			unix_peek_fds(&scm, skb);
2515  	}
2516  	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2517  
2518  	scm_recv_unix(sock, msg, &scm, flags);
2519  
2520  out_free:
2521  	skb_free_datagram(sk, skb);
2522  	mutex_unlock(&u->iolock);
2523  out:
2524  	return err;
2525  }
2526  
2527  static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2528  			      int flags)
2529  {
2530  	struct sock *sk = sock->sk;
2531  
2532  #ifdef CONFIG_BPF_SYSCALL
2533  	const struct proto *prot = READ_ONCE(sk->sk_prot);
2534  
2535  	if (prot != &unix_dgram_proto)
2536  		return prot->recvmsg(sk, msg, size, flags, NULL);
2537  #endif
2538  	return __unix_dgram_recvmsg(sk, msg, size, flags);
2539  }
2540  
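/* Dequeue a single datagram without blocking and hand it to
 * @recv_actor.
 */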
2541  static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2542  {
2543  	struct unix_sock *u = unix_sk(sk);
2544  	struct sk_buff *skb;
2545  	int err;
2546  
2547  	mutex_lock(&u->iolock);
2548  	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2549  	mutex_unlock(&u->iolock);
2550  	if (!skb)
2551  		return err;
2552  
2553  	return recv_actor(sk, skb);
2554  }
2555  
2556  /*
2557   *	Sleep until more data has arrived. But check for races..
2558   */
2559  static long unix_stream_data_wait(struct sock *sk, long timeo,
2560  				  struct sk_buff *last, unsigned int last_len,
2561  				  bool freezable)
2562  {
2563  	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2564  	struct sk_buff *tail;
2565  	DEFINE_WAIT(wait);
2566  
2567  	unix_state_lock(sk);
2568  
2569  	for (;;) {
2570  		prepare_to_wait(sk_sleep(sk), &wait, state);
2571  
2572  		tail = skb_peek_tail(&sk->sk_receive_queue);
2573  		if (tail != last ||
2574  		    (tail && tail->len != last_len) ||
2575  		    sk->sk_err ||
2576  		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2577  		    signal_pending(current) ||
2578  		    !timeo)
2579  			break;
2580  
2581  		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2582  		unix_state_unlock(sk);
2583  		timeo = schedule_timeout(timeo);
2584  		unix_state_lock(sk);
2585  
2586  		if (sock_flag(sk, SOCK_DEAD))
2587  			break;
2588  
2589  		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2590  	}
2591  
2592  	finish_wait(sk_sleep(sk), &wait);
2593  	unix_state_unlock(sk);
2594  	return timeo;
2595  }
2596  
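/* Bytes of @skb that the reader has not consumed yet. */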
2597  static unsigned int unix_skb_len(const struct sk_buff *skb)
2598  {
2599  	return skb->len - UNIXCB(skb).consumed;
2600  }
2601  
2602  struct unix_stream_read_state {
2603  	int (*recv_actor)(struct sk_buff *, int, int,
2604  			  struct unix_stream_read_state *);
2605  	struct socket *socket;
2606  	struct msghdr *msg;
2607  	struct pipe_inode_info *pipe;
2608  	size_t size;
2609  	int flags;
2610  	unsigned int splice_flags;
2611  };
2612  
2613  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
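/* MSG_OOB receive: hand the pending out-of-band byte to the caller.
 * Only valid while an OOB byte is queued and SO_OOBINLINE is off.
 */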
2614  static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2615  {
2616  	struct socket *sock = state->socket;
2617  	struct sock *sk = sock->sk;
2618  	struct unix_sock *u = unix_sk(sk);
2619  	int chunk = 1;
2620  	struct sk_buff *oob_skb;
2621  
2622  	mutex_lock(&u->iolock);
2623  	unix_state_lock(sk);
2624  	spin_lock(&sk->sk_receive_queue.lock);
2625  
2626  	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2627  		spin_unlock(&sk->sk_receive_queue.lock);
2628  		unix_state_unlock(sk);
2629  		mutex_unlock(&u->iolock);
2630  		return -EINVAL;
2631  	}
2632  
2633  	oob_skb = u->oob_skb;
2634  
2635  	if (!(state->flags & MSG_PEEK))
2636  		WRITE_ONCE(u->oob_skb, NULL);
2637  
2638  	spin_unlock(&sk->sk_receive_queue.lock);
2639  	unix_state_unlock(sk);
2640  
2641  	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2642  
2643  	if (!(state->flags & MSG_PEEK))
2644  		UNIXCB(oob_skb).consumed += 1;
2645  
2646  	mutex_unlock(&u->iolock);
2647  
2648  	if (chunk < 0)
2649  		return -EFAULT;
2650  
2651  	state->msg->msg_flags |= MSG_OOB;
2652  	return 1;
2653  }
2654  
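/* Decide how a stream read should treat the out-of-band skb: stop the
 * current read at the OOB boundary if data has already been copied,
 * otherwise skip it (or unlink it when it is not to be received
 * inline).
 */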
2655  static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2656  				  int flags, int copied)
2657  {
2658  	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2659  	struct unix_sock *u = unix_sk(sk);
2660  
2661  	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2662  		return skb;
2663  
2664  	spin_lock(&sk->sk_receive_queue.lock);
2665  
2666  	if (!unix_skb_len(skb)) {
2667  		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2668  			skb = NULL;
2669  		} else if (flags & MSG_PEEK) {
2670  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2671  		} else {
2672  			read_skb = skb;
2673  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2674  			__skb_unlink(read_skb, &sk->sk_receive_queue);
2675  		}
2676  
2677  		if (!skb)
2678  			goto unlock;
2679  	}
2680  
2681  	if (skb != u->oob_skb)
2682  		goto unlock;
2683  
2684  	if (copied) {
2685  		skb = NULL;
2686  	} else if (!(flags & MSG_PEEK)) {
2687  		WRITE_ONCE(u->oob_skb, NULL);
2688  
2689  		if (!sock_flag(sk, SOCK_URGINLINE)) {
2690  			__skb_unlink(skb, &sk->sk_receive_queue);
2691  			unread_skb = skb;
2692  			skb = skb_peek(&sk->sk_receive_queue);
2693  		}
2694  	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2695  		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2696  	}
2697  
2698  unlock:
2699  	spin_unlock(&sk->sk_receive_queue.lock);
2700  
2701  	consume_skb(read_skb);
2702  	kfree_skb(unread_skb);
2703  
2704  	return skb;
2705  }
2706  #endif
2707  
2708  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2709  {
2710  	struct unix_sock *u = unix_sk(sk);
2711  	struct sk_buff *skb;
2712  	int err;
2713  
2714  	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2715  		return -ENOTCONN;
2716  
2717  	mutex_lock(&u->iolock);
2718  	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2719  	mutex_unlock(&u->iolock);
2720  	if (!skb)
2721  		return err;
2722  
2723  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2724  	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2725  		bool drop = false;
2726  
2727  		unix_state_lock(sk);
2728  
2729  		if (sock_flag(sk, SOCK_DEAD)) {
2730  			unix_state_unlock(sk);
2731  			kfree_skb(skb);
2732  			return -ECONNRESET;
2733  		}
2734  
2735  		spin_lock(&sk->sk_receive_queue.lock);
2736  		if (likely(skb == u->oob_skb)) {
2737  			WRITE_ONCE(u->oob_skb, NULL);
2738  			drop = true;
2739  		}
2740  		spin_unlock(&sk->sk_receive_queue.lock);
2741  
2742  		unix_state_unlock(sk);
2743  
2744  		if (drop) {
2745  			kfree_skb(skb);
2746  			return -EAGAIN;
2747  		}
2748  	}
2749  #endif
2750  
2751  	return recv_actor(sk, skb);
2752  }
2753  
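/* Common receive loop for stream sockets; the actual data transfer is
 * delegated to state->recv_actor (copy to a msghdr or splice to a
 * pipe).
 */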
2754  static int unix_stream_read_generic(struct unix_stream_read_state *state,
2755  				    bool freezable)
2756  {
2757  	struct scm_cookie scm;
2758  	struct socket *sock = state->socket;
2759  	struct sock *sk = sock->sk;
2760  	struct unix_sock *u = unix_sk(sk);
2761  	int copied = 0;
2762  	int flags = state->flags;
2763  	int noblock = flags & MSG_DONTWAIT;
2764  	bool check_creds = false;
2765  	int target;
2766  	int err = 0;
2767  	long timeo;
2768  	int skip;
2769  	size_t size = state->size;
2770  	unsigned int last_len;
2771  
2772  	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2773  		err = -EINVAL;
2774  		goto out;
2775  	}
2776  
2777  	if (unlikely(flags & MSG_OOB)) {
2778  		err = -EOPNOTSUPP;
2779  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2780  		err = unix_stream_recv_urg(state);
2781  #endif
2782  		goto out;
2783  	}
2784  
2785  	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2786  	timeo = sock_rcvtimeo(sk, noblock);
2787  
2788  	memset(&scm, 0, sizeof(scm));
2789  
2790  	/* Lock the socket to prevent queue disordering
2791  	 * while we sleep copying data out to the caller.
2792  	 */
2793  	mutex_lock(&u->iolock);
2794  
2795  	skip = max(sk_peek_offset(sk, flags), 0);
2796  
2797  	do {
2798  		struct sk_buff *skb, *last;
2799  		int chunk;
2800  
2801  redo:
2802  		unix_state_lock(sk);
2803  		if (sock_flag(sk, SOCK_DEAD)) {
2804  			err = -ECONNRESET;
2805  			goto unlock;
2806  		}
2807  		last = skb = skb_peek(&sk->sk_receive_queue);
2808  		last_len = last ? last->len : 0;
2809  
2810  again:
2811  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2812  		if (skb) {
2813  			skb = manage_oob(skb, sk, flags, copied);
2814  			if (!skb && copied) {
2815  				unix_state_unlock(sk);
2816  				break;
2817  			}
2818  		}
2819  #endif
2820  		if (skb == NULL) {
2821  			if (copied >= target)
2822  				goto unlock;
2823  
2824  			/*
2825  			 *	POSIX 1003.1g mandates this order.
2826  			 */
2827  
2828  			err = sock_error(sk);
2829  			if (err)
2830  				goto unlock;
2831  			if (sk->sk_shutdown & RCV_SHUTDOWN)
2832  				goto unlock;
2833  
2834  			unix_state_unlock(sk);
2835  			if (!timeo) {
2836  				err = -EAGAIN;
2837  				break;
2838  			}
2839  
2840  			mutex_unlock(&u->iolock);
2841  
2842  			timeo = unix_stream_data_wait(sk, timeo, last,
2843  						      last_len, freezable);
2844  
2845  			if (signal_pending(current)) {
2846  				err = sock_intr_errno(timeo);
2847  				scm_destroy(&scm);
2848  				goto out;
2849  			}
2850  
2851  			mutex_lock(&u->iolock);
2852  			goto redo;
2853  unlock:
2854  			unix_state_unlock(sk);
2855  			break;
2856  		}
2857  
2858  		while (skip >= unix_skb_len(skb)) {
2859  			skip -= unix_skb_len(skb);
2860  			last = skb;
2861  			last_len = skb->len;
2862  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2863  			if (!skb)
2864  				goto again;
2865  		}
2866  
2867  		unix_state_unlock(sk);
2868  
2869  		if (check_creds) {
2870  			/* Never glue messages from different writers */
2871  			if (!unix_skb_scm_eq(skb, &scm))
2872  				break;
2873  		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2874  			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2875  			/* Copy credentials */
2876  			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2877  			unix_set_secdata(&scm, skb);
2878  			check_creds = true;
2879  		}
2880  
2881  		/* Copy address just once */
2882  		if (state->msg && state->msg->msg_name) {
2883  			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2884  					 state->msg->msg_name);
2885  			unix_copy_addr(state->msg, skb->sk);
2886  
2887  			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2888  							      state->msg->msg_name,
2889  							      &state->msg->msg_namelen);
2890  
2891  			sunaddr = NULL;
2892  		}
2893  
2894  		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2895  		chunk = state->recv_actor(skb, skip, chunk, state);
2896  		if (chunk < 0) {
2897  			if (copied == 0)
2898  				copied = -EFAULT;
2899  			break;
2900  		}
2901  		copied += chunk;
2902  		size -= chunk;
2903  
2904  		/* Mark read part of skb as used */
2905  		if (!(flags & MSG_PEEK)) {
2906  			UNIXCB(skb).consumed += chunk;
2907  
2908  			sk_peek_offset_bwd(sk, chunk);
2909  
2910  			if (UNIXCB(skb).fp) {
2911  				scm_stat_del(sk, skb);
2912  				unix_detach_fds(&scm, skb);
2913  			}
2914  
2915  			if (unix_skb_len(skb))
2916  				break;
2917  
2918  			skb_unlink(skb, &sk->sk_receive_queue);
2919  			consume_skb(skb);
2920  
2921  			if (scm.fp)
2922  				break;
2923  		} else {
2924  			/* It is questionable; see the note in unix_dgram_recvmsg().
2925  			 */
2926  			if (UNIXCB(skb).fp)
2927  				unix_peek_fds(&scm, skb);
2928  
2929  			sk_peek_offset_fwd(sk, chunk);
2930  
2931  			if (UNIXCB(skb).fp)
2932  				break;
2933  
2934  			skip = 0;
2935  			last = skb;
2936  			last_len = skb->len;
2937  			unix_state_lock(sk);
2938  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2939  			if (skb)
2940  				goto again;
2941  			unix_state_unlock(sk);
2942  			break;
2943  		}
2944  	} while (size);
2945  
2946  	mutex_unlock(&u->iolock);
2947  	if (state->msg)
2948  		scm_recv_unix(sock, state->msg, &scm, flags);
2949  	else
2950  		scm_destroy(&scm);
2951  out:
2952  	return copied ? : err;
2953  }
2954  
2955  static int unix_stream_read_actor(struct sk_buff *skb,
2956  				  int skip, int chunk,
2957  				  struct unix_stream_read_state *state)
2958  {
2959  	int ret;
2960  
2961  	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2962  				    state->msg, chunk);
2963  	return ret ?: chunk;
2964  }
2965  
2966  int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2967  			  size_t size, int flags)
2968  {
2969  	struct unix_stream_read_state state = {
2970  		.recv_actor = unix_stream_read_actor,
2971  		.socket = sk->sk_socket,
2972  		.msg = msg,
2973  		.size = size,
2974  		.flags = flags
2975  	};
2976  
2977  	return unix_stream_read_generic(&state, true);
2978  }
2979  
2980  static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2981  			       size_t size, int flags)
2982  {
2983  	struct unix_stream_read_state state = {
2984  		.recv_actor = unix_stream_read_actor,
2985  		.socket = sock,
2986  		.msg = msg,
2987  		.size = size,
2988  		.flags = flags
2989  	};
2990  
2991  #ifdef CONFIG_BPF_SYSCALL
2992  	struct sock *sk = sock->sk;
2993  	const struct proto *prot = READ_ONCE(sk->sk_prot);
2994  
2995  	if (prot != &unix_stream_proto)
2996  		return prot->recvmsg(sk, msg, size, flags, NULL);
2997  #endif
2998  	return unix_stream_read_generic(&state, true);
2999  }
3000  
3001  static int unix_stream_splice_actor(struct sk_buff *skb,
3002  				    int skip, int chunk,
3003  				    struct unix_stream_read_state *state)
3004  {
3005  	return skb_splice_bits(skb, state->socket->sk,
3006  			       UNIXCB(skb).consumed + skip,
3007  			       state->pipe, chunk, state->splice_flags);
3008  }
3009  
3010  static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3011  				       struct pipe_inode_info *pipe,
3012  				       size_t size, unsigned int flags)
3013  {
3014  	struct unix_stream_read_state state = {
3015  		.recv_actor = unix_stream_splice_actor,
3016  		.socket = sock,
3017  		.pipe = pipe,
3018  		.size = size,
3019  		.splice_flags = flags,
3020  	};
3021  
3022  	if (unlikely(*ppos))
3023  		return -ESPIPE;
3024  
3025  	if (sock->file->f_flags & O_NONBLOCK ||
3026  	    flags & SPLICE_F_NONBLOCK)
3027  		state.flags = MSG_DONTWAIT;
3028  
3029  	return unix_stream_read_generic(&state, false);
3030  }
3031  
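/* shutdown(2): mark this end as shut down and, for connection oriented
 * sockets, mirror the opposite direction on the peer, then wake anyone
 * waiting on either socket.
 */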
3032  static int unix_shutdown(struct socket *sock, int mode)
3033  {
3034  	struct sock *sk = sock->sk;
3035  	struct sock *other;
3036  
3037  	if (mode < SHUT_RD || mode > SHUT_RDWR)
3038  		return -EINVAL;
3039  	/* This maps:
3040  	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3041  	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3042  	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3043  	 */
3044  	++mode;
3045  
3046  	unix_state_lock(sk);
3047  	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3048  	other = unix_peer(sk);
3049  	if (other)
3050  		sock_hold(other);
3051  	unix_state_unlock(sk);
3052  	sk->sk_state_change(sk);
3053  
3054  	if (other &&
3055  		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3056  
3057  		int peer_mode = 0;
3058  		const struct proto *prot = READ_ONCE(other->sk_prot);
3059  
3060  		if (prot->unhash)
3061  			prot->unhash(other);
3062  		if (mode&RCV_SHUTDOWN)
3063  			peer_mode |= SEND_SHUTDOWN;
3064  		if (mode&SEND_SHUTDOWN)
3065  			peer_mode |= RCV_SHUTDOWN;
3066  		unix_state_lock(other);
3067  		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3068  		unix_state_unlock(other);
3069  		other->sk_state_change(other);
3070  		if (peer_mode == SHUTDOWN_MASK)
3071  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3072  		else if (peer_mode & RCV_SHUTDOWN)
3073  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3074  	}
3075  	if (other)
3076  		sock_put(other);
3077  
3078  	return 0;
3079  }
3080  
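/* SIOCINQ: bytes available for reading - the whole receive queue for
 * stream/seqpacket sockets, only the next datagram for SOCK_DGRAM.
 */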
3081  long unix_inq_len(struct sock *sk)
3082  {
3083  	struct sk_buff *skb;
3084  	long amount = 0;
3085  
3086  	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3087  		return -EINVAL;
3088  
3089  	spin_lock(&sk->sk_receive_queue.lock);
3090  	if (sk->sk_type == SOCK_STREAM ||
3091  	    sk->sk_type == SOCK_SEQPACKET) {
3092  		skb_queue_walk(&sk->sk_receive_queue, skb)
3093  			amount += unix_skb_len(skb);
3094  	} else {
3095  		skb = skb_peek(&sk->sk_receive_queue);
3096  		if (skb)
3097  			amount = skb->len;
3098  	}
3099  	spin_unlock(&sk->sk_receive_queue.lock);
3100  
3101  	return amount;
3102  }
3103  EXPORT_SYMBOL_GPL(unix_inq_len);
3104  
3105  long unix_outq_len(struct sock *sk)
3106  {
3107  	return sk_wmem_alloc_get(sk);
3108  }
3109  EXPORT_SYMBOL_GPL(unix_outq_len);
3110  
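/* SIOCUNIXFILE: return a new O_PATH file descriptor referring to the
 * filesystem object this socket is bound to.  Requires CAP_NET_ADMIN
 * over the socket's network namespace.
 */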
3111  static int unix_open_file(struct sock *sk)
3112  {
3113  	struct path path;
3114  	struct file *f;
3115  	int fd;
3116  
3117  	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3118  		return -EPERM;
3119  
3120  	if (!smp_load_acquire(&unix_sk(sk)->addr))
3121  		return -ENOENT;
3122  
3123  	path = unix_sk(sk)->path;
3124  	if (!path.dentry)
3125  		return -ENOENT;
3126  
3127  	path_get(&path);
3128  
3129  	fd = get_unused_fd_flags(O_CLOEXEC);
3130  	if (fd < 0)
3131  		goto out;
3132  
3133  	f = dentry_open(&path, O_PATH, current_cred());
3134  	if (IS_ERR(f)) {
3135  		put_unused_fd(fd);
3136  		fd = PTR_ERR(f);
3137  		goto out;
3138  	}
3139  
3140  	fd_install(fd, f);
3141  out:
3142  	path_put(&path);
3143  
3144  	return fd;
3145  }
3146  
3147  static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3148  {
3149  	struct sock *sk = sock->sk;
3150  	long amount = 0;
3151  	int err;
3152  
3153  	switch (cmd) {
3154  	case SIOCOUTQ:
3155  		amount = unix_outq_len(sk);
3156  		err = put_user(amount, (int __user *)arg);
3157  		break;
3158  	case SIOCINQ:
3159  		amount = unix_inq_len(sk);
3160  		if (amount < 0)
3161  			err = amount;
3162  		else
3163  			err = put_user(amount, (int __user *)arg);
3164  		break;
3165  	case SIOCUNIXFILE:
3166  		err = unix_open_file(sk);
3167  		break;
3168  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3169  	case SIOCATMARK:
3170  		{
3171  			struct unix_sock *u = unix_sk(sk);
3172  			struct sk_buff *skb;
3173  			int answ = 0;
3174  
3175  			mutex_lock(&u->iolock);
3176  
3177  			skb = skb_peek(&sk->sk_receive_queue);
3178  			if (skb) {
3179  				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3180  				struct sk_buff *next_skb;
3181  
3182  				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3183  
3184  				if (skb == oob_skb ||
3185  				    (!unix_skb_len(skb) &&
3186  				     (!oob_skb || next_skb == oob_skb)))
3187  					answ = 1;
3188  			}
3189  
3190  			mutex_unlock(&u->iolock);
3191  
3192  			err = put_user(answ, (int __user *)arg);
3193  		}
3194  		break;
3195  #endif
3196  	default:
3197  		err = -ENOIOCTLCMD;
3198  		break;
3199  	}
3200  	return err;
3201  }
3202  
3203  #ifdef CONFIG_COMPAT
3204  static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3205  {
3206  	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3207  }
3208  #endif
3209  
3210  static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3211  {
3212  	struct sock *sk = sock->sk;
3213  	unsigned char state;
3214  	__poll_t mask;
3215  	u8 shutdown;
3216  
3217  	sock_poll_wait(file, sock, wait);
3218  	mask = 0;
3219  	shutdown = READ_ONCE(sk->sk_shutdown);
3220  	state = READ_ONCE(sk->sk_state);
3221  
3222  	/* exceptional events? */
3223  	if (READ_ONCE(sk->sk_err))
3224  		mask |= EPOLLERR;
3225  	if (shutdown == SHUTDOWN_MASK)
3226  		mask |= EPOLLHUP;
3227  	if (shutdown & RCV_SHUTDOWN)
3228  		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3229  
3230  	/* readable? */
3231  	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3232  		mask |= EPOLLIN | EPOLLRDNORM;
3233  	if (sk_is_readable(sk))
3234  		mask |= EPOLLIN | EPOLLRDNORM;
3235  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3236  	if (READ_ONCE(unix_sk(sk)->oob_skb))
3237  		mask |= EPOLLPRI;
3238  #endif
3239  
3240  	/* Connection-based sockets need to check for termination and startup */
3241  	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3242  	    state == TCP_CLOSE)
3243  		mask |= EPOLLHUP;
3244  
3245  	/*
3246  	 * We also report the socket as writable when the other side has
3247  	 * shut down the connection. This prevents stuck sockets.
3248  	 */
3249  	if (unix_writable(sk, state))
3250  		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3251  
3252  	return mask;
3253  }
3254  
3255  static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3256  				    poll_table *wait)
3257  {
3258  	struct sock *sk = sock->sk, *other;
3259  	unsigned int writable;
3260  	unsigned char state;
3261  	__poll_t mask;
3262  	u8 shutdown;
3263  
3264  	sock_poll_wait(file, sock, wait);
3265  	mask = 0;
3266  	shutdown = READ_ONCE(sk->sk_shutdown);
3267  	state = READ_ONCE(sk->sk_state);
3268  
3269  	/* exceptional events? */
3270  	if (READ_ONCE(sk->sk_err) ||
3271  	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3272  		mask |= EPOLLERR |
3273  			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3274  
3275  	if (shutdown & RCV_SHUTDOWN)
3276  		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3277  	if (shutdown == SHUTDOWN_MASK)
3278  		mask |= EPOLLHUP;
3279  
3280  	/* readable? */
3281  	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3282  		mask |= EPOLLIN | EPOLLRDNORM;
3283  	if (sk_is_readable(sk))
3284  		mask |= EPOLLIN | EPOLLRDNORM;
3285  
3286  	/* Connection-based sockets need to check for termination and startup */
3287  	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3288  		mask |= EPOLLHUP;
3289  
3290  	/* No write status requested, avoid expensive OUT tests. */
3291  	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3292  		return mask;
3293  
3294  	writable = unix_writable(sk, state);
3295  	if (writable) {
3296  		unix_state_lock(sk);
3297  
3298  		other = unix_peer(sk);
3299  		if (other && unix_peer(other) != sk &&
3300  		    unix_recvq_full_lockless(other) &&
3301  		    unix_dgram_peer_wake_me(sk, other))
3302  			writable = 0;
3303  
3304  		unix_state_unlock(sk);
3305  	}
3306  
3307  	if (writable)
3308  		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3309  	else
3310  		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3311  
3312  	return mask;
3313  }
3314  
3315  #ifdef CONFIG_PROC_FS
3316  
3317  #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3318  
3319  #define get_bucket(x) ((x) >> BUCKET_SPACE)
3320  #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3321  #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3322  
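/* /proc/net/unix iteration: *pos encodes a (hash bucket, offset) pair,
 * see get_bucket()/get_offset()/set_bucket_offset() above.
 */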
3323  static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3324  {
3325  	unsigned long offset = get_offset(*pos);
3326  	unsigned long bucket = get_bucket(*pos);
3327  	unsigned long count = 0;
3328  	struct sock *sk;
3329  
3330  	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3331  	     sk; sk = sk_next(sk)) {
3332  		if (++count == offset)
3333  			break;
3334  	}
3335  
3336  	return sk;
3337  }
3338  
3339  static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3340  {
3341  	unsigned long bucket = get_bucket(*pos);
3342  	struct net *net = seq_file_net(seq);
3343  	struct sock *sk;
3344  
3345  	while (bucket < UNIX_HASH_SIZE) {
3346  		spin_lock(&net->unx.table.locks[bucket]);
3347  
3348  		sk = unix_from_bucket(seq, pos);
3349  		if (sk)
3350  			return sk;
3351  
3352  		spin_unlock(&net->unx.table.locks[bucket]);
3353  
3354  		*pos = set_bucket_offset(++bucket, 1);
3355  	}
3356  
3357  	return NULL;
3358  }
3359  
3360  static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3361  				  loff_t *pos)
3362  {
3363  	unsigned long bucket = get_bucket(*pos);
3364  
3365  	sk = sk_next(sk);
3366  	if (sk)
3367  		return sk;
3368  
3370  	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3371  
3372  	*pos = set_bucket_offset(++bucket, 1);
3373  
3374  	return unix_get_first(seq, pos);
3375  }
3376  
3377  static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3378  {
3379  	if (!*pos)
3380  		return SEQ_START_TOKEN;
3381  
3382  	return unix_get_first(seq, pos);
3383  }
3384  
3385  static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3386  {
3387  	++*pos;
3388  
3389  	if (v == SEQ_START_TOKEN)
3390  		return unix_get_first(seq, pos);
3391  
3392  	return unix_get_next(seq, v, pos);
3393  }
3394  
3395  static void unix_seq_stop(struct seq_file *seq, void *v)
3396  {
3397  	struct sock *sk = v;
3398  
3399  	if (sk)
3400  		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3401  }
3402  
3403  static int unix_seq_show(struct seq_file *seq, void *v)
3404  {
3405  
3406  	if (v == SEQ_START_TOKEN)
3407  		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3408  			 "Inode Path\n");
3409  	else {
3410  		struct sock *s = v;
3411  		struct unix_sock *u = unix_sk(s);
3412  		unix_state_lock(s);
3413  
3414  		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3415  			s,
3416  			refcount_read(&s->sk_refcnt),
3417  			0,
3418  			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3419  			s->sk_type,
3420  			s->sk_socket ?
3421  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3422  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3423  			sock_i_ino(s));
3424  
3425  		if (u->addr) {	/* under a hash table lock here */
3426  			int i, len;
3427  			seq_putc(seq, ' ');
3428  
3429  			i = 0;
3430  			len = u->addr->len -
3431  				offsetof(struct sockaddr_un, sun_path);
3432  			if (u->addr->name->sun_path[0]) {
3433  				len--;
3434  			} else {
3435  				seq_putc(seq, '@');
3436  				i++;
3437  			}
3438  			for ( ; i < len; i++)
3439  				seq_putc(seq, u->addr->name->sun_path[i] ?:
3440  					 '@');
3441  		}
3442  		unix_state_unlock(s);
3443  		seq_putc(seq, '\n');
3444  	}
3445  
3446  	return 0;
3447  }
3448  
3449  static const struct seq_operations unix_seq_ops = {
3450  	.start  = unix_seq_start,
3451  	.next   = unix_seq_next,
3452  	.stop   = unix_seq_stop,
3453  	.show   = unix_seq_show,
3454  };
3455  
3456  #ifdef CONFIG_BPF_SYSCALL
3457  struct bpf_unix_iter_state {
3458  	struct seq_net_private p;
3459  	unsigned int cur_sk;
3460  	unsigned int end_sk;
3461  	unsigned int max_sk;
3462  	struct sock **batch;
3463  	bool st_bucket_done;
3464  };
3465  
3466  struct bpf_iter__unix {
3467  	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3468  	__bpf_md_ptr(struct unix_sock *, unix_sk);
3469  	uid_t uid __aligned(8);
3470  };
3471  
3472  static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3473  			      struct unix_sock *unix_sk, uid_t uid)
3474  {
3475  	struct bpf_iter__unix ctx;
3476  
3477  	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3478  	ctx.meta = meta;
3479  	ctx.unix_sk = unix_sk;
3480  	ctx.uid = uid;
3481  	return bpf_iter_run_prog(prog, &ctx);
3482  }
3483  
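/* Grab references on the sockets in @start_sk's hash bucket (up to the
 * current batch size) so the bucket lock can be dropped while the BPF
 * program runs.  Returns how many sockets the bucket actually held.
 */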
3484  static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3485  {
3487  	struct bpf_unix_iter_state *iter = seq->private;
3488  	unsigned int expected = 1;
3489  	struct sock *sk;
3490  
3491  	sock_hold(start_sk);
3492  	iter->batch[iter->end_sk++] = start_sk;
3493  
3494  	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3495  		if (iter->end_sk < iter->max_sk) {
3496  			sock_hold(sk);
3497  			iter->batch[iter->end_sk++] = sk;
3498  		}
3499  
3500  		expected++;
3501  	}
3502  
3503  	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3504  
3505  	return expected;
3506  }
3507  
3508  static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3509  {
3510  	while (iter->cur_sk < iter->end_sk)
3511  		sock_put(iter->batch[iter->cur_sk++]);
3512  }
3513  
3514  static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3515  				       unsigned int new_batch_sz)
3516  {
3517  	struct sock **new_batch;
3518  
3519  	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3520  			     GFP_USER | __GFP_NOWARN);
3521  	if (!new_batch)
3522  		return -ENOMEM;
3523  
3524  	bpf_iter_unix_put_batch(iter);
3525  	kvfree(iter->batch);
3526  	iter->batch = new_batch;
3527  	iter->max_sk = new_batch_sz;
3528  
3529  	return 0;
3530  }
3531  
3532  static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3533  					loff_t *pos)
3534  {
3535  	struct bpf_unix_iter_state *iter = seq->private;
3536  	unsigned int expected;
3537  	bool resized = false;
3538  	struct sock *sk;
3539  
3540  	if (iter->st_bucket_done)
3541  		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3542  
3543  again:
3544  	/* Get a new batch */
3545  	iter->cur_sk = 0;
3546  	iter->end_sk = 0;
3547  
3548  	sk = unix_get_first(seq, pos);
3549  	if (!sk)
3550  		return NULL; /* Done */
3551  
3552  	expected = bpf_iter_unix_hold_batch(seq, sk);
3553  
3554  	if (iter->end_sk == expected) {
3555  		iter->st_bucket_done = true;
3556  		return sk;
3557  	}
3558  
3559  	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3560  		resized = true;
3561  		goto again;
3562  	}
3563  
3564  	return sk;
3565  }
3566  
3567  static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3568  {
3569  	if (!*pos)
3570  		return SEQ_START_TOKEN;
3571  
3572  	/* bpf iter does not support lseek, so it always
3573  	 * continues from where it was stop()-ped.
3574  	 */
3575  	return bpf_iter_unix_batch(seq, pos);
3576  }
3577  
3578  static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3579  {
3580  	struct bpf_unix_iter_state *iter = seq->private;
3581  	struct sock *sk;
3582  
3583  	/* Whenever seq_next() is called, the socket at iter->cur_sk
3584  	 * is done with seq_show(), so advance to the next sk in
3585  	 * the batch.
3586  	 */
3587  	if (iter->cur_sk < iter->end_sk)
3588  		sock_put(iter->batch[iter->cur_sk++]);
3589  
3590  	++*pos;
3591  
3592  	if (iter->cur_sk < iter->end_sk)
3593  		sk = iter->batch[iter->cur_sk];
3594  	else
3595  		sk = bpf_iter_unix_batch(seq, pos);
3596  
3597  	return sk;
3598  }
3599  
3600  static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3601  {
3602  	struct bpf_iter_meta meta;
3603  	struct bpf_prog *prog;
3604  	struct sock *sk = v;
3605  	uid_t uid;
3606  	bool slow;
3607  	int ret;
3608  
3609  	if (v == SEQ_START_TOKEN)
3610  		return 0;
3611  
3612  	slow = lock_sock_fast(sk);
3613  
3614  	if (unlikely(sk_unhashed(sk))) {
3615  		ret = SEQ_SKIP;
3616  		goto unlock;
3617  	}
3618  
3619  	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3620  	meta.seq = seq;
3621  	prog = bpf_iter_get_info(&meta, false);
3622  	ret = unix_prog_seq_show(prog, &meta, v, uid);
3623  unlock:
3624  	unlock_sock_fast(sk, slow);
3625  	return ret;
3626  }
3627  
3628  static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3629  {
3630  	struct bpf_unix_iter_state *iter = seq->private;
3631  	struct bpf_iter_meta meta;
3632  	struct bpf_prog *prog;
3633  
3634  	if (!v) {
3635  		meta.seq = seq;
3636  		prog = bpf_iter_get_info(&meta, true);
3637  		if (prog)
3638  			(void)unix_prog_seq_show(prog, &meta, v, 0);
3639  	}
3640  
3641  	if (iter->cur_sk < iter->end_sk)
3642  		bpf_iter_unix_put_batch(iter);
3643  }
3644  
3645  static const struct seq_operations bpf_iter_unix_seq_ops = {
3646  	.start	= bpf_iter_unix_seq_start,
3647  	.next	= bpf_iter_unix_seq_next,
3648  	.stop	= bpf_iter_unix_seq_stop,
3649  	.show	= bpf_iter_unix_seq_show,
3650  };
3651  #endif
3652  #endif
3653  
3654  static const struct net_proto_family unix_family_ops = {
3655  	.family = PF_UNIX,
3656  	.create = unix_create,
3657  	.owner	= THIS_MODULE,
3658  };
3659  
3660  
3661  static int __net_init unix_net_init(struct net *net)
3662  {
3663  	int i;
3664  
3665  	net->unx.sysctl_max_dgram_qlen = 10;
3666  	if (unix_sysctl_register(net))
3667  		goto out;
3668  
3669  #ifdef CONFIG_PROC_FS
3670  	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3671  			     sizeof(struct seq_net_private)))
3672  		goto err_sysctl;
3673  #endif
3674  
3675  	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3676  					      sizeof(spinlock_t), GFP_KERNEL);
3677  	if (!net->unx.table.locks)
3678  		goto err_proc;
3679  
3680  	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3681  						sizeof(struct hlist_head),
3682  						GFP_KERNEL);
3683  	if (!net->unx.table.buckets)
3684  		goto free_locks;
3685  
3686  	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3687  		spin_lock_init(&net->unx.table.locks[i]);
3688  		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3689  		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3690  	}
3691  
3692  	return 0;
3693  
3694  free_locks:
3695  	kvfree(net->unx.table.locks);
3696  err_proc:
3697  #ifdef CONFIG_PROC_FS
3698  	remove_proc_entry("unix", net->proc_net);
3699  err_sysctl:
3700  #endif
3701  	unix_sysctl_unregister(net);
3702  out:
3703  	return -ENOMEM;
3704  }
3705  
3706  static void __net_exit unix_net_exit(struct net *net)
3707  {
3708  	kvfree(net->unx.table.buckets);
3709  	kvfree(net->unx.table.locks);
3710  	unix_sysctl_unregister(net);
3711  	remove_proc_entry("unix", net->proc_net);
3712  }
3713  
3714  static struct pernet_operations unix_net_ops = {
3715  	.init = unix_net_init,
3716  	.exit = unix_net_exit,
3717  };
3718  
3719  #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3720  DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3721  		     struct unix_sock *unix_sk, uid_t uid)
3722  
3723  #define INIT_BATCH_SZ 16
3724  
3725  static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3726  {
3727  	struct bpf_unix_iter_state *iter = priv_data;
3728  	int err;
3729  
3730  	err = bpf_iter_init_seq_net(priv_data, aux);
3731  	if (err)
3732  		return err;
3733  
3734  	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3735  	if (err) {
3736  		bpf_iter_fini_seq_net(priv_data);
3737  		return err;
3738  	}
3739  
3740  	return 0;
3741  }
3742  
3743  static void bpf_iter_fini_unix(void *priv_data)
3744  {
3745  	struct bpf_unix_iter_state *iter = priv_data;
3746  
3747  	bpf_iter_fini_seq_net(priv_data);
3748  	kvfree(iter->batch);
3749  }
3750  
3751  static const struct bpf_iter_seq_info unix_seq_info = {
3752  	.seq_ops		= &bpf_iter_unix_seq_ops,
3753  	.init_seq_private	= bpf_iter_init_unix,
3754  	.fini_seq_private	= bpf_iter_fini_unix,
3755  	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3756  };
3757  
3758  static const struct bpf_func_proto *
3759  bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3760  			     const struct bpf_prog *prog)
3761  {
3762  	switch (func_id) {
3763  	case BPF_FUNC_setsockopt:
3764  		return &bpf_sk_setsockopt_proto;
3765  	case BPF_FUNC_getsockopt:
3766  		return &bpf_sk_getsockopt_proto;
3767  	default:
3768  		return NULL;
3769  	}
3770  }
3771  
3772  static struct bpf_iter_reg unix_reg_info = {
3773  	.target			= "unix",
3774  	.ctx_arg_info_size	= 1,
3775  	.ctx_arg_info		= {
3776  		{ offsetof(struct bpf_iter__unix, unix_sk),
3777  		  PTR_TO_BTF_ID_OR_NULL },
3778  	},
3779  	.get_func_proto         = bpf_iter_unix_get_func_proto,
3780  	.seq_info		= &unix_seq_info,
3781  };
3782  
3783  static void __init bpf_iter_register(void)
3784  {
3785  	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3786  	if (bpf_iter_reg_target(&unix_reg_info))
3787  		pr_warn("Warning: could not register bpf iterator unix\n");
3788  }
3789  #endif
3790  
3791  static int __init af_unix_init(void)
3792  {
3793  	int i, rc = -1;
3794  
3795  	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3796  
3797  	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3798  		spin_lock_init(&bsd_socket_locks[i]);
3799  		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3800  	}
3801  
3802  	rc = proto_register(&unix_dgram_proto, 1);
3803  	if (rc != 0) {
3804  		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3805  		goto out;
3806  	}
3807  
3808  	rc = proto_register(&unix_stream_proto, 1);
3809  	if (rc != 0) {
3810  		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3811  		proto_unregister(&unix_dgram_proto);
3812  		goto out;
3813  	}
3814  
3815  	sock_register(&unix_family_ops);
3816  	register_pernet_subsys(&unix_net_ops);
3817  	unix_bpf_build_proto();
3818  
3819  #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3820  	bpf_iter_register();
3821  #endif
3822  
3823  out:
3824  	return rc;
3825  }
3826  
3827  /* Later than subsys_initcall() because we depend on stuff initialised there */
3828  fs_initcall(af_unix_init);
3829