xref: /linux/net/unix/af_unix.c (revision c3eddf5e8c30adb6f43fc0b149e88b9feb76f381)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * NET4:	Implementation of BSD Unix domain sockets.
4   *
5   * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6   *
7   * Fixes:
8   *		Linus Torvalds	:	Assorted bug cures.
9   *		Niibe Yutaka	:	async I/O support.
10   *		Carsten Paeth	:	PF_UNIX check, address fixes.
11   *		Alan Cox	:	Limit size of allocated blocks.
12   *		Alan Cox	:	Fixed the stupid socketpair bug.
13   *		Alan Cox	:	BSD compatibility fine tuning.
14   *		Alan Cox	:	Fixed a bug in connect when interrupted.
15   *		Alan Cox	:	Sorted out a proper draft version of
16   *					file descriptor passing hacked up from
17   *					Mike Shaver's work.
18   *		Marty Leisner	:	Fixes to fd passing
19   *		Nick Nevin	:	recvmsg bugfix.
20   *		Alan Cox	:	Started proper garbage collector
21   *		Heiko Eißfeldt	:	Missing verify_area check
22   *		Alan Cox	:	Started POSIXisms
23   *		Andreas Schwab	:	Replace inode by dentry for proper
24   *					reference counting
25   *		Kirk Petersen	:	Made this a module
26   *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27   *					Lots of bug fixes.
28   *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29   *					by the above two patches.
30   *	     Andrea Arcangeli	:	If possible we block in connect(2)
31   *					if the max backlog of the listen socket
32   *					has been reached. This won't break
33   *					old apps and it will avoid a huge amount
34   *					of socks hashed (this is for unix_gc()
35   *					performance reasons).
36   *					Security fix that limits the max
37   *					number of socks to 2*max_files and
38   *					the number of skbs queueable in the
39   *					dgram receiver.
40   *		Artur Skawina   :	Hash function optimizations
41   *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
42   *	      Malcolm Beattie   :	Set peercred for socketpair
43   *	     Michal Ostrowski   :       Module initialization cleanup.
44   *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45   *	     				the core infrastructure is doing that
46   *	     				for all net proto families now (2.5.69+)
47   *
48   * Known differences from reference BSD that was tested:
49   *
50   *	[TO FIX]
51   *	ECONNREFUSED is not returned from one end of a connected() socket to the
52   *		other the moment one end closes.
53   *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54   *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55   *	[NOT TO FIX]
56   *	accept() returns a path name even if the connecting socket has closed
57   *		in the meantime (BSD loses the path and gives up).
58   *	accept() returns 0 length path for an unbound connector. BSD returns 16
59   *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60   *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61   *	BSD af_unix apparently has connect forgetting to block properly.
62   *		(need to check this with the POSIX spec in detail)
63   *
64   * Differences from 2.0.0-11-... (ANK)
65   *	Bug fixes and improvements.
66   *		- client shutdown killed server socket.
67   *		- removed all useless cli/sti pairs.
68   *
69   *	Semantic changes/extensions.
70   *		- generic control message passing.
71   *		- SCM_CREDENTIALS control message.
72   *		- "Abstract" (not FS based) socket bindings.
73   *		  Abstract names are sequences of bytes (not zero terminated)
74   *		  started by 0, so that this name space does not intersect
75   *		  with BSD names.
76   */
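
/* Illustration (not part of this file's code): a minimal userspace sketch of
 * the two address flavours described above.  A filesystem binding carries a
 * NUL-terminated path in sun_path, while an abstract binding starts sun_path
 * with a zero byte and its length is conveyed by addrlen alone.  The names
 * "/tmp/example.sock", "example" and fd below are purely illustrative.
 *
 *	struct sockaddr_un fs = { .sun_family = AF_UNIX };
 *	strcpy(fs.sun_path, "/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&fs, sizeof(fs));
 *
 *	struct sockaddr_un ab = { .sun_family = AF_UNIX };
 *	ab.sun_path[0] = '\0';
 *	memcpy(ab.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&ab,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */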
77  
78  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79  
80  #include <linux/module.h>
81  #include <linux/kernel.h>
82  #include <linux/signal.h>
83  #include <linux/sched/signal.h>
84  #include <linux/errno.h>
85  #include <linux/string.h>
86  #include <linux/stat.h>
87  #include <linux/dcache.h>
88  #include <linux/namei.h>
89  #include <linux/socket.h>
90  #include <linux/un.h>
91  #include <linux/fcntl.h>
92  #include <linux/filter.h>
93  #include <linux/termios.h>
94  #include <linux/sockios.h>
95  #include <linux/net.h>
96  #include <linux/in.h>
97  #include <linux/fs.h>
98  #include <linux/slab.h>
99  #include <linux/uaccess.h>
100  #include <linux/skbuff.h>
101  #include <linux/netdevice.h>
102  #include <net/net_namespace.h>
103  #include <net/sock.h>
104  #include <net/tcp_states.h>
105  #include <net/af_unix.h>
106  #include <linux/proc_fs.h>
107  #include <linux/seq_file.h>
108  #include <net/scm.h>
109  #include <linux/init.h>
110  #include <linux/poll.h>
111  #include <linux/rtnetlink.h>
112  #include <linux/mount.h>
113  #include <net/checksum.h>
114  #include <linux/security.h>
115  #include <linux/splice.h>
116  #include <linux/freezer.h>
117  #include <linux/file.h>
118  #include <linux/btf_ids.h>
119  #include <linux/bpf-cgroup.h>
120  
121  static atomic_long_t unix_nr_socks;
122  static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123  static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124  
125  /* SMP locking strategy:
126   *    each hash table bucket is protected by its own spinlock.
127   *    each socket state is protected by a separate spinlock.
128   */
129  #ifdef CONFIG_PROVE_LOCKING
130  #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
131  
132  static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133  				  const struct lockdep_map *b)
134  {
135  	return cmp_ptr(a, b);
136  }
137  
138  static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139  				  const struct lockdep_map *_b)
140  {
141  	const struct unix_sock *a, *b;
142  
143  	a = container_of(_a, struct unix_sock, lock.dep_map);
144  	b = container_of(_b, struct unix_sock, lock.dep_map);
145  
146  	if (a->sk.sk_state == TCP_LISTEN) {
147  		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
148  		 *
149  		 *   1. a is TCP_LISTEN.
150  		 *   2. b is not a.
151  		 *   3. concurrent connect(b -> a) must fail.
152  		 *
153  		 * Except for 2. & 3., b's state can be any possible
154  		 * value due to concurrent connect() or listen().
155  		 *
156  		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
157  		 * be expressed as lock_cmp_fn.
158  		 */
159  		switch (b->sk.sk_state) {
160  		case TCP_CLOSE:
161  		case TCP_ESTABLISHED:
162  		case TCP_LISTEN:
163  			return -1;
164  		default:
165  			/* Invalid case. */
166  			return 0;
167  		}
168  	}
169  
170  	/* Should never happen.  Just to be symmetric. */
171  	if (b->sk.sk_state == TCP_LISTEN) {
172  		switch (a->sk.sk_state) {
173  		case TCP_CLOSE:
174  		case TCP_ESTABLISHED:
175  			return 1;
176  		default:
177  			return 0;
178  		}
179  	}
180  
181  	/* unix_state_double_lock(): ascending address order. */
182  	return cmp_ptr(a, b);
183  }
184  
185  static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186  				  const struct lockdep_map *_b)
187  {
188  	const struct sock *a, *b;
189  
190  	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191  	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192  
193  	/* unix_collect_skb(): listener -> embryo order. */
194  	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195  		return -1;
196  
197  	/* Should never happen.  Just to be symmetric. */
198  	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199  		return 1;
200  
201  	return 0;
202  }
203  #endif
204  
205  static unsigned int unix_unbound_hash(struct sock *sk)
206  {
207  	unsigned long hash = (unsigned long)sk;
208  
209  	hash ^= hash >> 16;
210  	hash ^= hash >> 8;
211  	hash ^= sk->sk_type;
212  
213  	return hash & UNIX_HASH_MOD;
214  }
215  
216  static unsigned int unix_bsd_hash(struct inode *i)
217  {
218  	return i->i_ino & UNIX_HASH_MOD;
219  }
220  
221  static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222  				       int addr_len, int type)
223  {
224  	__wsum csum = csum_partial(sunaddr, addr_len, 0);
225  	unsigned int hash;
226  
227  	hash = (__force unsigned int)csum_fold(csum);
228  	hash ^= hash >> 8;
229  	hash ^= type;
230  
231  	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232  }
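
/* Note on the hash layout implied by the three helpers above:
 * unix_unbound_hash() and unix_bsd_hash() both return values in
 * [0, UNIX_HASH_MOD], so unbound and pathname sockets land in the lower half
 * of each per-netns table, while unix_abstract_hash() offsets its result by
 * UNIX_HASH_MOD + 1 so abstract names occupy the upper half and never share
 * a bucket with the former.
 */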
233  
234  static void unix_table_double_lock(struct net *net,
235  				   unsigned int hash1, unsigned int hash2)
236  {
237  	if (hash1 == hash2) {
238  		spin_lock(&net->unx.table.locks[hash1]);
239  		return;
240  	}
241  
242  	if (hash1 > hash2)
243  		swap(hash1, hash2);
244  
245  	spin_lock(&net->unx.table.locks[hash1]);
246  	spin_lock(&net->unx.table.locks[hash2]);
247  }
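
/* Example for unix_table_double_lock() above: a call with hash1 = 300 and
 * hash2 = 7 swaps the two values and takes locks[7] before locks[300];
 * always acquiring the lower-indexed bucket lock first keeps two concurrent
 * double-locks from deadlocking against each other.
 */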
248  
249  static void unix_table_double_unlock(struct net *net,
250  				     unsigned int hash1, unsigned int hash2)
251  {
252  	if (hash1 == hash2) {
253  		spin_unlock(&net->unx.table.locks[hash1]);
254  		return;
255  	}
256  
257  	spin_unlock(&net->unx.table.locks[hash1]);
258  	spin_unlock(&net->unx.table.locks[hash2]);
259  }
260  
261  #ifdef CONFIG_SECURITY_NETWORK
262  static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263  {
264  	UNIXCB(skb).secid = scm->secid;
265  }
266  
267  static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268  {
269  	scm->secid = UNIXCB(skb).secid;
270  }
271  
272  static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273  {
274  	return (scm->secid == UNIXCB(skb).secid);
275  }
276  #else
277  static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278  { }
279  
280  static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281  { }
282  
283  static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284  {
285  	return true;
286  }
287  #endif /* CONFIG_SECURITY_NETWORK */
288  
289  static inline int unix_our_peer(struct sock *sk, struct sock *osk)
290  {
291  	return unix_peer(osk) == sk;
292  }
293  
294  static inline int unix_may_send(struct sock *sk, struct sock *osk)
295  {
296  	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
297  }
298  
299  static inline int unix_recvq_full_lockless(const struct sock *sk)
300  {
301  	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
302  }
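
/* unix_recvq_full_lockless(): "full" means the receive queue already holds
 * more skbs than sk_max_ack_backlog, which is the listen() backlog for
 * stream/seqpacket listeners and net.unix.max_dgram_qlen for datagram
 * sockets (see unix_listen() and unix_create1()).
 */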
303  
304  struct sock *unix_peer_get(struct sock *s)
305  {
306  	struct sock *peer;
307  
308  	unix_state_lock(s);
309  	peer = unix_peer(s);
310  	if (peer)
311  		sock_hold(peer);
312  	unix_state_unlock(s);
313  	return peer;
314  }
315  EXPORT_SYMBOL_GPL(unix_peer_get);
316  
317  static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
318  					     int addr_len)
319  {
320  	struct unix_address *addr;
321  
322  	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
323  	if (!addr)
324  		return NULL;
325  
326  	refcount_set(&addr->refcnt, 1);
327  	addr->len = addr_len;
328  	memcpy(addr->name, sunaddr, addr_len);
329  
330  	return addr;
331  }
332  
333  static inline void unix_release_addr(struct unix_address *addr)
334  {
335  	if (refcount_dec_and_test(&addr->refcnt))
336  		kfree(addr);
337  }
338  
339  /*
340   *	Check unix socket name:
341   *		- should not be zero length.
342   *	        - if it does not start with a zero byte, it is a NUL-terminated FS path.
343   *		- if it starts with a zero byte, it is an abstract name.
344   */
345  
346  static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
347  {
348  	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
349  	    addr_len > sizeof(*sunaddr))
350  		return -EINVAL;
351  
352  	if (sunaddr->sun_family != AF_UNIX)
353  		return -EINVAL;
354  
355  	return 0;
356  }
357  
358  static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
359  {
360  	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
361  	short offset = offsetof(struct sockaddr_storage, __data);
362  
363  	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
364  
365  	/* This may look like an off by one error but it is a bit more
366  	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
367  	 * sun_path[108] doesn't as such exist.  However in kernel space
368  	 * we are guaranteed that it is a valid memory location in our
369  	 * kernel address buffer because syscall functions always pass
370  	 * a pointer of struct sockaddr_storage which has a bigger buffer
371  	 * than 108.  Also, we must terminate sun_path for strlen() in
372  	 * getname_kernel().
373  	 */
374  	addr->__data[addr_len - offset] = 0;
375  
376  	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
377  	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
378  	 * know the actual buffer.
379  	 */
380  	return strlen(addr->__data) + offset + 1;
381  }
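
/* Worked example (illustrative): if userspace binds the path "/tmp/x" and
 * passes addr_len = offsetof(struct sockaddr_un, sun_path) + 7 (path plus
 * NUL), the store above terminates the buffer, strlen() sees 6 bytes, and
 * the normalized length returned is
 * offsetof(struct sockaddr_un, sun_path) + 6 + 1.
 */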
382  
383  static void __unix_remove_socket(struct sock *sk)
384  {
385  	sk_del_node_init(sk);
386  }
387  
388  static void __unix_insert_socket(struct net *net, struct sock *sk)
389  {
390  	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
391  	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
392  }
393  
394  static void __unix_set_addr_hash(struct net *net, struct sock *sk,
395  				 struct unix_address *addr, unsigned int hash)
396  {
397  	__unix_remove_socket(sk);
398  	smp_store_release(&unix_sk(sk)->addr, addr);
399  
400  	sk->sk_hash = hash;
401  	__unix_insert_socket(net, sk);
402  }
403  
404  static void unix_remove_socket(struct net *net, struct sock *sk)
405  {
406  	spin_lock(&net->unx.table.locks[sk->sk_hash]);
407  	__unix_remove_socket(sk);
408  	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
409  }
410  
411  static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
412  {
413  	spin_lock(&net->unx.table.locks[sk->sk_hash]);
414  	__unix_insert_socket(net, sk);
415  	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
416  }
417  
418  static void unix_insert_bsd_socket(struct sock *sk)
419  {
420  	spin_lock(&bsd_socket_locks[sk->sk_hash]);
421  	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
422  	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
423  }
424  
425  static void unix_remove_bsd_socket(struct sock *sk)
426  {
427  	if (!hlist_unhashed(&sk->sk_bind_node)) {
428  		spin_lock(&bsd_socket_locks[sk->sk_hash]);
429  		__sk_del_bind_node(sk);
430  		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
431  
432  		sk_node_init(&sk->sk_bind_node);
433  	}
434  }
435  
436  static struct sock *__unix_find_socket_byname(struct net *net,
437  					      struct sockaddr_un *sunname,
438  					      int len, unsigned int hash)
439  {
440  	struct sock *s;
441  
442  	sk_for_each(s, &net->unx.table.buckets[hash]) {
443  		struct unix_sock *u = unix_sk(s);
444  
445  		if (u->addr->len == len &&
446  		    !memcmp(u->addr->name, sunname, len))
447  			return s;
448  	}
449  	return NULL;
450  }
451  
452  static inline struct sock *unix_find_socket_byname(struct net *net,
453  						   struct sockaddr_un *sunname,
454  						   int len, unsigned int hash)
455  {
456  	struct sock *s;
457  
458  	spin_lock(&net->unx.table.locks[hash]);
459  	s = __unix_find_socket_byname(net, sunname, len, hash);
460  	if (s)
461  		sock_hold(s);
462  	spin_unlock(&net->unx.table.locks[hash]);
463  	return s;
464  }
465  
466  static struct sock *unix_find_socket_byinode(struct inode *i)
467  {
468  	unsigned int hash = unix_bsd_hash(i);
469  	struct sock *s;
470  
471  	spin_lock(&bsd_socket_locks[hash]);
472  	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
473  		struct dentry *dentry = unix_sk(s)->path.dentry;
474  
475  		if (dentry && d_backing_inode(dentry) == i) {
476  			sock_hold(s);
477  			spin_unlock(&bsd_socket_locks[hash]);
478  			return s;
479  		}
480  	}
481  	spin_unlock(&bsd_socket_locks[hash]);
482  	return NULL;
483  }
484  
485  /* Support code for asymmetrically connected dgram sockets
486   *
487   * If a datagram socket is connected to a socket not itself connected
488   * to the first socket (eg, /dev/log), clients may only enqueue more
489   * messages if the present receive queue of the server socket is not
490   * "too large". This means there's a second writeability condition
491   * poll and sendmsg need to test. The dgram recv code will do a wake
492   * up on the peer_wait wait queue of a socket upon reception of a
493   * datagram which needs to be propagated to sleeping would-be writers
494   * since these might not have sent anything so far. This can't be
495   * accomplished via poll_wait because the lifetime of the server
496   * socket might be less than that of its clients if these break their
497   * association with it or if the server socket is closed while clients
498   * are still connected to it and there's no way to inform "a polling
499   * implementation" that it should let go of a certain wait queue
500   *
501   * In order to propagate a wake up, a wait_queue_entry_t of the client
502   * socket is enqueued on the peer_wait queue of the server socket
503   * whose wake function does a wake_up on the ordinary client socket
504   * wait queue. This connection is established whenever a write (or
505   * poll for write) hits the flow control condition and is broken when the
506   * association to the server socket is dissolved or after a wake up
507   * was relayed.
508   */
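
/* Concrete sequence (illustrative): client C is connected to a busy server S
 * such as a logging daemon.  C's poll()/sendmsg() finds S's receive queue
 * full, so unix_dgram_peer_wake_me() below hooks C's peer_wake entry onto
 * S's peer_wait queue and C is reported non-writable.  When S's receive path
 * later dequeues a datagram and wakes peer_wait,
 * unix_dgram_peer_wake_relay() detaches the entry and wakes C's own wait
 * queue, so C re-evaluates writability.
 */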
509  
510  static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
511  				      void *key)
512  {
513  	struct unix_sock *u;
514  	wait_queue_head_t *u_sleep;
515  
516  	u = container_of(q, struct unix_sock, peer_wake);
517  
518  	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
519  			    q);
520  	u->peer_wake.private = NULL;
521  
522  	/* relaying can only happen while the wq still exists */
523  	u_sleep = sk_sleep(&u->sk);
524  	if (u_sleep)
525  		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
526  
527  	return 0;
528  }
529  
530  static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
531  {
532  	struct unix_sock *u, *u_other;
533  	int rc;
534  
535  	u = unix_sk(sk);
536  	u_other = unix_sk(other);
537  	rc = 0;
538  	spin_lock(&u_other->peer_wait.lock);
539  
540  	if (!u->peer_wake.private) {
541  		u->peer_wake.private = other;
542  		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
543  
544  		rc = 1;
545  	}
546  
547  	spin_unlock(&u_other->peer_wait.lock);
548  	return rc;
549  }
550  
551  static void unix_dgram_peer_wake_disconnect(struct sock *sk,
552  					    struct sock *other)
553  {
554  	struct unix_sock *u, *u_other;
555  
556  	u = unix_sk(sk);
557  	u_other = unix_sk(other);
558  	spin_lock(&u_other->peer_wait.lock);
559  
560  	if (u->peer_wake.private == other) {
561  		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
562  		u->peer_wake.private = NULL;
563  	}
564  
565  	spin_unlock(&u_other->peer_wait.lock);
566  }
567  
568  static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
569  						   struct sock *other)
570  {
571  	unix_dgram_peer_wake_disconnect(sk, other);
572  	wake_up_interruptible_poll(sk_sleep(sk),
573  				   EPOLLOUT |
574  				   EPOLLWRNORM |
575  				   EPOLLWRBAND);
576  }
577  
578  /* preconditions:
579   *	- unix_peer(sk) == other
580   *	- association is stable
581   */
582  static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
583  {
584  	int connected;
585  
586  	connected = unix_dgram_peer_wake_connect(sk, other);
587  
588  	/* If other is SOCK_DEAD, we want to make sure we signal
589  	 * POLLOUT, such that a subsequent write() can get a
590  	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
591  	 * to other and it's full, we will hang waiting for POLLOUT.
592  	 */
593  	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
594  		return 1;
595  
596  	if (connected)
597  		unix_dgram_peer_wake_disconnect(sk, other);
598  
599  	return 0;
600  }
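
/* Return value of unix_dgram_peer_wake_me() as its callers use it: 1 means
 * the peer's queue is full and the relay hook is left armed, so the caller
 * should treat the socket as not writable for now; 0 means writing may
 * proceed (or the peer is dead), and any hook we just added has been removed
 * again.
 */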
601  
602  static int unix_writable(const struct sock *sk, unsigned char state)
603  {
604  	return state != TCP_LISTEN &&
605  		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
606  }
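
/* In other words, unix_writable() above treats a non-listening socket as
 * writable while at most a quarter of sk_sndbuf is consumed by queued write
 * memory (wmem_alloc * 4 <= sndbuf).
 */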
607  
608  static void unix_write_space(struct sock *sk)
609  {
610  	struct socket_wq *wq;
611  
612  	rcu_read_lock();
613  	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
614  		wq = rcu_dereference(sk->sk_wq);
615  		if (skwq_has_sleeper(wq))
616  			wake_up_interruptible_sync_poll(&wq->wait,
617  				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
618  		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
619  	}
620  	rcu_read_unlock();
621  }
622  
623  /* When a dgram socket disconnects (or changes its peer), we clear its receive
624   * queue of packets that arrived from the previous peer. First, this allows
625   * flow control based only on wmem_alloc; second, a sk connected to a peer
626   * may receive messages only from that peer. */
627  static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
628  {
629  	if (!skb_queue_empty(&sk->sk_receive_queue)) {
630  		skb_queue_purge(&sk->sk_receive_queue);
631  		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
632  
633  		/* If one link of a bidirectional dgram pipe is disconnected,
634  		 * we signal an error. Messages are lost. Do not do this
635  		 * when the peer was not connected to us.
636  		 */
637  		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
638  			WRITE_ONCE(other->sk_err, ECONNRESET);
639  			sk_error_report(other);
640  		}
641  	}
642  }
643  
644  static void unix_sock_destructor(struct sock *sk)
645  {
646  	struct unix_sock *u = unix_sk(sk);
647  
648  	skb_queue_purge(&sk->sk_receive_queue);
649  
650  	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
651  	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
652  	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
653  	if (!sock_flag(sk, SOCK_DEAD)) {
654  		pr_info("Attempt to release alive unix socket: %p\n", sk);
655  		return;
656  	}
657  
658  	if (u->addr)
659  		unix_release_addr(u->addr);
660  
661  	atomic_long_dec(&unix_nr_socks);
662  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
663  #ifdef UNIX_REFCNT_DEBUG
664  	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
665  		atomic_long_read(&unix_nr_socks));
666  #endif
667  }
668  
669  static void unix_release_sock(struct sock *sk, int embrion)
670  {
671  	struct unix_sock *u = unix_sk(sk);
672  	struct sock *skpair;
673  	struct sk_buff *skb;
674  	struct path path;
675  	int state;
676  
677  	unix_remove_socket(sock_net(sk), sk);
678  	unix_remove_bsd_socket(sk);
679  
680  	/* Clear state */
681  	unix_state_lock(sk);
682  	sock_orphan(sk);
683  	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
684  	path	     = u->path;
685  	u->path.dentry = NULL;
686  	u->path.mnt = NULL;
687  	state = sk->sk_state;
688  	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
689  
690  	skpair = unix_peer(sk);
691  	unix_peer(sk) = NULL;
692  
693  	unix_state_unlock(sk);
694  
695  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
696  	if (u->oob_skb) {
697  		kfree_skb(u->oob_skb);
698  		u->oob_skb = NULL;
699  	}
700  #endif
701  
702  	wake_up_interruptible_all(&u->peer_wait);
703  
704  	if (skpair != NULL) {
705  		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
706  			unix_state_lock(skpair);
707  			/* No more writes */
708  			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
709  			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
710  				WRITE_ONCE(skpair->sk_err, ECONNRESET);
711  			unix_state_unlock(skpair);
712  			skpair->sk_state_change(skpair);
713  			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
714  		}
715  
716  		unix_dgram_peer_wake_disconnect(sk, skpair);
717  		sock_put(skpair); /* It may now die */
718  	}
719  
720  	/* Try to flush out this socket. Throw out buffers at least */
721  
722  	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
723  		if (state == TCP_LISTEN)
724  			unix_release_sock(skb->sk, 1);
725  
726  		/* passed fds are erased in the kfree_skb hook	      */
727  		kfree_skb(skb);
728  	}
729  
730  	if (path.dentry)
731  		path_put(&path);
732  
733  	sock_put(sk);
734  
735  	/* ---- Socket is dead now and most probably destroyed ---- */
736  
737  	/*
738  	 * Fixme: BSD difference: In BSD all sockets connected to us get
739  	 *	  ECONNRESET and we die on the spot. In Linux we behave
740  	 *	  like files and pipes do and wait for the last
741  	 *	  dereference.
742  	 *
743  	 * Can't we simply set sock->err?
744  	 *
745  	 *	  What the above comment does talk about? --ANK(980817)
746  	 */
747  
748  	if (READ_ONCE(unix_tot_inflight))
749  		unix_gc();		/* Garbage collect fds */
750  }
751  
752  static void init_peercred(struct sock *sk)
753  {
754  	sk->sk_peer_pid = get_pid(task_tgid(current));
755  	sk->sk_peer_cred = get_current_cred();
756  }
757  
758  static void update_peercred(struct sock *sk)
759  {
760  	const struct cred *old_cred;
761  	struct pid *old_pid;
762  
763  	spin_lock(&sk->sk_peer_lock);
764  	old_pid = sk->sk_peer_pid;
765  	old_cred = sk->sk_peer_cred;
766  	init_peercred(sk);
767  	spin_unlock(&sk->sk_peer_lock);
768  
769  	put_pid(old_pid);
770  	put_cred(old_cred);
771  }
772  
773  static void copy_peercred(struct sock *sk, struct sock *peersk)
774  {
775  	lockdep_assert_held(&unix_sk(peersk)->lock);
776  
777  	spin_lock(&sk->sk_peer_lock);
778  	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
779  	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
780  	spin_unlock(&sk->sk_peer_lock);
781  }
782  
783  static int unix_listen(struct socket *sock, int backlog)
784  {
785  	int err;
786  	struct sock *sk = sock->sk;
787  	struct unix_sock *u = unix_sk(sk);
788  
789  	err = -EOPNOTSUPP;
790  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
791  		goto out;	/* Only stream/seqpacket sockets accept */
792  	err = -EINVAL;
793  	if (!READ_ONCE(u->addr))
794  		goto out;	/* No listens on an unbound socket */
795  	unix_state_lock(sk);
796  	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
797  		goto out_unlock;
798  	if (backlog > sk->sk_max_ack_backlog)
799  		wake_up_interruptible_all(&u->peer_wait);
800  	sk->sk_max_ack_backlog	= backlog;
801  	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
802  
803  	/* set credentials so connect can copy them */
804  	update_peercred(sk);
805  	err = 0;
806  
807  out_unlock:
808  	unix_state_unlock(sk);
809  out:
810  	return err;
811  }
812  
813  static int unix_release(struct socket *);
814  static int unix_bind(struct socket *, struct sockaddr *, int);
815  static int unix_stream_connect(struct socket *, struct sockaddr *,
816  			       int addr_len, int flags);
817  static int unix_socketpair(struct socket *, struct socket *);
818  static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
819  static int unix_getname(struct socket *, struct sockaddr *, int);
820  static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
821  static __poll_t unix_dgram_poll(struct file *, struct socket *,
822  				    poll_table *);
823  static int unix_ioctl(struct socket *, unsigned int, unsigned long);
824  #ifdef CONFIG_COMPAT
825  static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
826  #endif
827  static int unix_shutdown(struct socket *, int);
828  static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
829  static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
830  static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
831  				       struct pipe_inode_info *, size_t size,
832  				       unsigned int flags);
833  static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
834  static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
835  static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
836  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
837  static int unix_dgram_connect(struct socket *, struct sockaddr *,
838  			      int, int);
839  static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
840  static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
841  				  int);
842  
843  #ifdef CONFIG_PROC_FS
844  static int unix_count_nr_fds(struct sock *sk)
845  {
846  	struct sk_buff *skb;
847  	struct unix_sock *u;
848  	int nr_fds = 0;
849  
850  	spin_lock(&sk->sk_receive_queue.lock);
851  	skb = skb_peek(&sk->sk_receive_queue);
852  	while (skb) {
853  		u = unix_sk(skb->sk);
854  		nr_fds += atomic_read(&u->scm_stat.nr_fds);
855  		skb = skb_peek_next(skb, &sk->sk_receive_queue);
856  	}
857  	spin_unlock(&sk->sk_receive_queue.lock);
858  
859  	return nr_fds;
860  }
861  
862  static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
863  {
864  	struct sock *sk = sock->sk;
865  	unsigned char s_state;
866  	struct unix_sock *u;
867  	int nr_fds = 0;
868  
869  	if (sk) {
870  		s_state = READ_ONCE(sk->sk_state);
871  		u = unix_sk(sk);
872  
873  		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
874  		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
875  		 * SOCK_DGRAM is ordinary. So, no lock is needed.
876  		 */
877  		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
878  			nr_fds = atomic_read(&u->scm_stat.nr_fds);
879  		else if (s_state == TCP_LISTEN)
880  			nr_fds = unix_count_nr_fds(sk);
881  
882  		seq_printf(m, "scm_fds: %u\n", nr_fds);
883  	}
884  }
885  #else
886  #define unix_show_fdinfo NULL
887  #endif
888  
889  static const struct proto_ops unix_stream_ops = {
890  	.family =	PF_UNIX,
891  	.owner =	THIS_MODULE,
892  	.release =	unix_release,
893  	.bind =		unix_bind,
894  	.connect =	unix_stream_connect,
895  	.socketpair =	unix_socketpair,
896  	.accept =	unix_accept,
897  	.getname =	unix_getname,
898  	.poll =		unix_poll,
899  	.ioctl =	unix_ioctl,
900  #ifdef CONFIG_COMPAT
901  	.compat_ioctl =	unix_compat_ioctl,
902  #endif
903  	.listen =	unix_listen,
904  	.shutdown =	unix_shutdown,
905  	.sendmsg =	unix_stream_sendmsg,
906  	.recvmsg =	unix_stream_recvmsg,
907  	.read_skb =	unix_stream_read_skb,
908  	.mmap =		sock_no_mmap,
909  	.splice_read =	unix_stream_splice_read,
910  	.set_peek_off =	sk_set_peek_off,
911  	.show_fdinfo =	unix_show_fdinfo,
912  };
913  
914  static const struct proto_ops unix_dgram_ops = {
915  	.family =	PF_UNIX,
916  	.owner =	THIS_MODULE,
917  	.release =	unix_release,
918  	.bind =		unix_bind,
919  	.connect =	unix_dgram_connect,
920  	.socketpair =	unix_socketpair,
921  	.accept =	sock_no_accept,
922  	.getname =	unix_getname,
923  	.poll =		unix_dgram_poll,
924  	.ioctl =	unix_ioctl,
925  #ifdef CONFIG_COMPAT
926  	.compat_ioctl =	unix_compat_ioctl,
927  #endif
928  	.listen =	sock_no_listen,
929  	.shutdown =	unix_shutdown,
930  	.sendmsg =	unix_dgram_sendmsg,
931  	.read_skb =	unix_read_skb,
932  	.recvmsg =	unix_dgram_recvmsg,
933  	.mmap =		sock_no_mmap,
934  	.set_peek_off =	sk_set_peek_off,
935  	.show_fdinfo =	unix_show_fdinfo,
936  };
937  
938  static const struct proto_ops unix_seqpacket_ops = {
939  	.family =	PF_UNIX,
940  	.owner =	THIS_MODULE,
941  	.release =	unix_release,
942  	.bind =		unix_bind,
943  	.connect =	unix_stream_connect,
944  	.socketpair =	unix_socketpair,
945  	.accept =	unix_accept,
946  	.getname =	unix_getname,
947  	.poll =		unix_dgram_poll,
948  	.ioctl =	unix_ioctl,
949  #ifdef CONFIG_COMPAT
950  	.compat_ioctl =	unix_compat_ioctl,
951  #endif
952  	.listen =	unix_listen,
953  	.shutdown =	unix_shutdown,
954  	.sendmsg =	unix_seqpacket_sendmsg,
955  	.recvmsg =	unix_seqpacket_recvmsg,
956  	.mmap =		sock_no_mmap,
957  	.set_peek_off =	sk_set_peek_off,
958  	.show_fdinfo =	unix_show_fdinfo,
959  };
960  
961  static void unix_close(struct sock *sk, long timeout)
962  {
963  	/* Nothing to do here, unix socket does not need a ->close().
964  	 * This is merely for sockmap.
965  	 */
966  }
967  
968  static void unix_unhash(struct sock *sk)
969  {
970  	/* Nothing to do here, unix socket does not need a ->unhash().
971  	 * This is merely for sockmap.
972  	 */
973  }
974  
975  static bool unix_bpf_bypass_getsockopt(int level, int optname)
976  {
977  	if (level == SOL_SOCKET) {
978  		switch (optname) {
979  		case SO_PEERPIDFD:
980  			return true;
981  		default:
982  			return false;
983  		}
984  	}
985  
986  	return false;
987  }
988  
989  struct proto unix_dgram_proto = {
990  	.name			= "UNIX",
991  	.owner			= THIS_MODULE,
992  	.obj_size		= sizeof(struct unix_sock),
993  	.close			= unix_close,
994  	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
995  #ifdef CONFIG_BPF_SYSCALL
996  	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
997  #endif
998  };
999  
1000  struct proto unix_stream_proto = {
1001  	.name			= "UNIX-STREAM",
1002  	.owner			= THIS_MODULE,
1003  	.obj_size		= sizeof(struct unix_sock),
1004  	.close			= unix_close,
1005  	.unhash			= unix_unhash,
1006  	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1007  #ifdef CONFIG_BPF_SYSCALL
1008  	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1009  #endif
1010  };
1011  
1012  static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1013  {
1014  	struct unix_sock *u;
1015  	struct sock *sk;
1016  	int err;
1017  
1018  	atomic_long_inc(&unix_nr_socks);
1019  	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1020  		err = -ENFILE;
1021  		goto err;
1022  	}
1023  
1024  	if (type == SOCK_STREAM)
1025  		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1026  	else /*dgram and  seqpacket */
1027  		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1028  
1029  	if (!sk) {
1030  		err = -ENOMEM;
1031  		goto err;
1032  	}
1033  
1034  	sock_init_data(sock, sk);
1035  
1036  	sk->sk_hash		= unix_unbound_hash(sk);
1037  	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1038  	sk->sk_write_space	= unix_write_space;
1039  	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1040  	sk->sk_destruct		= unix_sock_destructor;
1041  	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1042  
1043  	u = unix_sk(sk);
1044  	u->listener = NULL;
1045  	u->vertex = NULL;
1046  	u->path.dentry = NULL;
1047  	u->path.mnt = NULL;
1048  	spin_lock_init(&u->lock);
1049  	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1050  	mutex_init(&u->iolock); /* single task reading lock */
1051  	mutex_init(&u->bindlock); /* single task binding lock */
1052  	init_waitqueue_head(&u->peer_wait);
1053  	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1054  	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1055  	unix_insert_unbound_socket(net, sk);
1056  
1057  	sock_prot_inuse_add(net, sk->sk_prot, 1);
1058  
1059  	return sk;
1060  
1061  err:
1062  	atomic_long_dec(&unix_nr_socks);
1063  	return ERR_PTR(err);
1064  }
1065  
1066  static int unix_create(struct net *net, struct socket *sock, int protocol,
1067  		       int kern)
1068  {
1069  	struct sock *sk;
1070  
1071  	if (protocol && protocol != PF_UNIX)
1072  		return -EPROTONOSUPPORT;
1073  
1074  	sock->state = SS_UNCONNECTED;
1075  
1076  	switch (sock->type) {
1077  	case SOCK_STREAM:
1078  		sock->ops = &unix_stream_ops;
1079  		break;
1080  		/*
1081  		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1082  		 *	nothing uses it.
1083  		 */
1084  	case SOCK_RAW:
1085  		sock->type = SOCK_DGRAM;
1086  		fallthrough;
1087  	case SOCK_DGRAM:
1088  		sock->ops = &unix_dgram_ops;
1089  		break;
1090  	case SOCK_SEQPACKET:
1091  		sock->ops = &unix_seqpacket_ops;
1092  		break;
1093  	default:
1094  		return -ESOCKTNOSUPPORT;
1095  	}
1096  
1097  	sk = unix_create1(net, sock, kern, sock->type);
1098  	if (IS_ERR(sk))
1099  		return PTR_ERR(sk);
1100  
1101  	return 0;
1102  }
1103  
1104  static int unix_release(struct socket *sock)
1105  {
1106  	struct sock *sk = sock->sk;
1107  
1108  	if (!sk)
1109  		return 0;
1110  
1111  	sk->sk_prot->close(sk, 0);
1112  	unix_release_sock(sk, 0);
1113  	sock->sk = NULL;
1114  
1115  	return 0;
1116  }
1117  
1118  static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1119  				  int type)
1120  {
1121  	struct inode *inode;
1122  	struct path path;
1123  	struct sock *sk;
1124  	int err;
1125  
1126  	unix_mkname_bsd(sunaddr, addr_len);
1127  	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1128  	if (err)
1129  		goto fail;
1130  
1131  	err = path_permission(&path, MAY_WRITE);
1132  	if (err)
1133  		goto path_put;
1134  
1135  	err = -ECONNREFUSED;
1136  	inode = d_backing_inode(path.dentry);
1137  	if (!S_ISSOCK(inode->i_mode))
1138  		goto path_put;
1139  
1140  	sk = unix_find_socket_byinode(inode);
1141  	if (!sk)
1142  		goto path_put;
1143  
1144  	err = -EPROTOTYPE;
1145  	if (sk->sk_type == type)
1146  		touch_atime(&path);
1147  	else
1148  		goto sock_put;
1149  
1150  	path_put(&path);
1151  
1152  	return sk;
1153  
1154  sock_put:
1155  	sock_put(sk);
1156  path_put:
1157  	path_put(&path);
1158  fail:
1159  	return ERR_PTR(err);
1160  }
1161  
1162  static struct sock *unix_find_abstract(struct net *net,
1163  				       struct sockaddr_un *sunaddr,
1164  				       int addr_len, int type)
1165  {
1166  	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1167  	struct dentry *dentry;
1168  	struct sock *sk;
1169  
1170  	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1171  	if (!sk)
1172  		return ERR_PTR(-ECONNREFUSED);
1173  
1174  	dentry = unix_sk(sk)->path.dentry;
1175  	if (dentry)
1176  		touch_atime(&unix_sk(sk)->path);
1177  
1178  	return sk;
1179  }
1180  
1181  static struct sock *unix_find_other(struct net *net,
1182  				    struct sockaddr_un *sunaddr,
1183  				    int addr_len, int type)
1184  {
1185  	struct sock *sk;
1186  
1187  	if (sunaddr->sun_path[0])
1188  		sk = unix_find_bsd(sunaddr, addr_len, type);
1189  	else
1190  		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1191  
1192  	return sk;
1193  }
1194  
1195  static int unix_autobind(struct sock *sk)
1196  {
1197  	struct unix_sock *u = unix_sk(sk);
1198  	unsigned int new_hash, old_hash;
1199  	struct net *net = sock_net(sk);
1200  	struct unix_address *addr;
1201  	u32 lastnum, ordernum;
1202  	int err;
1203  
1204  	err = mutex_lock_interruptible(&u->bindlock);
1205  	if (err)
1206  		return err;
1207  
1208  	if (u->addr)
1209  		goto out;
1210  
1211  	err = -ENOMEM;
1212  	addr = kzalloc(sizeof(*addr) +
1213  		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1214  	if (!addr)
1215  		goto out;
1216  
1217  	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1218  	addr->name->sun_family = AF_UNIX;
1219  	refcount_set(&addr->refcnt, 1);
1220  
1221  	old_hash = sk->sk_hash;
1222  	ordernum = get_random_u32();
1223  	lastnum = ordernum & 0xFFFFF;
1224  retry:
1225  	ordernum = (ordernum + 1) & 0xFFFFF;
1226  	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1227  
1228  	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1229  	unix_table_double_lock(net, old_hash, new_hash);
1230  
1231  	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1232  		unix_table_double_unlock(net, old_hash, new_hash);
1233  
1234  		/* __unix_find_socket_byname() may take a long time if many names
1235  		 * are already in use.
1236  		 */
1237  		cond_resched();
1238  
1239  		if (ordernum == lastnum) {
1240  			/* Give up if all names seem to be in use. */
1241  			err = -ENOSPC;
1242  			unix_release_addr(addr);
1243  			goto out;
1244  		}
1245  
1246  		goto retry;
1247  	}
1248  
1249  	__unix_set_addr_hash(net, sk, addr, new_hash);
1250  	unix_table_double_unlock(net, old_hash, new_hash);
1251  	err = 0;
1252  
1253  out:	mutex_unlock(&u->bindlock);
1254  	return err;
1255  }
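
/* The autobound name built by unix_autobind() above is an abstract one:
 * sun_path[0] stays zero (the address was kzalloc'ed) and is followed by
 * five hex digits, so an autobound socket shows up as something like
 * "@00a2f" (value illustrative) in /proc/net/unix and ss output, with
 * addr->len = offsetof(struct sockaddr_un, sun_path) + 6.
 */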
1256  
1257  static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1258  			 int addr_len)
1259  {
1260  	umode_t mode = S_IFSOCK |
1261  	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1262  	struct unix_sock *u = unix_sk(sk);
1263  	unsigned int new_hash, old_hash;
1264  	struct net *net = sock_net(sk);
1265  	struct mnt_idmap *idmap;
1266  	struct unix_address *addr;
1267  	struct dentry *dentry;
1268  	struct path parent;
1269  	int err;
1270  
1271  	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1272  	addr = unix_create_addr(sunaddr, addr_len);
1273  	if (!addr)
1274  		return -ENOMEM;
1275  
1276  	/*
1277  	 * Get the parent directory, calculate the hash for last
1278  	 * component.
1279  	 */
1280  	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1281  	if (IS_ERR(dentry)) {
1282  		err = PTR_ERR(dentry);
1283  		goto out;
1284  	}
1285  
1286  	/*
1287  	 * All right, let's create it.
1288  	 */
1289  	idmap = mnt_idmap(parent.mnt);
1290  	err = security_path_mknod(&parent, dentry, mode, 0);
1291  	if (!err)
1292  		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1293  	if (err)
1294  		goto out_path;
1295  	err = mutex_lock_interruptible(&u->bindlock);
1296  	if (err)
1297  		goto out_unlink;
1298  	if (u->addr)
1299  		goto out_unlock;
1300  
1301  	old_hash = sk->sk_hash;
1302  	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1303  	unix_table_double_lock(net, old_hash, new_hash);
1304  	u->path.mnt = mntget(parent.mnt);
1305  	u->path.dentry = dget(dentry);
1306  	__unix_set_addr_hash(net, sk, addr, new_hash);
1307  	unix_table_double_unlock(net, old_hash, new_hash);
1308  	unix_insert_bsd_socket(sk);
1309  	mutex_unlock(&u->bindlock);
1310  	done_path_create(&parent, dentry);
1311  	return 0;
1312  
1313  out_unlock:
1314  	mutex_unlock(&u->bindlock);
1315  	err = -EINVAL;
1316  out_unlink:
1317  	/* failed after successful mknod?  unlink what we'd created... */
1318  	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1319  out_path:
1320  	done_path_create(&parent, dentry);
1321  out:
1322  	unix_release_addr(addr);
1323  	return err == -EEXIST ? -EADDRINUSE : err;
1324  }
1325  
1326  static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1327  			      int addr_len)
1328  {
1329  	struct unix_sock *u = unix_sk(sk);
1330  	unsigned int new_hash, old_hash;
1331  	struct net *net = sock_net(sk);
1332  	struct unix_address *addr;
1333  	int err;
1334  
1335  	addr = unix_create_addr(sunaddr, addr_len);
1336  	if (!addr)
1337  		return -ENOMEM;
1338  
1339  	err = mutex_lock_interruptible(&u->bindlock);
1340  	if (err)
1341  		goto out;
1342  
1343  	if (u->addr) {
1344  		err = -EINVAL;
1345  		goto out_mutex;
1346  	}
1347  
1348  	old_hash = sk->sk_hash;
1349  	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1350  	unix_table_double_lock(net, old_hash, new_hash);
1351  
1352  	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1353  		goto out_spin;
1354  
1355  	__unix_set_addr_hash(net, sk, addr, new_hash);
1356  	unix_table_double_unlock(net, old_hash, new_hash);
1357  	mutex_unlock(&u->bindlock);
1358  	return 0;
1359  
1360  out_spin:
1361  	unix_table_double_unlock(net, old_hash, new_hash);
1362  	err = -EADDRINUSE;
1363  out_mutex:
1364  	mutex_unlock(&u->bindlock);
1365  out:
1366  	unix_release_addr(addr);
1367  	return err;
1368  }
1369  
1370  static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1371  {
1372  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1373  	struct sock *sk = sock->sk;
1374  	int err;
1375  
1376  	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1377  	    sunaddr->sun_family == AF_UNIX)
1378  		return unix_autobind(sk);
1379  
1380  	err = unix_validate_addr(sunaddr, addr_len);
1381  	if (err)
1382  		return err;
1383  
1384  	if (sunaddr->sun_path[0])
1385  		err = unix_bind_bsd(sk, sunaddr, addr_len);
1386  	else
1387  		err = unix_bind_abstract(sk, sunaddr, addr_len);
1388  
1389  	return err;
1390  }
1391  
1392  static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1393  {
1394  	if (unlikely(sk1 == sk2) || !sk2) {
1395  		unix_state_lock(sk1);
1396  		return;
1397  	}
1398  
1399  	if (sk1 > sk2)
1400  		swap(sk1, sk2);
1401  
1402  	unix_state_lock(sk1);
1403  	unix_state_lock(sk2);
1404  }
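
/* As with the table locks, unix_state_double_lock() takes the two sk locks
 * in ascending pointer order so that concurrent callers cannot deadlock
 * against each other; this is the ordering unix_state_lock_cmp_fn()
 * validates under lockdep.
 */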
1405  
1406  static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1407  {
1408  	if (unlikely(sk1 == sk2) || !sk2) {
1409  		unix_state_unlock(sk1);
1410  		return;
1411  	}
1412  	unix_state_unlock(sk1);
1413  	unix_state_unlock(sk2);
1414  }
1415  
1416  static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1417  			      int alen, int flags)
1418  {
1419  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1420  	struct sock *sk = sock->sk;
1421  	struct sock *other;
1422  	int err;
1423  
1424  	err = -EINVAL;
1425  	if (alen < offsetofend(struct sockaddr, sa_family))
1426  		goto out;
1427  
1428  	if (addr->sa_family != AF_UNSPEC) {
1429  		err = unix_validate_addr(sunaddr, alen);
1430  		if (err)
1431  			goto out;
1432  
1433  		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1434  		if (err)
1435  			goto out;
1436  
1437  		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1438  		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1439  		    !READ_ONCE(unix_sk(sk)->addr)) {
1440  			err = unix_autobind(sk);
1441  			if (err)
1442  				goto out;
1443  		}
1444  
1445  restart:
1446  		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1447  		if (IS_ERR(other)) {
1448  			err = PTR_ERR(other);
1449  			goto out;
1450  		}
1451  
1452  		unix_state_double_lock(sk, other);
1453  
1454  		/* Apparently VFS overslept socket death. Retry. */
1455  		if (sock_flag(other, SOCK_DEAD)) {
1456  			unix_state_double_unlock(sk, other);
1457  			sock_put(other);
1458  			goto restart;
1459  		}
1460  
1461  		err = -EPERM;
1462  		if (!unix_may_send(sk, other))
1463  			goto out_unlock;
1464  
1465  		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1466  		if (err)
1467  			goto out_unlock;
1468  
1469  		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1470  		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1471  	} else {
1472  		/*
1473  		 *	1003.1g breaking connected state with AF_UNSPEC
1474  		 */
1475  		other = NULL;
1476  		unix_state_double_lock(sk, other);
1477  	}
1478  
1479  	/*
1480  	 * If it was connected, reconnect.
1481  	 */
1482  	if (unix_peer(sk)) {
1483  		struct sock *old_peer = unix_peer(sk);
1484  
1485  		unix_peer(sk) = other;
1486  		if (!other)
1487  			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1488  		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1489  
1490  		unix_state_double_unlock(sk, other);
1491  
1492  		if (other != old_peer) {
1493  			unix_dgram_disconnected(sk, old_peer);
1494  
1495  			unix_state_lock(old_peer);
1496  			if (!unix_peer(old_peer))
1497  				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1498  			unix_state_unlock(old_peer);
1499  		}
1500  
1501  		sock_put(old_peer);
1502  	} else {
1503  		unix_peer(sk) = other;
1504  		unix_state_double_unlock(sk, other);
1505  	}
1506  
1507  	return 0;
1508  
1509  out_unlock:
1510  	unix_state_double_unlock(sk, other);
1511  	sock_put(other);
1512  out:
1513  	return err;
1514  }
1515  
1516  static long unix_wait_for_peer(struct sock *other, long timeo)
1517  	__releases(&unix_sk(other)->lock)
1518  {
1519  	struct unix_sock *u = unix_sk(other);
1520  	int sched;
1521  	DEFINE_WAIT(wait);
1522  
1523  	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1524  
1525  	sched = !sock_flag(other, SOCK_DEAD) &&
1526  		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1527  		unix_recvq_full_lockless(other);
1528  
1529  	unix_state_unlock(other);
1530  
1531  	if (sched)
1532  		timeo = schedule_timeout(timeo);
1533  
1534  	finish_wait(&u->peer_wait, &wait);
1535  	return timeo;
1536  }
1537  
1538  static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1539  			       int addr_len, int flags)
1540  {
1541  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1542  	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1543  	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1544  	struct net *net = sock_net(sk);
1545  	struct sk_buff *skb = NULL;
1546  	unsigned char state;
1547  	long timeo;
1548  	int err;
1549  
1550  	err = unix_validate_addr(sunaddr, addr_len);
1551  	if (err)
1552  		goto out;
1553  
1554  	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1555  	if (err)
1556  		goto out;
1557  
1558  	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1559  	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1560  	    !READ_ONCE(u->addr)) {
1561  		err = unix_autobind(sk);
1562  		if (err)
1563  			goto out;
1564  	}
1565  
1566  	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1567  
1568  	/* First of all allocate resources.
1569  	   If we did this after the state is locked,
1570  	   we would have to recheck everything again in any case.
1571  	 */
1572  
1573  	/* create new sock for complete connection */
1574  	newsk = unix_create1(net, NULL, 0, sock->type);
1575  	if (IS_ERR(newsk)) {
1576  		err = PTR_ERR(newsk);
1577  		newsk = NULL;
1578  		goto out;
1579  	}
1580  
1581  	err = -ENOMEM;
1582  
1583  	/* Allocate skb for sending to listening sock */
1584  	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1585  	if (skb == NULL)
1586  		goto out;
1587  
1588  restart:
1589  	/*  Find listening sock. */
1590  	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1591  	if (IS_ERR(other)) {
1592  		err = PTR_ERR(other);
1593  		other = NULL;
1594  		goto out;
1595  	}
1596  
1597  	unix_state_lock(other);
1598  
1599  	/* Apparently VFS overslept socket death. Retry. */
1600  	if (sock_flag(other, SOCK_DEAD)) {
1601  		unix_state_unlock(other);
1602  		sock_put(other);
1603  		goto restart;
1604  	}
1605  
1606  	err = -ECONNREFUSED;
1607  	if (other->sk_state != TCP_LISTEN)
1608  		goto out_unlock;
1609  	if (other->sk_shutdown & RCV_SHUTDOWN)
1610  		goto out_unlock;
1611  
1612  	if (unix_recvq_full_lockless(other)) {
1613  		err = -EAGAIN;
1614  		if (!timeo)
1615  			goto out_unlock;
1616  
1617  		timeo = unix_wait_for_peer(other, timeo);
1618  
1619  		err = sock_intr_errno(timeo);
1620  		if (signal_pending(current))
1621  			goto out;
1622  		sock_put(other);
1623  		goto restart;
1624  	}
1625  
1626  	/* self connect and simultaneous connect are eliminated
1627  	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1628  	 */
1629  	state = READ_ONCE(sk->sk_state);
1630  	if (unlikely(state != TCP_CLOSE)) {
1631  		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1632  		goto out_unlock;
1633  	}
1634  
1635  	unix_state_lock(sk);
1636  
1637  	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1638  		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1639  		unix_state_unlock(sk);
1640  		goto out_unlock;
1641  	}
1642  
1643  	err = security_unix_stream_connect(sk, other, newsk);
1644  	if (err) {
1645  		unix_state_unlock(sk);
1646  		goto out_unlock;
1647  	}
1648  
1649  	/* The way is open! Quickly set all the necessary fields... */
1650  
1651  	sock_hold(sk);
1652  	unix_peer(newsk)	= sk;
1653  	newsk->sk_state		= TCP_ESTABLISHED;
1654  	newsk->sk_type		= sk->sk_type;
1655  	init_peercred(newsk);
1656  	newu = unix_sk(newsk);
1657  	newu->listener = other;
1658  	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1659  	otheru = unix_sk(other);
1660  
1661  	/* copy address information from listening to new sock
1662  	 *
1663  	 * The contents of *(otheru->addr) and otheru->path
1664  	 * are seen fully set up here, since we have found
1665  	 * otheru in hash under its lock.  Insertion into the
1666  	 * hash chain we'd found it in had been done in an
1667  	 * earlier critical area protected by the chain's lock,
1668  	 * the same one where we'd set *(otheru->addr) contents,
1669  	 * as well as otheru->path and otheru->addr itself.
1670  	 *
1671  	 * Using smp_store_release() here to set newu->addr
1672  	 * is enough to make those stores, as well as stores
1673  	 * to newu->path visible to anyone who gets newu->addr
1674  	 * by smp_load_acquire().  IOW, the same guarantees
1675  	 * as for unix_sock instances bound in unix_bind() or
1676  	 * in unix_autobind().
1677  	 */
1678  	if (otheru->path.dentry) {
1679  		path_get(&otheru->path);
1680  		newu->path = otheru->path;
1681  	}
1682  	refcount_inc(&otheru->addr->refcnt);
1683  	smp_store_release(&newu->addr, otheru->addr);
1684  
1685  	/* Set credentials */
1686  	copy_peercred(sk, other);
1687  
1688  	sock->state	= SS_CONNECTED;
1689  	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1690  	sock_hold(newsk);
1691  
1692  	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1693  	unix_peer(sk)	= newsk;
1694  
1695  	unix_state_unlock(sk);
1696  
1697  	/* take ten and send info to listening sock */
1698  	spin_lock(&other->sk_receive_queue.lock);
1699  	__skb_queue_tail(&other->sk_receive_queue, skb);
1700  	spin_unlock(&other->sk_receive_queue.lock);
1701  	unix_state_unlock(other);
1702  	other->sk_data_ready(other);
1703  	sock_put(other);
1704  	return 0;
1705  
1706  out_unlock:
1707  	if (other)
1708  		unix_state_unlock(other);
1709  
1710  out:
1711  	kfree_skb(skb);
1712  	if (newsk)
1713  		unix_release_sock(newsk, 0);
1714  	if (other)
1715  		sock_put(other);
1716  	return err;
1717  }
1718  
1719  static int unix_socketpair(struct socket *socka, struct socket *sockb)
1720  {
1721  	struct sock *ska = socka->sk, *skb = sockb->sk;
1722  
1723  	/* Join our sockets back to back */
1724  	sock_hold(ska);
1725  	sock_hold(skb);
1726  	unix_peer(ska) = skb;
1727  	unix_peer(skb) = ska;
1728  	init_peercred(ska);
1729  	init_peercred(skb);
1730  
1731  	ska->sk_state = TCP_ESTABLISHED;
1732  	skb->sk_state = TCP_ESTABLISHED;
1733  	socka->state  = SS_CONNECTED;
1734  	sockb->state  = SS_CONNECTED;
1735  	return 0;
1736  }
1737  
1738  static void unix_sock_inherit_flags(const struct socket *old,
1739  				    struct socket *new)
1740  {
1741  	if (test_bit(SOCK_PASSCRED, &old->flags))
1742  		set_bit(SOCK_PASSCRED, &new->flags);
1743  	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1744  		set_bit(SOCK_PASSPIDFD, &new->flags);
1745  	if (test_bit(SOCK_PASSSEC, &old->flags))
1746  		set_bit(SOCK_PASSSEC, &new->flags);
1747  }
1748  
1749  static int unix_accept(struct socket *sock, struct socket *newsock,
1750  		       struct proto_accept_arg *arg)
1751  {
1752  	struct sock *sk = sock->sk;
1753  	struct sk_buff *skb;
1754  	struct sock *tsk;
1755  
1756  	arg->err = -EOPNOTSUPP;
1757  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1758  		goto out;
1759  
1760  	arg->err = -EINVAL;
1761  	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1762  		goto out;
1763  
1764  	/* If socket state is TCP_LISTEN it cannot change (for now...),
1765  	 * so that no locks are necessary.
1766  	 */
1767  
1768  	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1769  				&arg->err);
1770  	if (!skb) {
1771  		/* This means receive shutdown. */
1772  		if (arg->err == 0)
1773  			arg->err = -EINVAL;
1774  		goto out;
1775  	}
1776  
1777  	tsk = skb->sk;
1778  	skb_free_datagram(sk, skb);
1779  	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1780  
1781  	/* attach accepted sock to socket */
1782  	unix_state_lock(tsk);
1783  	unix_update_edges(unix_sk(tsk));
1784  	newsock->state = SS_CONNECTED;
1785  	unix_sock_inherit_flags(sock, newsock);
1786  	sock_graft(tsk, newsock);
1787  	unix_state_unlock(tsk);
1788  	return 0;
1789  
1790  out:
1791  	return arg->err;
1792  }
1793  
1794  
1795  static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1796  {
1797  	struct sock *sk = sock->sk;
1798  	struct unix_address *addr;
1799  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1800  	int err = 0;
1801  
1802  	if (peer) {
1803  		sk = unix_peer_get(sk);
1804  
1805  		err = -ENOTCONN;
1806  		if (!sk)
1807  			goto out;
1808  		err = 0;
1809  	} else {
1810  		sock_hold(sk);
1811  	}
1812  
1813  	addr = smp_load_acquire(&unix_sk(sk)->addr);
1814  	if (!addr) {
1815  		sunaddr->sun_family = AF_UNIX;
1816  		sunaddr->sun_path[0] = 0;
1817  		err = offsetof(struct sockaddr_un, sun_path);
1818  	} else {
1819  		err = addr->len;
1820  		memcpy(sunaddr, addr->name, addr->len);
1821  
1822  		if (peer)
1823  			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1824  					       CGROUP_UNIX_GETPEERNAME);
1825  		else
1826  			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1827  					       CGROUP_UNIX_GETSOCKNAME);
1828  	}
1829  	sock_put(sk);
1830  out:
1831  	return err;
1832  }
1833  
1834  /* The "user->unix_inflight" variable is protected by the garbage
1835   * collection lock, and we just read it locklessly here. If you go
1836   * over the limit, there might be a tiny race in actually noticing
1837   * it across threads. Tough.
1838   */
1839  static inline bool too_many_unix_fds(struct task_struct *p)
1840  {
1841  	struct user_struct *user = current_user();
1842  
1843  	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1844  		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1845  	return false;
1846  }
1847  
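/* Attaching SCM_RIGHTS fds to an skb moves ownership of the file reference
 * list from the scm cookie into the skb's control block, after checking the
 * per-user in-flight limit; unix_prepare_fpl() sets the list up for the
 * in-flight fd garbage collector.  unix_detach_fds() undoes this on the
 * receive side.
 */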
1848  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1849  {
1850  	if (too_many_unix_fds(current))
1851  		return -ETOOMANYREFS;
1852  
1853  	UNIXCB(skb).fp = scm->fp;
1854  	scm->fp = NULL;
1855  
1856  	if (unix_prepare_fpl(UNIXCB(skb).fp))
1857  		return -ENOMEM;
1858  
1859  	return 0;
1860  }
1861  
1862  static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1863  {
1864  	scm->fp = UNIXCB(skb).fp;
1865  	UNIXCB(skb).fp = NULL;
1866  
1867  	unix_destroy_fpl(scm->fp);
1868  }
1869  
1870  static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1871  {
1872  	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1873  }
1874  
1875  static void unix_destruct_scm(struct sk_buff *skb)
1876  {
1877  	struct scm_cookie scm;
1878  
1879  	memset(&scm, 0, sizeof(scm));
1880  	scm.pid  = UNIXCB(skb).pid;
1881  	if (UNIXCB(skb).fp)
1882  		unix_detach_fds(&scm, skb);
1883  
1884  	/* Alas, it calls VFS */
1885  	/* So fscking what? fput() has been SMP-safe since last summer */
1886  	scm_destroy(&scm);
1887  	sock_wfree(skb);
1888  }
1889  
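/* Stash the sender's credentials (pid/uid/gid), security data and, when
 * requested, the passed fds in the skb's control block so the receiver can
 * rebuild the corresponding SCM_* control messages; unix_destruct_scm()
 * releases them when the skb is freed.
 */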
1890  static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1891  {
1892  	int err = 0;
1893  
1894  	UNIXCB(skb).pid  = get_pid(scm->pid);
1895  	UNIXCB(skb).uid = scm->creds.uid;
1896  	UNIXCB(skb).gid = scm->creds.gid;
1897  	UNIXCB(skb).fp = NULL;
1898  	unix_get_secdata(scm, skb);
1899  	if (scm->fp && send_fds)
1900  		err = unix_attach_fds(scm, skb);
1901  
1902  	skb->destructor = unix_destruct_scm;
1903  	return err;
1904  }
1905  
1906  static bool unix_passcred_enabled(const struct socket *sock,
1907  				  const struct sock *other)
1908  {
1909  	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1910  	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1911  	       !other->sk_socket ||
1912  	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1913  	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1914  }
1915  
1916  /*
1917   * Some apps rely on write() giving SCM_CREDENTIALS.
1918   * We include credentials if the source or destination socket
1919   * asserted SOCK_PASSCRED or SOCK_PASSPIDFD.
1920   */
1921  static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1922  			    const struct sock *other)
1923  {
1924  	if (UNIXCB(skb).pid)
1925  		return;
1926  	if (unix_passcred_enabled(sock, other)) {
1927  		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1928  		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1929  	}
1930  }
1931  
1932  static bool unix_skb_scm_eq(struct sk_buff *skb,
1933  			    struct scm_cookie *scm)
1934  {
1935  	return UNIXCB(skb).pid == scm->pid &&
1936  	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1937  	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1938  	       unix_secdata_eq(scm, skb);
1939  }
1940  
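/* Account fds queued towards a socket: keep a per-socket count of in-flight
 * fds and add/remove the matching edges in the SCM_RIGHTS garbage-collector
 * graph.
 */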
1941  static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1942  {
1943  	struct scm_fp_list *fp = UNIXCB(skb).fp;
1944  	struct unix_sock *u = unix_sk(sk);
1945  
1946  	if (unlikely(fp && fp->count)) {
1947  		atomic_add(fp->count, &u->scm_stat.nr_fds);
1948  		unix_add_edges(fp, u);
1949  	}
1950  }
1951  
1952  static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1953  {
1954  	struct scm_fp_list *fp = UNIXCB(skb).fp;
1955  	struct unix_sock *u = unix_sk(sk);
1956  
1957  	if (unlikely(fp && fp->count)) {
1958  		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1959  		unix_del_edges(fp);
1960  	}
1961  }
1962  
1963  /*
1964   *	Send AF_UNIX data.
1965   */
1966  
1967  static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1968  			      size_t len)
1969  {
1970  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1971  	struct sock *sk = sock->sk, *other = NULL;
1972  	struct unix_sock *u = unix_sk(sk);
1973  	struct scm_cookie scm;
1974  	struct sk_buff *skb;
1975  	int data_len = 0;
1976  	int sk_locked;
1977  	long timeo;
1978  	int err;
1979  
1980  	err = scm_send(sock, msg, &scm, false);
1981  	if (err < 0)
1982  		return err;
1983  
1984  	wait_for_unix_gc(scm.fp);
1985  
1986  	err = -EOPNOTSUPP;
1987  	if (msg->msg_flags&MSG_OOB)
1988  		goto out;
1989  
1990  	if (msg->msg_namelen) {
1991  		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1992  		if (err)
1993  			goto out;
1994  
1995  		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1996  							    msg->msg_name,
1997  							    &msg->msg_namelen,
1998  							    NULL);
1999  		if (err)
2000  			goto out;
2001  	} else {
2002  		sunaddr = NULL;
2003  		err = -ENOTCONN;
2004  		other = unix_peer_get(sk);
2005  		if (!other)
2006  			goto out;
2007  	}
2008  
2009  	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
2010  	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
2011  	    !READ_ONCE(u->addr)) {
2012  		err = unix_autobind(sk);
2013  		if (err)
2014  			goto out;
2015  	}
2016  
2017  	err = -EMSGSIZE;
2018  	if (len > READ_ONCE(sk->sk_sndbuf) - 32)
2019  		goto out;
2020  
2021  	if (len > SKB_MAX_ALLOC) {
2022  		data_len = min_t(size_t,
2023  				 len - SKB_MAX_ALLOC,
2024  				 MAX_SKB_FRAGS * PAGE_SIZE);
2025  		data_len = PAGE_ALIGN(data_len);
2026  
2027  		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2028  	}
2029  
2030  	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2031  				   msg->msg_flags & MSG_DONTWAIT, &err,
2032  				   PAGE_ALLOC_COSTLY_ORDER);
2033  	if (skb == NULL)
2034  		goto out;
2035  
2036  	err = unix_scm_to_skb(&scm, skb, true);
2037  	if (err < 0)
2038  		goto out_free;
2039  
2040  	skb_put(skb, len - data_len);
2041  	skb->data_len = data_len;
2042  	skb->len = len;
2043  	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2044  	if (err)
2045  		goto out_free;
2046  
2047  	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2048  
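	/* If the socket is unconnected (a destination address was supplied),
	 * look the receiver up by address now.  We also come back here after
	 * the receiver died (SOCK_DEAD) or after blocking for queue space.
	 */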
2049  restart:
2050  	if (!other) {
2051  		err = -ECONNRESET;
2052  		if (sunaddr == NULL)
2053  			goto out_free;
2054  
2055  		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
2056  					sk->sk_type);
2057  		if (IS_ERR(other)) {
2058  			err = PTR_ERR(other);
2059  			other = NULL;
2060  			goto out_free;
2061  		}
2062  	}
2063  
2064  	if (sk_filter(other, skb) < 0) {
2065  		/* Toss the packet but do not return any error to the sender */
2066  		err = len;
2067  		goto out_free;
2068  	}
2069  
2070  	sk_locked = 0;
2071  	unix_state_lock(other);
2072  restart_locked:
2073  	err = -EPERM;
2074  	if (!unix_may_send(sk, other))
2075  		goto out_unlock;
2076  
2077  	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2078  		/*
2079  		 *	Check with 1003.1g - what should the
2080  		 *	datagram error behaviour be here?
2081  		 */
2082  		unix_state_unlock(other);
2083  		sock_put(other);
2084  
2085  		if (!sk_locked)
2086  			unix_state_lock(sk);
2087  
2088  		err = 0;
2089  		if (sk->sk_type == SOCK_SEQPACKET) {
2090  			/* We are here only when racing with unix_release_sock(),
2091  			 * which is clearing @other. Never change the state to
2092  			 * TCP_CLOSE, unlike what SOCK_DGRAM wants.
2093  			 */
2094  			unix_state_unlock(sk);
2095  			err = -EPIPE;
2096  		} else if (unix_peer(sk) == other) {
2097  			unix_peer(sk) = NULL;
2098  			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2099  
2100  			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2101  			unix_state_unlock(sk);
2102  
2103  			unix_dgram_disconnected(sk, other);
2104  			sock_put(other);
2105  			err = -ECONNREFUSED;
2106  		} else {
2107  			unix_state_unlock(sk);
2108  		}
2109  
2110  		other = NULL;
2111  		if (err)
2112  			goto out_free;
2113  		goto restart;
2114  	}
2115  
2116  	err = -EPIPE;
2117  	if (other->sk_shutdown & RCV_SHUTDOWN)
2118  		goto out_unlock;
2119  
2120  	if (sk->sk_type != SOCK_SEQPACKET) {
2121  		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2122  		if (err)
2123  			goto out_unlock;
2124  	}
2125  
2126  	/* other == sk && unix_peer(other) != sk if
2127  	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2128  	 * - unix_peer(sk) == sk at the time of the get, but disconnected before the lock
2129  	 */
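	/* Datagram backpressure: if the receiver's queue is full and it is not
	 * connected back to us, either block in unix_wait_for_peer() and retry,
	 * or, for a non-blocking send, register on the peer's wake queue (so we
	 * later get EPOLLOUT) and return -EAGAIN unless space freed up meanwhile.
	 */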
2130  	if (other != sk &&
2131  	    unlikely(unix_peer(other) != sk &&
2132  	    unix_recvq_full_lockless(other))) {
2133  		if (timeo) {
2134  			timeo = unix_wait_for_peer(other, timeo);
2135  
2136  			err = sock_intr_errno(timeo);
2137  			if (signal_pending(current))
2138  				goto out_free;
2139  
2140  			goto restart;
2141  		}
2142  
2143  		if (!sk_locked) {
2144  			unix_state_unlock(other);
2145  			unix_state_double_lock(sk, other);
2146  		}
2147  
2148  		if (unix_peer(sk) != other ||
2149  		    unix_dgram_peer_wake_me(sk, other)) {
2150  			err = -EAGAIN;
2151  			sk_locked = 1;
2152  			goto out_unlock;
2153  		}
2154  
2155  		if (!sk_locked) {
2156  			sk_locked = 1;
2157  			goto restart_locked;
2158  		}
2159  	}
2160  
2161  	if (unlikely(sk_locked))
2162  		unix_state_unlock(sk);
2163  
2164  	if (sock_flag(other, SOCK_RCVTSTAMP))
2165  		__net_timestamp(skb);
2166  	maybe_add_creds(skb, sock, other);
2167  	scm_stat_add(other, skb);
2168  	skb_queue_tail(&other->sk_receive_queue, skb);
2169  	unix_state_unlock(other);
2170  	other->sk_data_ready(other);
2171  	sock_put(other);
2172  	scm_destroy(&scm);
2173  	return len;
2174  
2175  out_unlock:
2176  	if (sk_locked)
2177  		unix_state_unlock(sk);
2178  	unix_state_unlock(other);
2179  out_free:
2180  	kfree_skb(skb);
2181  out:
2182  	if (other)
2183  		sock_put(other);
2184  	scm_destroy(&scm);
2185  	return err;
2186  }
2187  
2188  /* We use paged skbs for stream sockets, and limit occupancy to 32768
2189   * bytes, with a minimum of a full page.
2190   */
2191  #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2192  
2193  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
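/* Send the single MSG_OOB byte on a SOCK_STREAM socket: it travels as its
 * own one-byte skb queued at the tail of the receiver's queue, an extra
 * reference is remembered in the receiver's ->oob_skb so the mark can be
 * found, and SIGURG is signalled to the receiver.
 */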
2194  static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2195  		     struct scm_cookie *scm, bool fds_sent)
2196  {
2197  	struct unix_sock *ousk = unix_sk(other);
2198  	struct sk_buff *skb;
2199  	int err = 0;
2200  
2201  	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2202  
2203  	if (!skb)
2204  		return err;
2205  
2206  	err = unix_scm_to_skb(scm, skb, !fds_sent);
2207  	if (err < 0) {
2208  		kfree_skb(skb);
2209  		return err;
2210  	}
2211  	skb_put(skb, 1);
2212  	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2213  
2214  	if (err) {
2215  		kfree_skb(skb);
2216  		return err;
2217  	}
2218  
2219  	unix_state_lock(other);
2220  
2221  	if (sock_flag(other, SOCK_DEAD) ||
2222  	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2223  		unix_state_unlock(other);
2224  		kfree_skb(skb);
2225  		return -EPIPE;
2226  	}
2227  
2228  	maybe_add_creds(skb, sock, other);
2229  	skb_get(skb);
2230  
2231  	scm_stat_add(other, skb);
2232  
2233  	spin_lock(&other->sk_receive_queue.lock);
2234  	if (ousk->oob_skb)
2235  		consume_skb(ousk->oob_skb);
2236  	WRITE_ONCE(ousk->oob_skb, skb);
2237  	__skb_queue_tail(&other->sk_receive_queue, skb);
2238  	spin_unlock(&other->sk_receive_queue.lock);
2239  
2240  	sk_send_sigurg(other);
2241  	unix_state_unlock(other);
2242  	other->sk_data_ready(other);
2243  
2244  	return err;
2245  }
2246  #endif
2247  
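/* Stream sends are split into individual skbs.  In the ordinary copy path
 * each skb carries at most half of sk_sndbuf (minus some slack) so at least
 * two messages can sit in the queue at once, and any SCM_RIGHTS fds are
 * attached only to the first skb of the call.
 */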
2248  static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2249  			       size_t len)
2250  {
2251  	struct sock *sk = sock->sk;
2252  	struct sock *other = NULL;
2253  	int err, size;
2254  	struct sk_buff *skb;
2255  	int sent = 0;
2256  	struct scm_cookie scm;
2257  	bool fds_sent = false;
2258  	int data_len;
2259  
2260  	err = scm_send(sock, msg, &scm, false);
2261  	if (err < 0)
2262  		return err;
2263  
2264  	wait_for_unix_gc(scm.fp);
2265  
2266  	err = -EOPNOTSUPP;
2267  	if (msg->msg_flags & MSG_OOB) {
2268  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2269  		if (len)
2270  			len--;
2271  		else
2272  #endif
2273  			goto out_err;
2274  	}
2275  
2276  	if (msg->msg_namelen) {
2277  		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2278  		goto out_err;
2279  	} else {
2280  		err = -ENOTCONN;
2281  		other = unix_peer(sk);
2282  		if (!other)
2283  			goto out_err;
2284  	}
2285  
2286  	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2287  		goto pipe_err;
2288  
2289  	while (sent < len) {
2290  		size = len - sent;
2291  
2292  		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2293  			skb = sock_alloc_send_pskb(sk, 0, 0,
2294  						   msg->msg_flags & MSG_DONTWAIT,
2295  						   &err, 0);
2296  		} else {
2297  			/* Keep two messages in the pipe so it schedules better */
2298  			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2299  
2300  			/* allow fallback to order-0 allocations */
2301  			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2302  
2303  			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2304  
2305  			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2306  
2307  			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2308  						   msg->msg_flags & MSG_DONTWAIT, &err,
2309  						   get_order(UNIX_SKB_FRAGS_SZ));
2310  		}
2311  		if (!skb)
2312  			goto out_err;
2313  
2314  		/* Only send the fds in the first buffer */
2315  		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2316  		if (err < 0) {
2317  			kfree_skb(skb);
2318  			goto out_err;
2319  		}
2320  		fds_sent = true;
2321  
2322  		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2323  			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2324  						   sk->sk_allocation);
2325  			if (err < 0) {
2326  				kfree_skb(skb);
2327  				goto out_err;
2328  			}
2329  			size = err;
2330  			refcount_add(size, &sk->sk_wmem_alloc);
2331  		} else {
2332  			skb_put(skb, size - data_len);
2333  			skb->data_len = data_len;
2334  			skb->len = size;
2335  			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2336  			if (err) {
2337  				kfree_skb(skb);
2338  				goto out_err;
2339  			}
2340  		}
2341  
2342  		unix_state_lock(other);
2343  
2344  		if (sock_flag(other, SOCK_DEAD) ||
2345  		    (other->sk_shutdown & RCV_SHUTDOWN))
2346  			goto pipe_err_free;
2347  
2348  		maybe_add_creds(skb, sock, other);
2349  		scm_stat_add(other, skb);
2350  		skb_queue_tail(&other->sk_receive_queue, skb);
2351  		unix_state_unlock(other);
2352  		other->sk_data_ready(other);
2353  		sent += size;
2354  	}
2355  
2356  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2357  	if (msg->msg_flags & MSG_OOB) {
2358  		err = queue_oob(sock, msg, other, &scm, fds_sent);
2359  		if (err)
2360  			goto out_err;
2361  		sent++;
2362  	}
2363  #endif
2364  
2365  	scm_destroy(&scm);
2366  
2367  	return sent;
2368  
2369  pipe_err_free:
2370  	unix_state_unlock(other);
2371  	kfree_skb(skb);
2372  pipe_err:
2373  	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2374  		send_sig(SIGPIPE, current, 0);
2375  	err = -EPIPE;
2376  out_err:
2377  	scm_destroy(&scm);
2378  	return sent ? : err;
2379  }
2380  
2381  static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2382  				  size_t len)
2383  {
2384  	int err;
2385  	struct sock *sk = sock->sk;
2386  
2387  	err = sock_error(sk);
2388  	if (err)
2389  		return err;
2390  
2391  	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2392  		return -ENOTCONN;
2393  
2394  	if (msg->msg_namelen)
2395  		msg->msg_namelen = 0;
2396  
2397  	return unix_dgram_sendmsg(sock, msg, len);
2398  }
2399  
2400  static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2401  				  size_t size, int flags)
2402  {
2403  	struct sock *sk = sock->sk;
2404  
2405  	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2406  		return -ENOTCONN;
2407  
2408  	return unix_dgram_recvmsg(sock, msg, size, flags);
2409  }
2410  
2411  static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2412  {
2413  	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2414  
2415  	if (addr) {
2416  		msg->msg_namelen = addr->len;
2417  		memcpy(msg->msg_name, addr->name, addr->len);
2418  	}
2419  }
2420  
2421  int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2422  			 int flags)
2423  {
2424  	struct scm_cookie scm;
2425  	struct socket *sock = sk->sk_socket;
2426  	struct unix_sock *u = unix_sk(sk);
2427  	struct sk_buff *skb, *last;
2428  	long timeo;
2429  	int skip;
2430  	int err;
2431  
2432  	err = -EOPNOTSUPP;
2433  	if (flags&MSG_OOB)
2434  		goto out;
2435  
2436  	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2437  
2438  	do {
2439  		mutex_lock(&u->iolock);
2440  
2441  		skip = sk_peek_offset(sk, flags);
2442  		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2443  					      &skip, &err, &last);
2444  		if (skb) {
2445  			if (!(flags & MSG_PEEK))
2446  				scm_stat_del(sk, skb);
2447  			break;
2448  		}
2449  
2450  		mutex_unlock(&u->iolock);
2451  
2452  		if (err != -EAGAIN)
2453  			break;
2454  	} while (timeo &&
2455  		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2456  					      &err, &timeo, last));
2457  
2458  	if (!skb) { /* implies iolock unlocked */
2459  		unix_state_lock(sk);
2460  		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2461  		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2462  		    (sk->sk_shutdown & RCV_SHUTDOWN))
2463  			err = 0;
2464  		unix_state_unlock(sk);
2465  		goto out;
2466  	}
2467  
2468  	if (wq_has_sleeper(&u->peer_wait))
2469  		wake_up_interruptible_sync_poll(&u->peer_wait,
2470  						EPOLLOUT | EPOLLWRNORM |
2471  						EPOLLWRBAND);
2472  
2473  	if (msg->msg_name) {
2474  		unix_copy_addr(msg, skb->sk);
2475  
2476  		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2477  						      msg->msg_name,
2478  						      &msg->msg_namelen);
2479  	}
2480  
2481  	if (size > skb->len - skip)
2482  		size = skb->len - skip;
2483  	else if (size < skb->len - skip)
2484  		msg->msg_flags |= MSG_TRUNC;
2485  
2486  	err = skb_copy_datagram_msg(skb, skip, msg, size);
2487  	if (err)
2488  		goto out_free;
2489  
2490  	if (sock_flag(sk, SOCK_RCVTSTAMP))
2491  		__sock_recv_timestamp(msg, sk, skb);
2492  
2493  	memset(&scm, 0, sizeof(scm));
2494  
2495  	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2496  	unix_set_secdata(&scm, skb);
2497  
2498  	if (!(flags & MSG_PEEK)) {
2499  		if (UNIXCB(skb).fp)
2500  			unix_detach_fds(&scm, skb);
2501  
2502  		sk_peek_offset_bwd(sk, skb->len);
2503  	} else {
2504  		/* It is questionable: on PEEK we could:
2505  		   - not return fds - good, but too simple 8)
2506  		   - return fds, and not return them on read (old strategy,
2507  		     apparently wrong)
2508  		   - clone fds (I chose it for now, it is the most universal
2509  		     solution)
2510  
2511  		   POSIX 1003.1g does not actually define this clearly
2512  		   at all. POSIX 1003.1g doesn't define a lot of things
2513  		   clearly however!
2514  
2515  		*/
2516  
2517  		sk_peek_offset_fwd(sk, size);
2518  
2519  		if (UNIXCB(skb).fp)
2520  			unix_peek_fds(&scm, skb);
2521  	}
2522  	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2523  
2524  	scm_recv_unix(sock, msg, &scm, flags);
2525  
2526  out_free:
2527  	skb_free_datagram(sk, skb);
2528  	mutex_unlock(&u->iolock);
2529  out:
2530  	return err;
2531  }
2532  
2533  static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2534  			      int flags)
2535  {
2536  	struct sock *sk = sock->sk;
2537  
2538  #ifdef CONFIG_BPF_SYSCALL
2539  	const struct proto *prot = READ_ONCE(sk->sk_prot);
2540  
2541  	if (prot != &unix_dgram_proto)
2542  		return prot->recvmsg(sk, msg, size, flags, NULL);
2543  #endif
2544  	return __unix_dgram_recvmsg(sk, msg, size, flags);
2545  }
2546  
2547  static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2548  {
2549  	struct unix_sock *u = unix_sk(sk);
2550  	struct sk_buff *skb;
2551  	int err;
2552  
2553  	mutex_lock(&u->iolock);
2554  	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2555  	mutex_unlock(&u->iolock);
2556  	if (!skb)
2557  		return err;
2558  
2559  	return recv_actor(sk, skb);
2560  }
2561  
2562  /*
2563   *	Sleep until more data has arrived. But check for races.
2564   */
2565  static long unix_stream_data_wait(struct sock *sk, long timeo,
2566  				  struct sk_buff *last, unsigned int last_len,
2567  				  bool freezable)
2568  {
2569  	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2570  	struct sk_buff *tail;
2571  	DEFINE_WAIT(wait);
2572  
2573  	unix_state_lock(sk);
2574  
2575  	for (;;) {
2576  		prepare_to_wait(sk_sleep(sk), &wait, state);
2577  
2578  		tail = skb_peek_tail(&sk->sk_receive_queue);
2579  		if (tail != last ||
2580  		    (tail && tail->len != last_len) ||
2581  		    sk->sk_err ||
2582  		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2583  		    signal_pending(current) ||
2584  		    !timeo)
2585  			break;
2586  
2587  		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2588  		unix_state_unlock(sk);
2589  		timeo = schedule_timeout(timeo);
2590  		unix_state_lock(sk);
2591  
2592  		if (sock_flag(sk, SOCK_DEAD))
2593  			break;
2594  
2595  		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2596  	}
2597  
2598  	finish_wait(sk_sleep(sk), &wait);
2599  	unix_state_unlock(sk);
2600  	return timeo;
2601  }
2602  
2603  static unsigned int unix_skb_len(const struct sk_buff *skb)
2604  {
2605  	return skb->len - UNIXCB(skb).consumed;
2606  }
2607  
2608  struct unix_stream_read_state {
2609  	int (*recv_actor)(struct sk_buff *, int, int,
2610  			  struct unix_stream_read_state *);
2611  	struct socket *socket;
2612  	struct msghdr *msg;
2613  	struct pipe_inode_info *pipe;
2614  	size_t size;
2615  	int flags;
2616  	unsigned int splice_flags;
2617  };
2618  
2619  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2620  static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2621  {
2622  	struct socket *sock = state->socket;
2623  	struct sock *sk = sock->sk;
2624  	struct unix_sock *u = unix_sk(sk);
2625  	int chunk = 1;
2626  	struct sk_buff *oob_skb;
2627  
2628  	mutex_lock(&u->iolock);
2629  	unix_state_lock(sk);
2630  	spin_lock(&sk->sk_receive_queue.lock);
2631  
2632  	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2633  		spin_unlock(&sk->sk_receive_queue.lock);
2634  		unix_state_unlock(sk);
2635  		mutex_unlock(&u->iolock);
2636  		return -EINVAL;
2637  	}
2638  
2639  	oob_skb = u->oob_skb;
2640  
2641  	if (!(state->flags & MSG_PEEK))
2642  		WRITE_ONCE(u->oob_skb, NULL);
2643  	else
2644  		skb_get(oob_skb);
2645  
2646  	spin_unlock(&sk->sk_receive_queue.lock);
2647  	unix_state_unlock(sk);
2648  
2649  	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2650  
2651  	if (!(state->flags & MSG_PEEK))
2652  		UNIXCB(oob_skb).consumed += 1;
2653  
2654  	consume_skb(oob_skb);
2655  
2656  	mutex_unlock(&u->iolock);
2657  
2658  	if (chunk < 0)
2659  		return -EFAULT;
2660  
2661  	state->msg->msg_flags |= MSG_OOB;
2662  	return 1;
2663  }
2664  
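/* Called from the stream receive loop to keep ordinary reads from crossing
 * the OOB mark.  Fully consumed skbs are unlinked (unless only peeking); if
 * the next skb is the pending OOB skb and some data was already copied, the
 * read stops before it.  Otherwise the OOB byte is returned inline when
 * SOCK_URGINLINE is set, or the OOB skb is skipped/unlinked so a normal read
 * sees only the in-band byte stream.
 */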
2665  static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2666  				  int flags, int copied)
2667  {
2668  	struct unix_sock *u = unix_sk(sk);
2669  
2670  	if (!unix_skb_len(skb)) {
2671  		struct sk_buff *unlinked_skb = NULL;
2672  
2673  		spin_lock(&sk->sk_receive_queue.lock);
2674  
2675  		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2676  			skb = NULL;
2677  		} else if (flags & MSG_PEEK) {
2678  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2679  		} else {
2680  			unlinked_skb = skb;
2681  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2682  			__skb_unlink(unlinked_skb, &sk->sk_receive_queue);
2683  		}
2684  
2685  		spin_unlock(&sk->sk_receive_queue.lock);
2686  
2687  		consume_skb(unlinked_skb);
2688  	} else {
2689  		struct sk_buff *unlinked_skb = NULL;
2690  
2691  		spin_lock(&sk->sk_receive_queue.lock);
2692  
2693  		if (skb == u->oob_skb) {
2694  			if (copied) {
2695  				skb = NULL;
2696  			} else if (!(flags & MSG_PEEK)) {
2697  				if (sock_flag(sk, SOCK_URGINLINE)) {
2698  					WRITE_ONCE(u->oob_skb, NULL);
2699  					consume_skb(skb);
2700  				} else {
2701  					__skb_unlink(skb, &sk->sk_receive_queue);
2702  					WRITE_ONCE(u->oob_skb, NULL);
2703  					unlinked_skb = skb;
2704  					skb = skb_peek(&sk->sk_receive_queue);
2705  				}
2706  			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2707  				skb = skb_peek_next(skb, &sk->sk_receive_queue);
2708  			}
2709  		}
2710  
2711  		spin_unlock(&sk->sk_receive_queue.lock);
2712  
2713  		if (unlinked_skb) {
2714  			WARN_ON_ONCE(skb_unref(unlinked_skb));
2715  			kfree_skb(unlinked_skb);
2716  		}
2717  	}
2718  	return skb;
2719  }
2720  #endif
2721  
2722  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2723  {
2724  	struct unix_sock *u = unix_sk(sk);
2725  	struct sk_buff *skb;
2726  	int err;
2727  
2728  	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2729  		return -ENOTCONN;
2730  
2731  	mutex_lock(&u->iolock);
2732  	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2733  	mutex_unlock(&u->iolock);
2734  	if (!skb)
2735  		return err;
2736  
2737  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
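	/* If the dequeued skb turns out to be the pending OOB skb, it is not
	 * handed to the actor: clear ->oob_skb, drop the skb and tell the
	 * caller to retry.
	 */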
2738  	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2739  		bool drop = false;
2740  
2741  		unix_state_lock(sk);
2742  
2743  		if (sock_flag(sk, SOCK_DEAD)) {
2744  			unix_state_unlock(sk);
2745  			kfree_skb(skb);
2746  			return -ECONNRESET;
2747  		}
2748  
2749  		spin_lock(&sk->sk_receive_queue.lock);
2750  		if (likely(skb == u->oob_skb)) {
2751  			WRITE_ONCE(u->oob_skb, NULL);
2752  			drop = true;
2753  		}
2754  		spin_unlock(&sk->sk_receive_queue.lock);
2755  
2756  		unix_state_unlock(sk);
2757  
2758  		if (drop) {
2759  			WARN_ON_ONCE(skb_unref(skb));
2760  			kfree_skb(skb);
2761  			return -EAGAIN;
2762  		}
2763  	}
2764  #endif
2765  
2766  	return recv_actor(sk, skb);
2767  }
2768  
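/* Common body of recvmsg() and splice_read() for stream/seqpacket sockets.
 * Walks the receive queue under u->iolock, feeding each chunk to
 * state->recv_actor, and stops early when the sender's credentials change
 * between skbs, when passed fds are encountered, or once enough data
 * (sk_rcvlowat / requested size) has been copied.
 */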
2769  static int unix_stream_read_generic(struct unix_stream_read_state *state,
2770  				    bool freezable)
2771  {
2772  	struct scm_cookie scm;
2773  	struct socket *sock = state->socket;
2774  	struct sock *sk = sock->sk;
2775  	struct unix_sock *u = unix_sk(sk);
2776  	int copied = 0;
2777  	int flags = state->flags;
2778  	int noblock = flags & MSG_DONTWAIT;
2779  	bool check_creds = false;
2780  	int target;
2781  	int err = 0;
2782  	long timeo;
2783  	int skip;
2784  	size_t size = state->size;
2785  	unsigned int last_len;
2786  
2787  	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2788  		err = -EINVAL;
2789  		goto out;
2790  	}
2791  
2792  	if (unlikely(flags & MSG_OOB)) {
2793  		err = -EOPNOTSUPP;
2794  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2795  		err = unix_stream_recv_urg(state);
2796  #endif
2797  		goto out;
2798  	}
2799  
2800  	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2801  	timeo = sock_rcvtimeo(sk, noblock);
2802  
2803  	memset(&scm, 0, sizeof(scm));
2804  
2805  	/* Lock the socket to prevent queue disordering
2806  	 * while we sleep copying data out to the message.
2807  	 */
2808  	mutex_lock(&u->iolock);
2809  
2810  	skip = max(sk_peek_offset(sk, flags), 0);
2811  
2812  	do {
2813  		struct sk_buff *skb, *last;
2814  		int chunk;
2815  
2816  redo:
2817  		unix_state_lock(sk);
2818  		if (sock_flag(sk, SOCK_DEAD)) {
2819  			err = -ECONNRESET;
2820  			goto unlock;
2821  		}
2822  		last = skb = skb_peek(&sk->sk_receive_queue);
2823  		last_len = last ? last->len : 0;
2824  
2825  again:
2826  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2827  		if (skb) {
2828  			skb = manage_oob(skb, sk, flags, copied);
2829  			if (!skb && copied) {
2830  				unix_state_unlock(sk);
2831  				break;
2832  			}
2833  		}
2834  #endif
2835  		if (skb == NULL) {
2836  			if (copied >= target)
2837  				goto unlock;
2838  
2839  			/*
2840  			 *	POSIX 1003.1g mandates this order.
2841  			 */
2842  
2843  			err = sock_error(sk);
2844  			if (err)
2845  				goto unlock;
2846  			if (sk->sk_shutdown & RCV_SHUTDOWN)
2847  				goto unlock;
2848  
2849  			unix_state_unlock(sk);
2850  			if (!timeo) {
2851  				err = -EAGAIN;
2852  				break;
2853  			}
2854  
2855  			mutex_unlock(&u->iolock);
2856  
2857  			timeo = unix_stream_data_wait(sk, timeo, last,
2858  						      last_len, freezable);
2859  
2860  			if (signal_pending(current)) {
2861  				err = sock_intr_errno(timeo);
2862  				scm_destroy(&scm);
2863  				goto out;
2864  			}
2865  
2866  			mutex_lock(&u->iolock);
2867  			goto redo;
2868  unlock:
2869  			unix_state_unlock(sk);
2870  			break;
2871  		}
2872  
2873  		while (skip >= unix_skb_len(skb)) {
2874  			skip -= unix_skb_len(skb);
2875  			last = skb;
2876  			last_len = skb->len;
2877  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2878  			if (!skb)
2879  				goto again;
2880  		}
2881  
2882  		unix_state_unlock(sk);
2883  
2884  		if (check_creds) {
2885  			/* Never glue messages from different writers */
2886  			if (!unix_skb_scm_eq(skb, &scm))
2887  				break;
2888  		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2889  			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2890  			/* Copy credentials */
2891  			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2892  			unix_set_secdata(&scm, skb);
2893  			check_creds = true;
2894  		}
2895  
2896  		/* Copy address just once */
2897  		if (state->msg && state->msg->msg_name) {
2898  			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2899  					 state->msg->msg_name);
2900  			unix_copy_addr(state->msg, skb->sk);
2901  
2902  			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2903  							      state->msg->msg_name,
2904  							      &state->msg->msg_namelen);
2905  
2906  			sunaddr = NULL;
2907  		}
2908  
2909  		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2910  		chunk = state->recv_actor(skb, skip, chunk, state);
2911  		if (chunk < 0) {
2912  			if (copied == 0)
2913  				copied = -EFAULT;
2914  			break;
2915  		}
2916  		copied += chunk;
2917  		size -= chunk;
2918  
2919  		/* Mark read part of skb as used */
2920  		if (!(flags & MSG_PEEK)) {
2921  			UNIXCB(skb).consumed += chunk;
2922  
2923  			sk_peek_offset_bwd(sk, chunk);
2924  
2925  			if (UNIXCB(skb).fp) {
2926  				scm_stat_del(sk, skb);
2927  				unix_detach_fds(&scm, skb);
2928  			}
2929  
2930  			if (unix_skb_len(skb))
2931  				break;
2932  
2933  			skb_unlink(skb, &sk->sk_receive_queue);
2934  			consume_skb(skb);
2935  
2936  			if (scm.fp)
2937  				break;
2938  		} else {
2939  			/* It is questionable, see note in unix_dgram_recvmsg.
2940  			 */
2941  			if (UNIXCB(skb).fp)
2942  				unix_peek_fds(&scm, skb);
2943  
2944  			sk_peek_offset_fwd(sk, chunk);
2945  
2946  			if (UNIXCB(skb).fp)
2947  				break;
2948  
2949  			skip = 0;
2950  			last = skb;
2951  			last_len = skb->len;
2952  			unix_state_lock(sk);
2953  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2954  			if (skb)
2955  				goto again;
2956  			unix_state_unlock(sk);
2957  			break;
2958  		}
2959  	} while (size);
2960  
2961  	mutex_unlock(&u->iolock);
2962  	if (state->msg)
2963  		scm_recv_unix(sock, state->msg, &scm, flags);
2964  	else
2965  		scm_destroy(&scm);
2966  out:
2967  	return copied ? : err;
2968  }
2969  
2970  static int unix_stream_read_actor(struct sk_buff *skb,
2971  				  int skip, int chunk,
2972  				  struct unix_stream_read_state *state)
2973  {
2974  	int ret;
2975  
2976  	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2977  				    state->msg, chunk);
2978  	return ret ?: chunk;
2979  }
2980  
2981  int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2982  			  size_t size, int flags)
2983  {
2984  	struct unix_stream_read_state state = {
2985  		.recv_actor = unix_stream_read_actor,
2986  		.socket = sk->sk_socket,
2987  		.msg = msg,
2988  		.size = size,
2989  		.flags = flags
2990  	};
2991  
2992  	return unix_stream_read_generic(&state, true);
2993  }
2994  
2995  static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2996  			       size_t size, int flags)
2997  {
2998  	struct unix_stream_read_state state = {
2999  		.recv_actor = unix_stream_read_actor,
3000  		.socket = sock,
3001  		.msg = msg,
3002  		.size = size,
3003  		.flags = flags
3004  	};
3005  
3006  #ifdef CONFIG_BPF_SYSCALL
3007  	struct sock *sk = sock->sk;
3008  	const struct proto *prot = READ_ONCE(sk->sk_prot);
3009  
3010  	if (prot != &unix_stream_proto)
3011  		return prot->recvmsg(sk, msg, size, flags, NULL);
3012  #endif
3013  	return unix_stream_read_generic(&state, true);
3014  }
3015  
3016  static int unix_stream_splice_actor(struct sk_buff *skb,
3017  				    int skip, int chunk,
3018  				    struct unix_stream_read_state *state)
3019  {
3020  	return skb_splice_bits(skb, state->socket->sk,
3021  			       UNIXCB(skb).consumed + skip,
3022  			       state->pipe, chunk, state->splice_flags);
3023  }
3024  
3025  static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3026  				       struct pipe_inode_info *pipe,
3027  				       size_t size, unsigned int flags)
3028  {
3029  	struct unix_stream_read_state state = {
3030  		.recv_actor = unix_stream_splice_actor,
3031  		.socket = sock,
3032  		.pipe = pipe,
3033  		.size = size,
3034  		.splice_flags = flags,
3035  	};
3036  
3037  	if (unlikely(*ppos))
3038  		return -ESPIPE;
3039  
3040  	if (sock->file->f_flags & O_NONBLOCK ||
3041  	    flags & SPLICE_F_NONBLOCK)
3042  		state.flags = MSG_DONTWAIT;
3043  
3044  	return unix_stream_read_generic(&state, false);
3045  }
3046  
3047  static int unix_shutdown(struct socket *sock, int mode)
3048  {
3049  	struct sock *sk = sock->sk;
3050  	struct sock *other;
3051  
3052  	if (mode < SHUT_RD || mode > SHUT_RDWR)
3053  		return -EINVAL;
3054  	/* This maps:
3055  	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3056  	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3057  	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3058  	 */
3059  	++mode;
3060  
3061  	unix_state_lock(sk);
3062  	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3063  	other = unix_peer(sk);
3064  	if (other)
3065  		sock_hold(other);
3066  	unix_state_unlock(sk);
3067  	sk->sk_state_change(sk);
3068  
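	/* For connection-oriented sockets, mirror the shutdown onto the peer
	 * with the directions swapped: our RCV_SHUTDOWN becomes the peer's
	 * SEND_SHUTDOWN and vice versa, and the peer is woken up accordingly.
	 */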
3069  	if (other &&
3070  		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3071  
3072  		int peer_mode = 0;
3073  		const struct proto *prot = READ_ONCE(other->sk_prot);
3074  
3075  		if (prot->unhash)
3076  			prot->unhash(other);
3077  		if (mode&RCV_SHUTDOWN)
3078  			peer_mode |= SEND_SHUTDOWN;
3079  		if (mode&SEND_SHUTDOWN)
3080  			peer_mode |= RCV_SHUTDOWN;
3081  		unix_state_lock(other);
3082  		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3083  		unix_state_unlock(other);
3084  		other->sk_state_change(other);
3085  		if (peer_mode == SHUTDOWN_MASK)
3086  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3087  		else if (peer_mode & RCV_SHUTDOWN)
3088  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3089  	}
3090  	if (other)
3091  		sock_put(other);
3092  
3093  	return 0;
3094  }
3095  
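/* SIOCINQ: for stream and seqpacket sockets this is the total number of
 * unread bytes in the receive queue; for datagram sockets it is the size of
 * the next datagram.
 */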
3096  long unix_inq_len(struct sock *sk)
3097  {
3098  	struct sk_buff *skb;
3099  	long amount = 0;
3100  
3101  	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3102  		return -EINVAL;
3103  
3104  	spin_lock(&sk->sk_receive_queue.lock);
3105  	if (sk->sk_type == SOCK_STREAM ||
3106  	    sk->sk_type == SOCK_SEQPACKET) {
3107  		skb_queue_walk(&sk->sk_receive_queue, skb)
3108  			amount += unix_skb_len(skb);
3109  	} else {
3110  		skb = skb_peek(&sk->sk_receive_queue);
3111  		if (skb)
3112  			amount = skb->len;
3113  	}
3114  	spin_unlock(&sk->sk_receive_queue.lock);
3115  
3116  	return amount;
3117  }
3118  EXPORT_SYMBOL_GPL(unix_inq_len);
3119  
3120  long unix_outq_len(struct sock *sk)
3121  {
3122  	return sk_wmem_alloc_get(sk);
3123  }
3124  EXPORT_SYMBOL_GPL(unix_outq_len);
3125  
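/* SIOCUNIXFILE: hand out an O_CLOEXEC, O_PATH fd for the filesystem object
 * this socket is bound to.  Requires CAP_NET_ADMIN in the socket's network
 * namespace and fails for unbound or abstract sockets.
 */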
3126  static int unix_open_file(struct sock *sk)
3127  {
3128  	struct path path;
3129  	struct file *f;
3130  	int fd;
3131  
3132  	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3133  		return -EPERM;
3134  
3135  	if (!smp_load_acquire(&unix_sk(sk)->addr))
3136  		return -ENOENT;
3137  
3138  	path = unix_sk(sk)->path;
3139  	if (!path.dentry)
3140  		return -ENOENT;
3141  
3142  	path_get(&path);
3143  
3144  	fd = get_unused_fd_flags(O_CLOEXEC);
3145  	if (fd < 0)
3146  		goto out;
3147  
3148  	f = dentry_open(&path, O_PATH, current_cred());
3149  	if (IS_ERR(f)) {
3150  		put_unused_fd(fd);
3151  		fd = PTR_ERR(f);
3152  		goto out;
3153  	}
3154  
3155  	fd_install(fd, f);
3156  out:
3157  	path_put(&path);
3158  
3159  	return fd;
3160  }
3161  
3162  static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3163  {
3164  	struct sock *sk = sock->sk;
3165  	long amount = 0;
3166  	int err;
3167  
3168  	switch (cmd) {
3169  	case SIOCOUTQ:
3170  		amount = unix_outq_len(sk);
3171  		err = put_user(amount, (int __user *)arg);
3172  		break;
3173  	case SIOCINQ:
3174  		amount = unix_inq_len(sk);
3175  		if (amount < 0)
3176  			err = amount;
3177  		else
3178  			err = put_user(amount, (int __user *)arg);
3179  		break;
3180  	case SIOCUNIXFILE:
3181  		err = unix_open_file(sk);
3182  		break;
3183  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3184  	case SIOCATMARK:
3185  		{
3186  			struct unix_sock *u = unix_sk(sk);
3187  			struct sk_buff *skb;
3188  			int answ = 0;
3189  
3190  			mutex_lock(&u->iolock);
3191  
3192  			skb = skb_peek(&sk->sk_receive_queue);
3193  			if (skb) {
3194  				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3195  
3196  				if (skb == oob_skb ||
3197  				    (!oob_skb && !unix_skb_len(skb)))
3198  					answ = 1;
3199  			}
3200  
3201  			mutex_unlock(&u->iolock);
3202  
3203  			err = put_user(answ, (int __user *)arg);
3204  		}
3205  		break;
3206  #endif
3207  	default:
3208  		err = -ENOIOCTLCMD;
3209  		break;
3210  	}
3211  	return err;
3212  }
3213  
3214  #ifdef CONFIG_COMPAT
3215  static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3216  {
3217  	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3218  }
3219  #endif
3220  
3221  static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3222  {
3223  	struct sock *sk = sock->sk;
3224  	unsigned char state;
3225  	__poll_t mask;
3226  	u8 shutdown;
3227  
3228  	sock_poll_wait(file, sock, wait);
3229  	mask = 0;
3230  	shutdown = READ_ONCE(sk->sk_shutdown);
3231  	state = READ_ONCE(sk->sk_state);
3232  
3233  	/* exceptional events? */
3234  	if (READ_ONCE(sk->sk_err))
3235  		mask |= EPOLLERR;
3236  	if (shutdown == SHUTDOWN_MASK)
3237  		mask |= EPOLLHUP;
3238  	if (shutdown & RCV_SHUTDOWN)
3239  		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3240  
3241  	/* readable? */
3242  	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3243  		mask |= EPOLLIN | EPOLLRDNORM;
3244  	if (sk_is_readable(sk))
3245  		mask |= EPOLLIN | EPOLLRDNORM;
3246  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3247  	if (READ_ONCE(unix_sk(sk)->oob_skb))
3248  		mask |= EPOLLPRI;
3249  #endif
3250  
3251  	/* Connection-based need to check for termination and startup */
3252  	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3253  	    state == TCP_CLOSE)
3254  		mask |= EPOLLHUP;
3255  
3256  	/*
3257  	 * We set writable also when the other side has shut down the
3258  	 * connection. This prevents stuck sockets.
3259  	 */
3260  	if (unix_writable(sk, state))
3261  		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3262  
3263  	return mask;
3264  }
3265  
3266  static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3267  				    poll_table *wait)
3268  {
3269  	struct sock *sk = sock->sk, *other;
3270  	unsigned int writable;
3271  	unsigned char state;
3272  	__poll_t mask;
3273  	u8 shutdown;
3274  
3275  	sock_poll_wait(file, sock, wait);
3276  	mask = 0;
3277  	shutdown = READ_ONCE(sk->sk_shutdown);
3278  	state = READ_ONCE(sk->sk_state);
3279  
3280  	/* exceptional events? */
3281  	if (READ_ONCE(sk->sk_err) ||
3282  	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3283  		mask |= EPOLLERR |
3284  			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3285  
3286  	if (shutdown & RCV_SHUTDOWN)
3287  		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3288  	if (shutdown == SHUTDOWN_MASK)
3289  		mask |= EPOLLHUP;
3290  
3291  	/* readable? */
3292  	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3293  		mask |= EPOLLIN | EPOLLRDNORM;
3294  	if (sk_is_readable(sk))
3295  		mask |= EPOLLIN | EPOLLRDNORM;
3296  
3297  	/* Connection-based need to check for termination and startup */
3298  	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3299  		mask |= EPOLLHUP;
3300  
3301  	/* No write status requested, avoid expensive OUT tests. */
3302  	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3303  		return mask;
3304  
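	/* A connected datagram/seqpacket sender is only writable while the
	 * peer's receive queue has room; otherwise register on the peer's wake
	 * queue so EPOLLOUT is raised once space frees up.
	 */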
3305  	writable = unix_writable(sk, state);
3306  	if (writable) {
3307  		unix_state_lock(sk);
3308  
3309  		other = unix_peer(sk);
3310  		if (other && unix_peer(other) != sk &&
3311  		    unix_recvq_full_lockless(other) &&
3312  		    unix_dgram_peer_wake_me(sk, other))
3313  			writable = 0;
3314  
3315  		unix_state_unlock(sk);
3316  	}
3317  
3318  	if (writable)
3319  		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3320  	else
3321  		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3322  
3323  	return mask;
3324  }
3325  
3326  #ifdef CONFIG_PROC_FS
3327  
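/* The seq_file position encodes both the hash bucket and the position
 * within it: the bucket index lives in the upper bits and a 1-based offset
 * into the bucket in the low BUCKET_SPACE bits.
 */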
3328  #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3329  
3330  #define get_bucket(x) ((x) >> BUCKET_SPACE)
3331  #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3332  #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3333  
3334  static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3335  {
3336  	unsigned long offset = get_offset(*pos);
3337  	unsigned long bucket = get_bucket(*pos);
3338  	unsigned long count = 0;
3339  	struct sock *sk;
3340  
3341  	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3342  	     sk; sk = sk_next(sk)) {
3343  		if (++count == offset)
3344  			break;
3345  	}
3346  
3347  	return sk;
3348  }
3349  
3350  static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3351  {
3352  	unsigned long bucket = get_bucket(*pos);
3353  	struct net *net = seq_file_net(seq);
3354  	struct sock *sk;
3355  
3356  	while (bucket < UNIX_HASH_SIZE) {
3357  		spin_lock(&net->unx.table.locks[bucket]);
3358  
3359  		sk = unix_from_bucket(seq, pos);
3360  		if (sk)
3361  			return sk;
3362  
3363  		spin_unlock(&net->unx.table.locks[bucket]);
3364  
3365  		*pos = set_bucket_offset(++bucket, 1);
3366  	}
3367  
3368  	return NULL;
3369  }
3370  
3371  static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3372  				  loff_t *pos)
3373  {
3374  	unsigned long bucket = get_bucket(*pos);
3375  
3376  	sk = sk_next(sk);
3377  	if (sk)
3378  		return sk;
3379  
3380  
3381  	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3382  
3383  	*pos = set_bucket_offset(++bucket, 1);
3384  
3385  	return unix_get_first(seq, pos);
3386  }
3387  
3388  static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3389  {
3390  	if (!*pos)
3391  		return SEQ_START_TOKEN;
3392  
3393  	return unix_get_first(seq, pos);
3394  }
3395  
3396  static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3397  {
3398  	++*pos;
3399  
3400  	if (v == SEQ_START_TOKEN)
3401  		return unix_get_first(seq, pos);
3402  
3403  	return unix_get_next(seq, v, pos);
3404  }
3405  
3406  static void unix_seq_stop(struct seq_file *seq, void *v)
3407  {
3408  	struct sock *sk = v;
3409  
3410  	if (sk)
3411  		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3412  }
3413  
3414  static int unix_seq_show(struct seq_file *seq, void *v)
3415  {
3416  
3417  	if (v == SEQ_START_TOKEN)
3418  		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3419  			 "Inode Path\n");
3420  	else {
3421  		struct sock *s = v;
3422  		struct unix_sock *u = unix_sk(s);
3423  		unix_state_lock(s);
3424  
3425  		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3426  			s,
3427  			refcount_read(&s->sk_refcnt),
3428  			0,
3429  			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3430  			s->sk_type,
3431  			s->sk_socket ?
3432  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3433  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3434  			sock_i_ino(s));
3435  
3436  		if (u->addr) {	/* under a hash table lock here */
3437  			int i, len;
3438  			seq_putc(seq, ' ');
3439  
3440  			i = 0;
3441  			len = u->addr->len -
3442  				offsetof(struct sockaddr_un, sun_path);
3443  			if (u->addr->name->sun_path[0]) {
3444  				len--;
3445  			} else {
3446  				seq_putc(seq, '@');
3447  				i++;
3448  			}
3449  			for ( ; i < len; i++)
3450  				seq_putc(seq, u->addr->name->sun_path[i] ?:
3451  					 '@');
3452  		}
3453  		unix_state_unlock(s);
3454  		seq_putc(seq, '\n');
3455  	}
3456  
3457  	return 0;
3458  }
3459  
3460  static const struct seq_operations unix_seq_ops = {
3461  	.start  = unix_seq_start,
3462  	.next   = unix_seq_next,
3463  	.stop   = unix_seq_stop,
3464  	.show   = unix_seq_show,
3465  };
3466  
3467  #ifdef CONFIG_BPF_SYSCALL
3468  struct bpf_unix_iter_state {
3469  	struct seq_net_private p;
3470  	unsigned int cur_sk;
3471  	unsigned int end_sk;
3472  	unsigned int max_sk;
3473  	struct sock **batch;
3474  	bool st_bucket_done;
3475  };
3476  
3477  struct bpf_iter__unix {
3478  	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3479  	__bpf_md_ptr(struct unix_sock *, unix_sk);
3480  	uid_t uid __aligned(8);
3481  };
3482  
3483  static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3484  			      struct unix_sock *unix_sk, uid_t uid)
3485  {
3486  	struct bpf_iter__unix ctx;
3487  
3488  	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3489  	ctx.meta = meta;
3490  	ctx.unix_sk = unix_sk;
3491  	ctx.uid = uid;
3492  	return bpf_iter_run_prog(prog, &ctx);
3493  }
3494  
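/* Take a reference on every socket in the current hash bucket so the BPF
 * program can run over them with the bucket lock dropped.  The return value
 * is the number of sockets actually in the bucket; if that exceeds the batch
 * array, the caller resizes the batch and retries once.
 */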
3495  static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3496  
3497  {
3498  	struct bpf_unix_iter_state *iter = seq->private;
3499  	unsigned int expected = 1;
3500  	struct sock *sk;
3501  
3502  	sock_hold(start_sk);
3503  	iter->batch[iter->end_sk++] = start_sk;
3504  
3505  	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3506  		if (iter->end_sk < iter->max_sk) {
3507  			sock_hold(sk);
3508  			iter->batch[iter->end_sk++] = sk;
3509  		}
3510  
3511  		expected++;
3512  	}
3513  
3514  	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3515  
3516  	return expected;
3517  }
3518  
3519  static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3520  {
3521  	while (iter->cur_sk < iter->end_sk)
3522  		sock_put(iter->batch[iter->cur_sk++]);
3523  }
3524  
3525  static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3526  				       unsigned int new_batch_sz)
3527  {
3528  	struct sock **new_batch;
3529  
3530  	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3531  			     GFP_USER | __GFP_NOWARN);
3532  	if (!new_batch)
3533  		return -ENOMEM;
3534  
3535  	bpf_iter_unix_put_batch(iter);
3536  	kvfree(iter->batch);
3537  	iter->batch = new_batch;
3538  	iter->max_sk = new_batch_sz;
3539  
3540  	return 0;
3541  }
3542  
3543  static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3544  					loff_t *pos)
3545  {
3546  	struct bpf_unix_iter_state *iter = seq->private;
3547  	unsigned int expected;
3548  	bool resized = false;
3549  	struct sock *sk;
3550  
3551  	if (iter->st_bucket_done)
3552  		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3553  
3554  again:
3555  	/* Get a new batch */
3556  	iter->cur_sk = 0;
3557  	iter->end_sk = 0;
3558  
3559  	sk = unix_get_first(seq, pos);
3560  	if (!sk)
3561  		return NULL; /* Done */
3562  
3563  	expected = bpf_iter_unix_hold_batch(seq, sk);
3564  
3565  	if (iter->end_sk == expected) {
3566  		iter->st_bucket_done = true;
3567  		return sk;
3568  	}
3569  
3570  	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3571  		resized = true;
3572  		goto again;
3573  	}
3574  
3575  	return sk;
3576  }
3577  
3578  static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3579  {
3580  	if (!*pos)
3581  		return SEQ_START_TOKEN;
3582  
3583  	/* bpf iter does not support lseek, so it always
3584  	 * continues from where it was stop()-ped.
3585  	 */
3586  	return bpf_iter_unix_batch(seq, pos);
3587  }
3588  
3589  static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3590  {
3591  	struct bpf_unix_iter_state *iter = seq->private;
3592  	struct sock *sk;
3593  
3594  	/* Whenever seq_next() is called, the sk at iter->cur_sk is
3595  	 * done with seq_show(), so drop its reference and advance
3596  	 * to the next sk in the batch.
3597  	 */
3598  	if (iter->cur_sk < iter->end_sk)
3599  		sock_put(iter->batch[iter->cur_sk++]);
3600  
3601  	++*pos;
3602  
3603  	if (iter->cur_sk < iter->end_sk)
3604  		sk = iter->batch[iter->cur_sk];
3605  	else
3606  		sk = bpf_iter_unix_batch(seq, pos);
3607  
3608  	return sk;
3609  }
3610  
3611  static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3612  {
3613  	struct bpf_iter_meta meta;
3614  	struct bpf_prog *prog;
3615  	struct sock *sk = v;
3616  	uid_t uid;
3617  	bool slow;
3618  	int ret;
3619  
3620  	if (v == SEQ_START_TOKEN)
3621  		return 0;
3622  
3623  	slow = lock_sock_fast(sk);
3624  
3625  	if (unlikely(sk_unhashed(sk))) {
3626  		ret = SEQ_SKIP;
3627  		goto unlock;
3628  	}
3629  
3630  	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3631  	meta.seq = seq;
3632  	prog = bpf_iter_get_info(&meta, false);
3633  	ret = unix_prog_seq_show(prog, &meta, v, uid);
3634  unlock:
3635  	unlock_sock_fast(sk, slow);
3636  	return ret;
3637  }
3638  
3639  static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3640  {
3641  	struct bpf_unix_iter_state *iter = seq->private;
3642  	struct bpf_iter_meta meta;
3643  	struct bpf_prog *prog;
3644  
3645  	if (!v) {
3646  		meta.seq = seq;
3647  		prog = bpf_iter_get_info(&meta, true);
3648  		if (prog)
3649  			(void)unix_prog_seq_show(prog, &meta, v, 0);
3650  	}
3651  
3652  	if (iter->cur_sk < iter->end_sk)
3653  		bpf_iter_unix_put_batch(iter);
3654  }
3655  
3656  static const struct seq_operations bpf_iter_unix_seq_ops = {
3657  	.start	= bpf_iter_unix_seq_start,
3658  	.next	= bpf_iter_unix_seq_next,
3659  	.stop	= bpf_iter_unix_seq_stop,
3660  	.show	= bpf_iter_unix_seq_show,
3661  };
3662  #endif
3663  #endif
3664  
3665  static const struct net_proto_family unix_family_ops = {
3666  	.family = PF_UNIX,
3667  	.create = unix_create,
3668  	.owner	= THIS_MODULE,
3669  };
3670  
3671  
3672  static int __net_init unix_net_init(struct net *net)
3673  {
3674  	int i;
3675  
3676  	net->unx.sysctl_max_dgram_qlen = 10;
3677  	if (unix_sysctl_register(net))
3678  		goto out;
3679  
3680  #ifdef CONFIG_PROC_FS
3681  	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3682  			     sizeof(struct seq_net_private)))
3683  		goto err_sysctl;
3684  #endif
3685  
3686  	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3687  					      sizeof(spinlock_t), GFP_KERNEL);
3688  	if (!net->unx.table.locks)
3689  		goto err_proc;
3690  
3691  	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3692  						sizeof(struct hlist_head),
3693  						GFP_KERNEL);
3694  	if (!net->unx.table.buckets)
3695  		goto free_locks;
3696  
3697  	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3698  		spin_lock_init(&net->unx.table.locks[i]);
3699  		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3700  		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3701  	}
3702  
3703  	return 0;
3704  
3705  free_locks:
3706  	kvfree(net->unx.table.locks);
3707  err_proc:
3708  #ifdef CONFIG_PROC_FS
3709  	remove_proc_entry("unix", net->proc_net);
3710  err_sysctl:
3711  #endif
3712  	unix_sysctl_unregister(net);
3713  out:
3714  	return -ENOMEM;
3715  }
3716  
3717  static void __net_exit unix_net_exit(struct net *net)
3718  {
3719  	kvfree(net->unx.table.buckets);
3720  	kvfree(net->unx.table.locks);
3721  	unix_sysctl_unregister(net);
3722  	remove_proc_entry("unix", net->proc_net);
3723  }
3724  
3725  static struct pernet_operations unix_net_ops = {
3726  	.init = unix_net_init,
3727  	.exit = unix_net_exit,
3728  };
3729  
3730  #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3731  DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3732  		     struct unix_sock *unix_sk, uid_t uid)
3733  
3734  #define INIT_BATCH_SZ 16
3735  
3736  static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3737  {
3738  	struct bpf_unix_iter_state *iter = priv_data;
3739  	int err;
3740  
3741  	err = bpf_iter_init_seq_net(priv_data, aux);
3742  	if (err)
3743  		return err;
3744  
3745  	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3746  	if (err) {
3747  		bpf_iter_fini_seq_net(priv_data);
3748  		return err;
3749  	}
3750  
3751  	return 0;
3752  }
3753  
3754  static void bpf_iter_fini_unix(void *priv_data)
3755  {
3756  	struct bpf_unix_iter_state *iter = priv_data;
3757  
3758  	bpf_iter_fini_seq_net(priv_data);
3759  	kvfree(iter->batch);
3760  }
3761  
3762  static const struct bpf_iter_seq_info unix_seq_info = {
3763  	.seq_ops		= &bpf_iter_unix_seq_ops,
3764  	.init_seq_private	= bpf_iter_init_unix,
3765  	.fini_seq_private	= bpf_iter_fini_unix,
3766  	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3767  };
3768  
3769  static const struct bpf_func_proto *
3770  bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3771  			     const struct bpf_prog *prog)
3772  {
3773  	switch (func_id) {
3774  	case BPF_FUNC_setsockopt:
3775  		return &bpf_sk_setsockopt_proto;
3776  	case BPF_FUNC_getsockopt:
3777  		return &bpf_sk_getsockopt_proto;
3778  	default:
3779  		return NULL;
3780  	}
3781  }
3782  
3783  static struct bpf_iter_reg unix_reg_info = {
3784  	.target			= "unix",
3785  	.ctx_arg_info_size	= 1,
3786  	.ctx_arg_info		= {
3787  		{ offsetof(struct bpf_iter__unix, unix_sk),
3788  		  PTR_TO_BTF_ID_OR_NULL },
3789  	},
3790  	.get_func_proto         = bpf_iter_unix_get_func_proto,
3791  	.seq_info		= &unix_seq_info,
3792  };
3793  
3794  static void __init bpf_iter_register(void)
3795  {
3796  	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3797  	if (bpf_iter_reg_target(&unix_reg_info))
3798  		pr_warn("Warning: could not register bpf iterator unix\n");
3799  }
3800  #endif
3801  
3802  static int __init af_unix_init(void)
3803  {
3804  	int i, rc = -1;
3805  
3806  	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3807  
3808  	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3809  		spin_lock_init(&bsd_socket_locks[i]);
3810  		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3811  	}
3812  
3813  	rc = proto_register(&unix_dgram_proto, 1);
3814  	if (rc != 0) {
3815  		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3816  		goto out;
3817  	}
3818  
3819  	rc = proto_register(&unix_stream_proto, 1);
3820  	if (rc != 0) {
3821  		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3822  		proto_unregister(&unix_dgram_proto);
3823  		goto out;
3824  	}
3825  
3826  	sock_register(&unix_family_ops);
3827  	register_pernet_subsys(&unix_net_ops);
3828  	unix_bpf_build_proto();
3829  
3830  #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3831  	bpf_iter_register();
3832  #endif
3833  
3834  out:
3835  	return rc;
3836  }
3837  
3838  /* Later than subsys_initcall() because we depend on stuff initialised there */
3839  fs_initcall(af_unix_init);
3840