xref: /linux/net/unix/af_unix.c (revision fe9d25b46bc744d73491fb68f1c322910dc70437)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * NET4:	Implementation of BSD Unix domain sockets.
4   *
5   * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6   *
7   * Fixes:
8   *		Linus Torvalds	:	Assorted bug cures.
9   *		Niibe Yutaka	:	async I/O support.
10   *		Carsten Paeth	:	PF_UNIX check, address fixes.
11   *		Alan Cox	:	Limit size of allocated blocks.
12   *		Alan Cox	:	Fixed the stupid socketpair bug.
13   *		Alan Cox	:	BSD compatibility fine tuning.
14   *		Alan Cox	:	Fixed a bug in connect when interrupted.
15   *		Alan Cox	:	Sorted out a proper draft version of
16   *					file descriptor passing hacked up from
17   *					Mike Shaver's work.
18   *		Marty Leisner	:	Fixes to fd passing
19   *		Nick Nevin	:	recvmsg bugfix.
20   *		Alan Cox	:	Started proper garbage collector
21   *		Heiko Eißfeldt	:	Missing verify_area check
22   *		Alan Cox	:	Started POSIXisms
23   *		Andreas Schwab	:	Replace inode by dentry for proper
24   *					reference counting
25   *		Kirk Petersen	:	Made this a module
26   *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27   *					Lots of bug fixes.
28   *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29   *					by above two patches.
30   *	     Andrea Arcangeli	:	If possible we block in connect(2)
31   *					if the max backlog of the listen socket
32   *					has been reached. This won't break
33   *					old apps and it will avoid a huge amount
34   *					of socks being hashed (this is for unix_gc()
35   *					performance reasons).
36   *					Security fix that limits the max
37   *					number of socks to 2*max_files and
38   *					the number of skb queueable in the
39   *					dgram receiver.
40   *		Artur Skawina   :	Hash function optimizations
41   *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
42   *	      Malcolm Beattie   :	Set peercred for socketpair
43   *	     Michal Ostrowski   :       Module initialization cleanup.
44   *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45   *	     				the core infrastructure is doing that
46   *	     				for all net proto families now (2.5.69+)
47   *
48   * Known differences from reference BSD that was tested:
49   *
50   *	[TO FIX]
51   *	ECONNREFUSED is not returned from one end of a connected() socket to the
52   *		other the moment one end closes.
53   *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54   *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55   *	[NOT TO FIX]
56   *	accept() returns a path name even if the connecting socket has closed
57   *		in the meantime (BSD loses the path and gives up).
58   *	accept() returns 0 length path for an unbound connector. BSD returns 16
59   *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60   *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61   *	BSD af_unix apparently has connect forgetting to block properly.
62   *		(need to check this with the POSIX spec in detail)
63   *
64   * Differences from 2.0.0-11-... (ANK)
65   *	Bug fixes and improvements.
66   *		- client shutdown killed server socket.
67   *		- removed all useless cli/sti pairs.
68   *
69   *	Semantic changes/extensions.
70   *		- generic control message passing.
71   *		- SCM_CREDENTIALS control message.
72   *		- "Abstract" (not FS based) socket bindings.
73   *		  Abstract names are sequences of bytes (not zero terminated)
74   *		  starting with a 0 byte, so that this name space does not
75   *		  intersect with BSD names.
76   */
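
/* For illustration only: a minimal userspace sketch of binding in the
 * abstract namespace described above (the name "example" is arbitrary).
 * The address is NOT a C string; its length is whatever addrlen says:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	sun.sun_path[0] = '\0';				// abstract namespace marker
 *	memcpy(sun.sun_path + 1, "example", 7);		// name bytes, no trailing NUL needed
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */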
77  
78  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79  
80  #include <linux/module.h>
81  #include <linux/kernel.h>
82  #include <linux/signal.h>
83  #include <linux/sched/signal.h>
84  #include <linux/errno.h>
85  #include <linux/string.h>
86  #include <linux/stat.h>
87  #include <linux/dcache.h>
88  #include <linux/namei.h>
89  #include <linux/socket.h>
90  #include <linux/un.h>
91  #include <linux/fcntl.h>
92  #include <linux/filter.h>
93  #include <linux/termios.h>
94  #include <linux/sockios.h>
95  #include <linux/net.h>
96  #include <linux/in.h>
97  #include <linux/fs.h>
98  #include <linux/slab.h>
99  #include <linux/uaccess.h>
100  #include <linux/skbuff.h>
101  #include <linux/netdevice.h>
102  #include <net/net_namespace.h>
103  #include <net/sock.h>
104  #include <net/tcp_states.h>
105  #include <net/af_unix.h>
106  #include <linux/proc_fs.h>
107  #include <linux/seq_file.h>
108  #include <net/scm.h>
109  #include <linux/init.h>
110  #include <linux/poll.h>
111  #include <linux/rtnetlink.h>
112  #include <linux/mount.h>
113  #include <net/checksum.h>
114  #include <linux/security.h>
115  #include <linux/freezer.h>
116  #include <linux/file.h>
117  #include <linux/btf_ids.h>
118  
119  #include "scm.h"
120  
121  static atomic_long_t unix_nr_socks;
122  static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123  static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124  
125  /* SMP locking strategy:
126   *    hash table is protected with spinlock.
127   *    each socket state is protected by separate spinlock.
128   */
129  
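/* Bucket layout note (as the helpers below suggest): unbound and
 * pathname (BSD) sockets hash into the lower part of the table,
 * [0, UNIX_HASH_MOD], while abstract sockets hash into
 * [UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1], so the two namespaces
 * never share a bucket.
 */
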
130  static unsigned int unix_unbound_hash(struct sock *sk)
131  {
132  	unsigned long hash = (unsigned long)sk;
133  
134  	hash ^= hash >> 16;
135  	hash ^= hash >> 8;
136  	hash ^= sk->sk_type;
137  
138  	return hash & UNIX_HASH_MOD;
139  }
140  
141  static unsigned int unix_bsd_hash(struct inode *i)
142  {
143  	return i->i_ino & UNIX_HASH_MOD;
144  }
145  
146  static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147  				       int addr_len, int type)
148  {
149  	__wsum csum = csum_partial(sunaddr, addr_len, 0);
150  	unsigned int hash;
151  
152  	hash = (__force unsigned int)csum_fold(csum);
153  	hash ^= hash >> 8;
154  	hash ^= type;
155  
156  	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
157  }
158  
159  static void unix_table_double_lock(struct net *net,
160  				   unsigned int hash1, unsigned int hash2)
161  {
162  	if (hash1 == hash2) {
163  		spin_lock(&net->unx.table.locks[hash1]);
164  		return;
165  	}
166  
167  	if (hash1 > hash2)
168  		swap(hash1, hash2);
169  
170  	spin_lock(&net->unx.table.locks[hash1]);
171  	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
172  }
173  
174  static void unix_table_double_unlock(struct net *net,
175  				     unsigned int hash1, unsigned int hash2)
176  {
177  	if (hash1 == hash2) {
178  		spin_unlock(&net->unx.table.locks[hash1]);
179  		return;
180  	}
181  
182  	spin_unlock(&net->unx.table.locks[hash1]);
183  	spin_unlock(&net->unx.table.locks[hash2]);
184  }
185  
186  #ifdef CONFIG_SECURITY_NETWORK
187  static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
188  {
189  	UNIXCB(skb).secid = scm->secid;
190  }
191  
192  static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
193  {
194  	scm->secid = UNIXCB(skb).secid;
195  }
196  
197  static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
198  {
199  	return (scm->secid == UNIXCB(skb).secid);
200  }
201  #else
202  static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
203  { }
204  
205  static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
206  { }
207  
208  static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
209  {
210  	return true;
211  }
212  #endif /* CONFIG_SECURITY_NETWORK */
213  
214  #define unix_peer(sk) (unix_sk(sk)->peer)
215  
216  static inline int unix_our_peer(struct sock *sk, struct sock *osk)
217  {
218  	return unix_peer(osk) == sk;
219  }
220  
221  static inline int unix_may_send(struct sock *sk, struct sock *osk)
222  {
223  	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
224  }
225  
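/* Backpressure test: a peer's receive queue counts as full once its length
 * exceeds sk_max_ack_backlog (set by listen(), or for dgram sockets from
 * sysctl_max_dgram_qlen at creation time).
 */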
226  static inline int unix_recvq_full(const struct sock *sk)
227  {
228  	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
229  }
230  
231  static inline int unix_recvq_full_lockless(const struct sock *sk)
232  {
233  	return skb_queue_len_lockless(&sk->sk_receive_queue) >
234  		READ_ONCE(sk->sk_max_ack_backlog);
235  }
236  
237  struct sock *unix_peer_get(struct sock *s)
238  {
239  	struct sock *peer;
240  
241  	unix_state_lock(s);
242  	peer = unix_peer(s);
243  	if (peer)
244  		sock_hold(peer);
245  	unix_state_unlock(s);
246  	return peer;
247  }
248  EXPORT_SYMBOL_GPL(unix_peer_get);
249  
250  static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
251  					     int addr_len)
252  {
253  	struct unix_address *addr;
254  
255  	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
256  	if (!addr)
257  		return NULL;
258  
259  	refcount_set(&addr->refcnt, 1);
260  	addr->len = addr_len;
261  	memcpy(addr->name, sunaddr, addr_len);
262  
263  	return addr;
264  }
265  
266  static inline void unix_release_addr(struct unix_address *addr)
267  {
268  	if (refcount_dec_and_test(&addr->refcnt))
269  		kfree(addr);
270  }
271  
272  /*
273   *	Check unix socket name:
274   *		- it should not be zero length.
275   *		- if it does not start with a zero byte, it is a NUL-terminated FS object name.
276   *		- if it starts with a zero byte, it is an abstract name.
277   */
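
/* For example (illustrative values): a bind to the path "/tmp/x" is
 * typically passed addr_len = offsetof(struct sockaddr_un, sun_path) +
 * strlen("/tmp/x") + 1, while the abstract name "\0x" is passed
 * addr_len = offsetof(struct sockaddr_un, sun_path) + 2, and every byte
 * up to addr_len is significant.
 */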
278  
279  static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
280  {
281  	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
282  	    addr_len > sizeof(*sunaddr))
283  		return -EINVAL;
284  
285  	if (sunaddr->sun_family != AF_UNIX)
286  		return -EINVAL;
287  
288  	return 0;
289  }
290  
291  static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
292  {
293  	/* This may look like an off by one error but it is a bit more
294  	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
295  	 * sun_path[108] doesn't as such exist.  However in kernel space
296  	 * we are guaranteed that it is a valid memory location in our
297  	 * kernel address buffer because syscall functions always pass
298  	 * a pointer of struct sockaddr_storage which has a bigger buffer
299  	 * than 108.
300  	 */
301  	((char *)sunaddr)[addr_len] = 0;
302  }
303  
304  static void __unix_remove_socket(struct sock *sk)
305  {
306  	sk_del_node_init(sk);
307  }
308  
309  static void __unix_insert_socket(struct net *net, struct sock *sk)
310  {
311  	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
312  	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
313  }
314  
315  static void __unix_set_addr_hash(struct net *net, struct sock *sk,
316  				 struct unix_address *addr, unsigned int hash)
317  {
318  	__unix_remove_socket(sk);
319  	smp_store_release(&unix_sk(sk)->addr, addr);
320  
321  	sk->sk_hash = hash;
322  	__unix_insert_socket(net, sk);
323  }
324  
325  static void unix_remove_socket(struct net *net, struct sock *sk)
326  {
327  	spin_lock(&net->unx.table.locks[sk->sk_hash]);
328  	__unix_remove_socket(sk);
329  	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
330  }
331  
332  static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
333  {
334  	spin_lock(&net->unx.table.locks[sk->sk_hash]);
335  	__unix_insert_socket(net, sk);
336  	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
337  }
338  
339  static void unix_insert_bsd_socket(struct sock *sk)
340  {
341  	spin_lock(&bsd_socket_locks[sk->sk_hash]);
342  	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
343  	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
344  }
345  
346  static void unix_remove_bsd_socket(struct sock *sk)
347  {
348  	if (!hlist_unhashed(&sk->sk_bind_node)) {
349  		spin_lock(&bsd_socket_locks[sk->sk_hash]);
350  		__sk_del_bind_node(sk);
351  		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
352  
353  		sk_node_init(&sk->sk_bind_node);
354  	}
355  }
356  
357  static struct sock *__unix_find_socket_byname(struct net *net,
358  					      struct sockaddr_un *sunname,
359  					      int len, unsigned int hash)
360  {
361  	struct sock *s;
362  
363  	sk_for_each(s, &net->unx.table.buckets[hash]) {
364  		struct unix_sock *u = unix_sk(s);
365  
366  		if (u->addr->len == len &&
367  		    !memcmp(u->addr->name, sunname, len))
368  			return s;
369  	}
370  	return NULL;
371  }
372  
373  static inline struct sock *unix_find_socket_byname(struct net *net,
374  						   struct sockaddr_un *sunname,
375  						   int len, unsigned int hash)
376  {
377  	struct sock *s;
378  
379  	spin_lock(&net->unx.table.locks[hash]);
380  	s = __unix_find_socket_byname(net, sunname, len, hash);
381  	if (s)
382  		sock_hold(s);
383  	spin_unlock(&net->unx.table.locks[hash]);
384  	return s;
385  }
386  
387  static struct sock *unix_find_socket_byinode(struct inode *i)
388  {
389  	unsigned int hash = unix_bsd_hash(i);
390  	struct sock *s;
391  
392  	spin_lock(&bsd_socket_locks[hash]);
393  	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
394  		struct dentry *dentry = unix_sk(s)->path.dentry;
395  
396  		if (dentry && d_backing_inode(dentry) == i) {
397  			sock_hold(s);
398  			spin_unlock(&bsd_socket_locks[hash]);
399  			return s;
400  		}
401  	}
402  	spin_unlock(&bsd_socket_locks[hash]);
403  	return NULL;
404  }
405  
406  /* Support code for asymmetrically connected dgram sockets
407   *
408   * If a datagram socket is connected to a socket not itself connected
409   * to the first socket (eg, /dev/log), clients may only enqueue more
410   * messages if the present receive queue of the server socket is not
411   * "too large". This means there's a second writeability condition
412   * poll and sendmsg need to test. The dgram recv code will do a wake
413   * up on the peer_wait wait queue of a socket upon reception of a
414   * datagram which needs to be propagated to sleeping would-be writers
415   * since these might not have sent anything so far. This can't be
416   * accomplished via poll_wait because the lifetime of the server
417   * socket might be less than that of its clients if these break their
418   * association with it or if the server socket is closed while clients
419   * are still connected to it and there's no way to inform "a polling
420   * implementation" that it should let go of a certain wait queue
421   *
422   * In order to propagate a wake up, a wait_queue_entry_t of the client
423   * socket is enqueued on the peer_wait queue of the server socket
424   * whose wake function does a wake_up on the ordinary client socket
425   * wait queue. This connection is established whenever a write (or
426   * poll for write) hits the flow control condition and is broken when the
427   * association to the server socket is dissolved or after a wake up
428   * was relayed.
429   */
430  
431  static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
432  				      void *key)
433  {
434  	struct unix_sock *u;
435  	wait_queue_head_t *u_sleep;
436  
437  	u = container_of(q, struct unix_sock, peer_wake);
438  
439  	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
440  			    q);
441  	u->peer_wake.private = NULL;
442  
443  	/* relaying can only happen while the wq still exists */
444  	u_sleep = sk_sleep(&u->sk);
445  	if (u_sleep)
446  		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
447  
448  	return 0;
449  }
450  
451  static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
452  {
453  	struct unix_sock *u, *u_other;
454  	int rc;
455  
456  	u = unix_sk(sk);
457  	u_other = unix_sk(other);
458  	rc = 0;
459  	spin_lock(&u_other->peer_wait.lock);
460  
461  	if (!u->peer_wake.private) {
462  		u->peer_wake.private = other;
463  		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
464  
465  		rc = 1;
466  	}
467  
468  	spin_unlock(&u_other->peer_wait.lock);
469  	return rc;
470  }
471  
472  static void unix_dgram_peer_wake_disconnect(struct sock *sk,
473  					    struct sock *other)
474  {
475  	struct unix_sock *u, *u_other;
476  
477  	u = unix_sk(sk);
478  	u_other = unix_sk(other);
479  	spin_lock(&u_other->peer_wait.lock);
480  
481  	if (u->peer_wake.private == other) {
482  		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
483  		u->peer_wake.private = NULL;
484  	}
485  
486  	spin_unlock(&u_other->peer_wait.lock);
487  }
488  
489  static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
490  						   struct sock *other)
491  {
492  	unix_dgram_peer_wake_disconnect(sk, other);
493  	wake_up_interruptible_poll(sk_sleep(sk),
494  				   EPOLLOUT |
495  				   EPOLLWRNORM |
496  				   EPOLLWRBAND);
497  }
498  
499  /* preconditions:
500   *	- unix_peer(sk) == other
501   *	- association is stable
502   */
503  static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
504  {
505  	int connected;
506  
507  	connected = unix_dgram_peer_wake_connect(sk, other);
508  
509  	/* If other is SOCK_DEAD, we want to make sure we signal
510  	 * POLLOUT, such that a subsequent write() can get a
511  	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
512  	 * to other and it's full, we will hang waiting for POLLOUT.
513  	 */
514  	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
515  		return 1;
516  
517  	if (connected)
518  		unix_dgram_peer_wake_disconnect(sk, other);
519  
520  	return 0;
521  }
522  
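/* A socket counts as writable while it is not listening and at most a
 * quarter of its send buffer is consumed by in-flight skbs (roughly
 * wmem_alloc * 4 <= sndbuf).
 */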
523  static int unix_writable(const struct sock *sk)
524  {
525  	return sk->sk_state != TCP_LISTEN &&
526  	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
527  }
528  
529  static void unix_write_space(struct sock *sk)
530  {
531  	struct socket_wq *wq;
532  
533  	rcu_read_lock();
534  	if (unix_writable(sk)) {
535  		wq = rcu_dereference(sk->sk_wq);
536  		if (skwq_has_sleeper(wq))
537  			wake_up_interruptible_sync_poll(&wq->wait,
538  				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
539  		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
540  	}
541  	rcu_read_unlock();
542  }
543  
544  /* When dgram socket disconnects (or changes its peer), we clear its receive
545   * queue of packets that arrived from the previous peer. First, this allows
546   * flow control based only on wmem_alloc; second, an sk connected to a peer
547   * may receive messages only from that peer. */
548  static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
549  {
550  	if (!skb_queue_empty(&sk->sk_receive_queue)) {
551  		skb_queue_purge(&sk->sk_receive_queue);
552  		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
553  
554  		/* If one link of a bidirectional dgram pipe is disconnected,
555  		 * we signal an error. Messages are lost. Do not do this
556  		 * when the peer was not connected to us.
557  		 */
558  		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
559  			other->sk_err = ECONNRESET;
560  			sk_error_report(other);
561  		}
562  	}
563  	other->sk_state = TCP_CLOSE;
564  }
565  
566  static void unix_sock_destructor(struct sock *sk)
567  {
568  	struct unix_sock *u = unix_sk(sk);
569  
570  	skb_queue_purge(&sk->sk_receive_queue);
571  
572  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
573  	if (u->oob_skb) {
574  		kfree_skb(u->oob_skb);
575  		u->oob_skb = NULL;
576  	}
577  #endif
578  	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
579  	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
580  	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
581  	if (!sock_flag(sk, SOCK_DEAD)) {
582  		pr_info("Attempt to release alive unix socket: %p\n", sk);
583  		return;
584  	}
585  
586  	if (u->addr)
587  		unix_release_addr(u->addr);
588  
589  	atomic_long_dec(&unix_nr_socks);
590  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
591  #ifdef UNIX_REFCNT_DEBUG
592  	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
593  		atomic_long_read(&unix_nr_socks));
594  #endif
595  }
596  
597  static void unix_release_sock(struct sock *sk, int embrion)
598  {
599  	struct unix_sock *u = unix_sk(sk);
600  	struct sock *skpair;
601  	struct sk_buff *skb;
602  	struct path path;
603  	int state;
604  
605  	unix_remove_socket(sock_net(sk), sk);
606  	unix_remove_bsd_socket(sk);
607  
608  	/* Clear state */
609  	unix_state_lock(sk);
610  	sock_orphan(sk);
611  	sk->sk_shutdown = SHUTDOWN_MASK;
612  	path	     = u->path;
613  	u->path.dentry = NULL;
614  	u->path.mnt = NULL;
615  	state = sk->sk_state;
616  	sk->sk_state = TCP_CLOSE;
617  
618  	skpair = unix_peer(sk);
619  	unix_peer(sk) = NULL;
620  
621  	unix_state_unlock(sk);
622  
623  	wake_up_interruptible_all(&u->peer_wait);
624  
625  	if (skpair != NULL) {
626  		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
627  			unix_state_lock(skpair);
628  			/* No more writes */
629  			skpair->sk_shutdown = SHUTDOWN_MASK;
630  			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
631  				skpair->sk_err = ECONNRESET;
632  			unix_state_unlock(skpair);
633  			skpair->sk_state_change(skpair);
634  			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
635  		}
636  
637  		unix_dgram_peer_wake_disconnect(sk, skpair);
638  		sock_put(skpair); /* It may now die */
639  	}
640  
641  	/* Try to flush out this socket. Throw out buffers at least */
642  
643  	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
644  		if (state == TCP_LISTEN)
645  			unix_release_sock(skb->sk, 1);
646  		/* passed fds are erased in the kfree_skb hook	      */
647  		UNIXCB(skb).consumed = skb->len;
648  		kfree_skb(skb);
649  	}
650  
651  	if (path.dentry)
652  		path_put(&path);
653  
654  	sock_put(sk);
655  
656  	/* ---- Socket is dead now and most probably destroyed ---- */
657  
658  	/*
659  	 * Fixme: BSD difference: In BSD all sockets connected to us get
660  	 *	  ECONNRESET and we die on the spot. In Linux we behave
661  	 *	  like files and pipes do and wait for the last
662  	 *	  dereference.
663  	 *
664  	 * Can't we simply set sock->err?
665  	 *
666  	 *	  What does the above comment talk about? --ANK(980817)
667  	 */
668  
669  	if (unix_tot_inflight)
670  		unix_gc();		/* Garbage collect fds */
671  }
672  
673  static void init_peercred(struct sock *sk)
674  {
675  	const struct cred *old_cred;
676  	struct pid *old_pid;
677  
678  	spin_lock(&sk->sk_peer_lock);
679  	old_pid = sk->sk_peer_pid;
680  	old_cred = sk->sk_peer_cred;
681  	sk->sk_peer_pid  = get_pid(task_tgid(current));
682  	sk->sk_peer_cred = get_current_cred();
683  	spin_unlock(&sk->sk_peer_lock);
684  
685  	put_pid(old_pid);
686  	put_cred(old_cred);
687  }
688  
689  static void copy_peercred(struct sock *sk, struct sock *peersk)
690  {
691  	const struct cred *old_cred;
692  	struct pid *old_pid;
693  
694  	if (sk < peersk) {
695  		spin_lock(&sk->sk_peer_lock);
696  		spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
697  	} else {
698  		spin_lock(&peersk->sk_peer_lock);
699  		spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
700  	}
701  	old_pid = sk->sk_peer_pid;
702  	old_cred = sk->sk_peer_cred;
703  	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
704  	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
705  
706  	spin_unlock(&sk->sk_peer_lock);
707  	spin_unlock(&peersk->sk_peer_lock);
708  
709  	put_pid(old_pid);
710  	put_cred(old_cred);
711  }
712  
713  static int unix_listen(struct socket *sock, int backlog)
714  {
715  	int err;
716  	struct sock *sk = sock->sk;
717  	struct unix_sock *u = unix_sk(sk);
718  
719  	err = -EOPNOTSUPP;
720  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
721  		goto out;	/* Only stream/seqpacket sockets accept */
722  	err = -EINVAL;
723  	if (!u->addr)
724  		goto out;	/* No listens on an unbound socket */
725  	unix_state_lock(sk);
726  	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
727  		goto out_unlock;
728  	if (backlog > sk->sk_max_ack_backlog)
729  		wake_up_interruptible_all(&u->peer_wait);
730  	sk->sk_max_ack_backlog	= backlog;
731  	sk->sk_state		= TCP_LISTEN;
732  	/* set credentials so connect can copy them */
733  	init_peercred(sk);
734  	err = 0;
735  
736  out_unlock:
737  	unix_state_unlock(sk);
738  out:
739  	return err;
740  }
741  
742  static int unix_release(struct socket *);
743  static int unix_bind(struct socket *, struct sockaddr *, int);
744  static int unix_stream_connect(struct socket *, struct sockaddr *,
745  			       int addr_len, int flags);
746  static int unix_socketpair(struct socket *, struct socket *);
747  static int unix_accept(struct socket *, struct socket *, int, bool);
748  static int unix_getname(struct socket *, struct sockaddr *, int);
749  static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
750  static __poll_t unix_dgram_poll(struct file *, struct socket *,
751  				    poll_table *);
752  static int unix_ioctl(struct socket *, unsigned int, unsigned long);
753  #ifdef CONFIG_COMPAT
754  static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
755  #endif
756  static int unix_shutdown(struct socket *, int);
757  static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
758  static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
759  static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
760  				    size_t size, int flags);
761  static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
762  				       struct pipe_inode_info *, size_t size,
763  				       unsigned int flags);
764  static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
765  static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
766  static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
767  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
768  static int unix_dgram_connect(struct socket *, struct sockaddr *,
769  			      int, int);
770  static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
771  static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
772  				  int);
773  
774  static int unix_set_peek_off(struct sock *sk, int val)
775  {
776  	struct unix_sock *u = unix_sk(sk);
777  
778  	if (mutex_lock_interruptible(&u->iolock))
779  		return -EINTR;
780  
781  	sk->sk_peek_off = val;
782  	mutex_unlock(&u->iolock);
783  
784  	return 0;
785  }
786  
787  #ifdef CONFIG_PROC_FS
788  static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
789  {
790  	struct sock *sk = sock->sk;
791  	struct unix_sock *u;
792  
793  	if (sk) {
794  		u = unix_sk(sock->sk);
795  		seq_printf(m, "scm_fds: %u\n",
796  			   atomic_read(&u->scm_stat.nr_fds));
797  	}
798  }
799  #else
800  #define unix_show_fdinfo NULL
801  #endif
802  
803  static const struct proto_ops unix_stream_ops = {
804  	.family =	PF_UNIX,
805  	.owner =	THIS_MODULE,
806  	.release =	unix_release,
807  	.bind =		unix_bind,
808  	.connect =	unix_stream_connect,
809  	.socketpair =	unix_socketpair,
810  	.accept =	unix_accept,
811  	.getname =	unix_getname,
812  	.poll =		unix_poll,
813  	.ioctl =	unix_ioctl,
814  #ifdef CONFIG_COMPAT
815  	.compat_ioctl =	unix_compat_ioctl,
816  #endif
817  	.listen =	unix_listen,
818  	.shutdown =	unix_shutdown,
819  	.sendmsg =	unix_stream_sendmsg,
820  	.recvmsg =	unix_stream_recvmsg,
821  	.read_skb =	unix_stream_read_skb,
822  	.mmap =		sock_no_mmap,
823  	.sendpage =	unix_stream_sendpage,
824  	.splice_read =	unix_stream_splice_read,
825  	.set_peek_off =	unix_set_peek_off,
826  	.show_fdinfo =	unix_show_fdinfo,
827  };
828  
829  static const struct proto_ops unix_dgram_ops = {
830  	.family =	PF_UNIX,
831  	.owner =	THIS_MODULE,
832  	.release =	unix_release,
833  	.bind =		unix_bind,
834  	.connect =	unix_dgram_connect,
835  	.socketpair =	unix_socketpair,
836  	.accept =	sock_no_accept,
837  	.getname =	unix_getname,
838  	.poll =		unix_dgram_poll,
839  	.ioctl =	unix_ioctl,
840  #ifdef CONFIG_COMPAT
841  	.compat_ioctl =	unix_compat_ioctl,
842  #endif
843  	.listen =	sock_no_listen,
844  	.shutdown =	unix_shutdown,
845  	.sendmsg =	unix_dgram_sendmsg,
846  	.read_skb =	unix_read_skb,
847  	.recvmsg =	unix_dgram_recvmsg,
848  	.mmap =		sock_no_mmap,
849  	.sendpage =	sock_no_sendpage,
850  	.set_peek_off =	unix_set_peek_off,
851  	.show_fdinfo =	unix_show_fdinfo,
852  };
853  
854  static const struct proto_ops unix_seqpacket_ops = {
855  	.family =	PF_UNIX,
856  	.owner =	THIS_MODULE,
857  	.release =	unix_release,
858  	.bind =		unix_bind,
859  	.connect =	unix_stream_connect,
860  	.socketpair =	unix_socketpair,
861  	.accept =	unix_accept,
862  	.getname =	unix_getname,
863  	.poll =		unix_dgram_poll,
864  	.ioctl =	unix_ioctl,
865  #ifdef CONFIG_COMPAT
866  	.compat_ioctl =	unix_compat_ioctl,
867  #endif
868  	.listen =	unix_listen,
869  	.shutdown =	unix_shutdown,
870  	.sendmsg =	unix_seqpacket_sendmsg,
871  	.recvmsg =	unix_seqpacket_recvmsg,
872  	.mmap =		sock_no_mmap,
873  	.sendpage =	sock_no_sendpage,
874  	.set_peek_off =	unix_set_peek_off,
875  	.show_fdinfo =	unix_show_fdinfo,
876  };
877  
878  static void unix_close(struct sock *sk, long timeout)
879  {
880  	/* Nothing to do here, unix socket does not need a ->close().
881  	 * This is merely for sockmap.
882  	 */
883  }
884  
885  static void unix_unhash(struct sock *sk)
886  {
887  	/* Nothing to do here, unix socket does not need a ->unhash().
888  	 * This is merely for sockmap.
889  	 */
890  }
891  
892  struct proto unix_dgram_proto = {
893  	.name			= "UNIX",
894  	.owner			= THIS_MODULE,
895  	.obj_size		= sizeof(struct unix_sock),
896  	.close			= unix_close,
897  #ifdef CONFIG_BPF_SYSCALL
898  	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
899  #endif
900  };
901  
902  struct proto unix_stream_proto = {
903  	.name			= "UNIX-STREAM",
904  	.owner			= THIS_MODULE,
905  	.obj_size		= sizeof(struct unix_sock),
906  	.close			= unix_close,
907  	.unhash			= unix_unhash,
908  #ifdef CONFIG_BPF_SYSCALL
909  	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
910  #endif
911  };
912  
913  static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
914  {
915  	struct unix_sock *u;
916  	struct sock *sk;
917  	int err;
918  
919  	atomic_long_inc(&unix_nr_socks);
920  	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
921  		err = -ENFILE;
922  		goto err;
923  	}
924  
925  	if (type == SOCK_STREAM)
926  		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
927  	else /* dgram and seqpacket */
928  		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
929  
930  	if (!sk) {
931  		err = -ENOMEM;
932  		goto err;
933  	}
934  
935  	sock_init_data(sock, sk);
936  
937  	sk->sk_hash		= unix_unbound_hash(sk);
938  	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
939  	sk->sk_write_space	= unix_write_space;
940  	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
941  	sk->sk_destruct		= unix_sock_destructor;
942  	u	  = unix_sk(sk);
943  	u->path.dentry = NULL;
944  	u->path.mnt = NULL;
945  	spin_lock_init(&u->lock);
946  	atomic_long_set(&u->inflight, 0);
947  	INIT_LIST_HEAD(&u->link);
948  	mutex_init(&u->iolock); /* single task reading lock */
949  	mutex_init(&u->bindlock); /* single task binding lock */
950  	init_waitqueue_head(&u->peer_wait);
951  	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
952  	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
953  	unix_insert_unbound_socket(net, sk);
954  
955  	sock_prot_inuse_add(net, sk->sk_prot, 1);
956  
957  	return sk;
958  
959  err:
960  	atomic_long_dec(&unix_nr_socks);
961  	return ERR_PTR(err);
962  }
963  
964  static int unix_create(struct net *net, struct socket *sock, int protocol,
965  		       int kern)
966  {
967  	struct sock *sk;
968  
969  	if (protocol && protocol != PF_UNIX)
970  		return -EPROTONOSUPPORT;
971  
972  	sock->state = SS_UNCONNECTED;
973  
974  	switch (sock->type) {
975  	case SOCK_STREAM:
976  		sock->ops = &unix_stream_ops;
977  		break;
978  		/*
979  		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
980  		 *	nothing uses it.
981  		 */
982  	case SOCK_RAW:
983  		sock->type = SOCK_DGRAM;
984  		fallthrough;
985  	case SOCK_DGRAM:
986  		sock->ops = &unix_dgram_ops;
987  		break;
988  	case SOCK_SEQPACKET:
989  		sock->ops = &unix_seqpacket_ops;
990  		break;
991  	default:
992  		return -ESOCKTNOSUPPORT;
993  	}
994  
995  	sk = unix_create1(net, sock, kern, sock->type);
996  	if (IS_ERR(sk))
997  		return PTR_ERR(sk);
998  
999  	return 0;
1000  }
1001  
1002  static int unix_release(struct socket *sock)
1003  {
1004  	struct sock *sk = sock->sk;
1005  
1006  	if (!sk)
1007  		return 0;
1008  
1009  	sk->sk_prot->close(sk, 0);
1010  	unix_release_sock(sk, 0);
1011  	sock->sk = NULL;
1012  
1013  	return 0;
1014  }
1015  
1016  static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1017  				  int type)
1018  {
1019  	struct inode *inode;
1020  	struct path path;
1021  	struct sock *sk;
1022  	int err;
1023  
1024  	unix_mkname_bsd(sunaddr, addr_len);
1025  	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1026  	if (err)
1027  		goto fail;
1028  
1029  	err = path_permission(&path, MAY_WRITE);
1030  	if (err)
1031  		goto path_put;
1032  
1033  	err = -ECONNREFUSED;
1034  	inode = d_backing_inode(path.dentry);
1035  	if (!S_ISSOCK(inode->i_mode))
1036  		goto path_put;
1037  
1038  	sk = unix_find_socket_byinode(inode);
1039  	if (!sk)
1040  		goto path_put;
1041  
1042  	err = -EPROTOTYPE;
1043  	if (sk->sk_type == type)
1044  		touch_atime(&path);
1045  	else
1046  		goto sock_put;
1047  
1048  	path_put(&path);
1049  
1050  	return sk;
1051  
1052  sock_put:
1053  	sock_put(sk);
1054  path_put:
1055  	path_put(&path);
1056  fail:
1057  	return ERR_PTR(err);
1058  }
1059  
1060  static struct sock *unix_find_abstract(struct net *net,
1061  				       struct sockaddr_un *sunaddr,
1062  				       int addr_len, int type)
1063  {
1064  	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1065  	struct dentry *dentry;
1066  	struct sock *sk;
1067  
1068  	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1069  	if (!sk)
1070  		return ERR_PTR(-ECONNREFUSED);
1071  
1072  	dentry = unix_sk(sk)->path.dentry;
1073  	if (dentry)
1074  		touch_atime(&unix_sk(sk)->path);
1075  
1076  	return sk;
1077  }
1078  
1079  static struct sock *unix_find_other(struct net *net,
1080  				    struct sockaddr_un *sunaddr,
1081  				    int addr_len, int type)
1082  {
1083  	struct sock *sk;
1084  
1085  	if (sunaddr->sun_path[0])
1086  		sk = unix_find_bsd(sunaddr, addr_len, type);
1087  	else
1088  		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1089  
1090  	return sk;
1091  }
1092  
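/* Autobind picks an abstract address of the form "\0" followed by five
 * lowercase hex digits (e.g. "0000a" after the leading NUL), retrying
 * until a free name is found or the 2^20-entry name space is exhausted.
 */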
1093  static int unix_autobind(struct sock *sk)
1094  {
1095  	unsigned int new_hash, old_hash = sk->sk_hash;
1096  	struct unix_sock *u = unix_sk(sk);
1097  	struct net *net = sock_net(sk);
1098  	struct unix_address *addr;
1099  	u32 lastnum, ordernum;
1100  	int err;
1101  
1102  	err = mutex_lock_interruptible(&u->bindlock);
1103  	if (err)
1104  		return err;
1105  
1106  	if (u->addr)
1107  		goto out;
1108  
1109  	err = -ENOMEM;
1110  	addr = kzalloc(sizeof(*addr) +
1111  		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1112  	if (!addr)
1113  		goto out;
1114  
1115  	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1116  	addr->name->sun_family = AF_UNIX;
1117  	refcount_set(&addr->refcnt, 1);
1118  
1119  	ordernum = prandom_u32();
1120  	lastnum = ordernum & 0xFFFFF;
1121  retry:
1122  	ordernum = (ordernum + 1) & 0xFFFFF;
1123  	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1124  
1125  	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1126  	unix_table_double_lock(net, old_hash, new_hash);
1127  
1128  	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1129  		unix_table_double_unlock(net, old_hash, new_hash);
1130  
1131  		/* __unix_find_socket_byname() may take a long time if many names
1132  		 * are already in use.
1133  		 */
1134  		cond_resched();
1135  
1136  		if (ordernum == lastnum) {
1137  			/* Give up if all names seem to be in use. */
1138  			err = -ENOSPC;
1139  			unix_release_addr(addr);
1140  			goto out;
1141  		}
1142  
1143  		goto retry;
1144  	}
1145  
1146  	__unix_set_addr_hash(net, sk, addr, new_hash);
1147  	unix_table_double_unlock(net, old_hash, new_hash);
1148  	err = 0;
1149  
1150  out:	mutex_unlock(&u->bindlock);
1151  	return err;
1152  }
1153  
1154  static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1155  			 int addr_len)
1156  {
1157  	umode_t mode = S_IFSOCK |
1158  	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1159  	unsigned int new_hash, old_hash = sk->sk_hash;
1160  	struct unix_sock *u = unix_sk(sk);
1161  	struct net *net = sock_net(sk);
1162  	struct user_namespace *ns; // barf...
1163  	struct unix_address *addr;
1164  	struct dentry *dentry;
1165  	struct path parent;
1166  	int err;
1167  
1168  	unix_mkname_bsd(sunaddr, addr_len);
1169  	addr_len = strlen(sunaddr->sun_path) +
1170  		offsetof(struct sockaddr_un, sun_path) + 1;
1171  
1172  	addr = unix_create_addr(sunaddr, addr_len);
1173  	if (!addr)
1174  		return -ENOMEM;
1175  
1176  	/*
1177  	 * Get the parent directory, calculate the hash for last
1178  	 * component.
1179  	 */
1180  	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1181  	if (IS_ERR(dentry)) {
1182  		err = PTR_ERR(dentry);
1183  		goto out;
1184  	}
1185  
1186  	/*
1187  	 * All right, let's create it.
1188  	 */
1189  	ns = mnt_user_ns(parent.mnt);
1190  	err = security_path_mknod(&parent, dentry, mode, 0);
1191  	if (!err)
1192  		err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1193  	if (err)
1194  		goto out_path;
1195  	err = mutex_lock_interruptible(&u->bindlock);
1196  	if (err)
1197  		goto out_unlink;
1198  	if (u->addr)
1199  		goto out_unlock;
1200  
1201  	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1202  	unix_table_double_lock(net, old_hash, new_hash);
1203  	u->path.mnt = mntget(parent.mnt);
1204  	u->path.dentry = dget(dentry);
1205  	__unix_set_addr_hash(net, sk, addr, new_hash);
1206  	unix_table_double_unlock(net, old_hash, new_hash);
1207  	unix_insert_bsd_socket(sk);
1208  	mutex_unlock(&u->bindlock);
1209  	done_path_create(&parent, dentry);
1210  	return 0;
1211  
1212  out_unlock:
1213  	mutex_unlock(&u->bindlock);
1214  	err = -EINVAL;
1215  out_unlink:
1216  	/* failed after successful mknod?  unlink what we'd created... */
1217  	vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1218  out_path:
1219  	done_path_create(&parent, dentry);
1220  out:
1221  	unix_release_addr(addr);
1222  	return err == -EEXIST ? -EADDRINUSE : err;
1223  }
1224  
1225  static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1226  			      int addr_len)
1227  {
1228  	unsigned int new_hash, old_hash = sk->sk_hash;
1229  	struct unix_sock *u = unix_sk(sk);
1230  	struct net *net = sock_net(sk);
1231  	struct unix_address *addr;
1232  	int err;
1233  
1234  	addr = unix_create_addr(sunaddr, addr_len);
1235  	if (!addr)
1236  		return -ENOMEM;
1237  
1238  	err = mutex_lock_interruptible(&u->bindlock);
1239  	if (err)
1240  		goto out;
1241  
1242  	if (u->addr) {
1243  		err = -EINVAL;
1244  		goto out_mutex;
1245  	}
1246  
1247  	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1248  	unix_table_double_lock(net, old_hash, new_hash);
1249  
1250  	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1251  		goto out_spin;
1252  
1253  	__unix_set_addr_hash(net, sk, addr, new_hash);
1254  	unix_table_double_unlock(net, old_hash, new_hash);
1255  	mutex_unlock(&u->bindlock);
1256  	return 0;
1257  
1258  out_spin:
1259  	unix_table_double_unlock(net, old_hash, new_hash);
1260  	err = -EADDRINUSE;
1261  out_mutex:
1262  	mutex_unlock(&u->bindlock);
1263  out:
1264  	unix_release_addr(addr);
1265  	return err;
1266  }
1267  
1268  static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1269  {
1270  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1271  	struct sock *sk = sock->sk;
1272  	int err;
1273  
1274  	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1275  	    sunaddr->sun_family == AF_UNIX)
1276  		return unix_autobind(sk);
1277  
1278  	err = unix_validate_addr(sunaddr, addr_len);
1279  	if (err)
1280  		return err;
1281  
1282  	if (sunaddr->sun_path[0])
1283  		err = unix_bind_bsd(sk, sunaddr, addr_len);
1284  	else
1285  		err = unix_bind_abstract(sk, sunaddr, addr_len);
1286  
1287  	return err;
1288  }
1289  
1290  static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1291  {
1292  	if (unlikely(sk1 == sk2) || !sk2) {
1293  		unix_state_lock(sk1);
1294  		return;
1295  	}
1296  	if (sk1 < sk2) {
1297  		unix_state_lock(sk1);
1298  		unix_state_lock_nested(sk2);
1299  	} else {
1300  		unix_state_lock(sk2);
1301  		unix_state_lock_nested(sk1);
1302  	}
1303  }
1304  
1305  static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1306  {
1307  	if (unlikely(sk1 == sk2) || !sk2) {
1308  		unix_state_unlock(sk1);
1309  		return;
1310  	}
1311  	unix_state_unlock(sk1);
1312  	unix_state_unlock(sk2);
1313  }
1314  
1315  static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1316  			      int alen, int flags)
1317  {
1318  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1319  	struct sock *sk = sock->sk;
1320  	struct sock *other;
1321  	int err;
1322  
1323  	err = -EINVAL;
1324  	if (alen < offsetofend(struct sockaddr, sa_family))
1325  		goto out;
1326  
1327  	if (addr->sa_family != AF_UNSPEC) {
1328  		err = unix_validate_addr(sunaddr, alen);
1329  		if (err)
1330  			goto out;
1331  
1332  		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1333  		    !unix_sk(sk)->addr) {
1334  			err = unix_autobind(sk);
1335  			if (err)
1336  				goto out;
1337  		}
1338  
1339  restart:
1340  		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1341  		if (IS_ERR(other)) {
1342  			err = PTR_ERR(other);
1343  			goto out;
1344  		}
1345  
1346  		unix_state_double_lock(sk, other);
1347  
1348  		/* Apparently VFS overslept socket death. Retry. */
1349  		if (sock_flag(other, SOCK_DEAD)) {
1350  			unix_state_double_unlock(sk, other);
1351  			sock_put(other);
1352  			goto restart;
1353  		}
1354  
1355  		err = -EPERM;
1356  		if (!unix_may_send(sk, other))
1357  			goto out_unlock;
1358  
1359  		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1360  		if (err)
1361  			goto out_unlock;
1362  
1363  		sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1364  	} else {
1365  		/*
1366  		 *	1003.1g breaking connected state with AF_UNSPEC
1367  		 */
1368  		other = NULL;
1369  		unix_state_double_lock(sk, other);
1370  	}
1371  
1372  	/*
1373  	 * If it was connected, reconnect.
1374  	 */
1375  	if (unix_peer(sk)) {
1376  		struct sock *old_peer = unix_peer(sk);
1377  
1378  		unix_peer(sk) = other;
1379  		if (!other)
1380  			sk->sk_state = TCP_CLOSE;
1381  		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1382  
1383  		unix_state_double_unlock(sk, other);
1384  
1385  		if (other != old_peer)
1386  			unix_dgram_disconnected(sk, old_peer);
1387  		sock_put(old_peer);
1388  	} else {
1389  		unix_peer(sk) = other;
1390  		unix_state_double_unlock(sk, other);
1391  	}
1392  
1393  	return 0;
1394  
1395  out_unlock:
1396  	unix_state_double_unlock(sk, other);
1397  	sock_put(other);
1398  out:
1399  	return err;
1400  }
1401  
1402  static long unix_wait_for_peer(struct sock *other, long timeo)
1403  	__releases(&unix_sk(other)->lock)
1404  {
1405  	struct unix_sock *u = unix_sk(other);
1406  	int sched;
1407  	DEFINE_WAIT(wait);
1408  
1409  	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1410  
1411  	sched = !sock_flag(other, SOCK_DEAD) &&
1412  		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1413  		unix_recvq_full(other);
1414  
1415  	unix_state_unlock(other);
1416  
1417  	if (sched)
1418  		timeo = schedule_timeout(timeo);
1419  
1420  	finish_wait(&u->peer_wait, &wait);
1421  	return timeo;
1422  }
1423  
1424  static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1425  			       int addr_len, int flags)
1426  {
1427  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1428  	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1429  	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1430  	struct net *net = sock_net(sk);
1431  	struct sk_buff *skb = NULL;
1432  	long timeo;
1433  	int err;
1434  	int st;
1435  
1436  	err = unix_validate_addr(sunaddr, addr_len);
1437  	if (err)
1438  		goto out;
1439  
1440  	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1441  		err = unix_autobind(sk);
1442  		if (err)
1443  			goto out;
1444  	}
1445  
1446  	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1447  
1448  	/* First of all, allocate resources.
1449  	   If we did this after the state was locked,
1450  	   we would have to recheck everything again in any case.
1451  	 */
1452  
1453  	/* create new sock for complete connection */
1454  	newsk = unix_create1(net, NULL, 0, sock->type);
1455  	if (IS_ERR(newsk)) {
1456  		err = PTR_ERR(newsk);
1457  		newsk = NULL;
1458  		goto out;
1459  	}
1460  
1461  	err = -ENOMEM;
1462  
1463  	/* Allocate skb for sending to listening sock */
1464  	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1465  	if (skb == NULL)
1466  		goto out;
1467  
1468  restart:
1469  	/*  Find listening sock. */
1470  	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1471  	if (IS_ERR(other)) {
1472  		err = PTR_ERR(other);
1473  		other = NULL;
1474  		goto out;
1475  	}
1476  
1477  	/* Latch state of peer */
1478  	unix_state_lock(other);
1479  
1480  	/* Apparently VFS overslept socket death. Retry. */
1481  	if (sock_flag(other, SOCK_DEAD)) {
1482  		unix_state_unlock(other);
1483  		sock_put(other);
1484  		goto restart;
1485  	}
1486  
1487  	err = -ECONNREFUSED;
1488  	if (other->sk_state != TCP_LISTEN)
1489  		goto out_unlock;
1490  	if (other->sk_shutdown & RCV_SHUTDOWN)
1491  		goto out_unlock;
1492  
1493  	if (unix_recvq_full(other)) {
1494  		err = -EAGAIN;
1495  		if (!timeo)
1496  			goto out_unlock;
1497  
1498  		timeo = unix_wait_for_peer(other, timeo);
1499  
1500  		err = sock_intr_errno(timeo);
1501  		if (signal_pending(current))
1502  			goto out;
1503  		sock_put(other);
1504  		goto restart;
1505  	}
1506  
1507  	/* Latch our state.
1508  
1509  	   This is a tricky place. We need to grab our state lock and cannot
1510  	   drop the lock on the peer. It is dangerous because a deadlock is
1511  	   possible. The connect-to-self case and simultaneous
1512  	   attempts to connect are eliminated by checking the socket
1513  	   state: other is TCP_LISTEN, and if sk is TCP_LISTEN we
1514  	   check this before attempting to grab the lock.
1515  
1516  	   Well, and we have to recheck the state after the socket is locked.
1517  	 */
1518  	st = sk->sk_state;
1519  
1520  	switch (st) {
1521  	case TCP_CLOSE:
1522  		/* This is ok... continue with connect */
1523  		break;
1524  	case TCP_ESTABLISHED:
1525  		/* Socket is already connected */
1526  		err = -EISCONN;
1527  		goto out_unlock;
1528  	default:
1529  		err = -EINVAL;
1530  		goto out_unlock;
1531  	}
1532  
1533  	unix_state_lock_nested(sk);
1534  
1535  	if (sk->sk_state != st) {
1536  		unix_state_unlock(sk);
1537  		unix_state_unlock(other);
1538  		sock_put(other);
1539  		goto restart;
1540  	}
1541  
1542  	err = security_unix_stream_connect(sk, other, newsk);
1543  	if (err) {
1544  		unix_state_unlock(sk);
1545  		goto out_unlock;
1546  	}
1547  
1548  	/* The way is open! Quickly set all the necessary fields... */
1549  
1550  	sock_hold(sk);
1551  	unix_peer(newsk)	= sk;
1552  	newsk->sk_state		= TCP_ESTABLISHED;
1553  	newsk->sk_type		= sk->sk_type;
1554  	init_peercred(newsk);
1555  	newu = unix_sk(newsk);
1556  	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1557  	otheru = unix_sk(other);
1558  
1559  	/* copy address information from listening to new sock
1560  	 *
1561  	 * The contents of *(otheru->addr) and otheru->path
1562  	 * are seen fully set up here, since we have found
1563  	 * otheru in hash under its lock.  Insertion into the
1564  	 * hash chain we'd found it in had been done in an
1565  	 * earlier critical area protected by the chain's lock,
1566  	 * the same one where we'd set *(otheru->addr) contents,
1567  	 * as well as otheru->path and otheru->addr itself.
1568  	 *
1569  	 * Using smp_store_release() here to set newu->addr
1570  	 * is enough to make those stores, as well as stores
1571  	 * to newu->path visible to anyone who gets newu->addr
1572  	 * by smp_load_acquire().  IOW, the same guarantees
1573  	 * as for unix_sock instances bound in unix_bind() or
1574  	 * in unix_autobind().
1575  	 */
1576  	if (otheru->path.dentry) {
1577  		path_get(&otheru->path);
1578  		newu->path = otheru->path;
1579  	}
1580  	refcount_inc(&otheru->addr->refcnt);
1581  	smp_store_release(&newu->addr, otheru->addr);
1582  
1583  	/* Set credentials */
1584  	copy_peercred(sk, other);
1585  
1586  	sock->state	= SS_CONNECTED;
1587  	sk->sk_state	= TCP_ESTABLISHED;
1588  	sock_hold(newsk);
1589  
1590  	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1591  	unix_peer(sk)	= newsk;
1592  
1593  	unix_state_unlock(sk);
1594  
1595  	/* take ten and send info to listening sock */
1596  	spin_lock(&other->sk_receive_queue.lock);
1597  	__skb_queue_tail(&other->sk_receive_queue, skb);
1598  	spin_unlock(&other->sk_receive_queue.lock);
1599  	unix_state_unlock(other);
1600  	other->sk_data_ready(other);
1601  	sock_put(other);
1602  	return 0;
1603  
1604  out_unlock:
1605  	if (other)
1606  		unix_state_unlock(other);
1607  
1608  out:
1609  	kfree_skb(skb);
1610  	if (newsk)
1611  		unix_release_sock(newsk, 0);
1612  	if (other)
1613  		sock_put(other);
1614  	return err;
1615  }
1616  
1617  static int unix_socketpair(struct socket *socka, struct socket *sockb)
1618  {
1619  	struct sock *ska = socka->sk, *skb = sockb->sk;
1620  
1621  	/* Join our sockets back to back */
1622  	sock_hold(ska);
1623  	sock_hold(skb);
1624  	unix_peer(ska) = skb;
1625  	unix_peer(skb) = ska;
1626  	init_peercred(ska);
1627  	init_peercred(skb);
1628  
1629  	ska->sk_state = TCP_ESTABLISHED;
1630  	skb->sk_state = TCP_ESTABLISHED;
1631  	socka->state  = SS_CONNECTED;
1632  	sockb->state  = SS_CONNECTED;
1633  	return 0;
1634  }
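
/* Illustrative userspace counterpart of the wiring above:
 *
 *	int sv[2];
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *
 * Both sv[0] and sv[1] come back already connected, with peer credentials
 * set, which is exactly what the back-to-back setup here provides.
 */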
1635  
1636  static void unix_sock_inherit_flags(const struct socket *old,
1637  				    struct socket *new)
1638  {
1639  	if (test_bit(SOCK_PASSCRED, &old->flags))
1640  		set_bit(SOCK_PASSCRED, &new->flags);
1641  	if (test_bit(SOCK_PASSSEC, &old->flags))
1642  		set_bit(SOCK_PASSSEC, &new->flags);
1643  }
1644  
1645  static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1646  		       bool kern)
1647  {
1648  	struct sock *sk = sock->sk;
1649  	struct sock *tsk;
1650  	struct sk_buff *skb;
1651  	int err;
1652  
1653  	err = -EOPNOTSUPP;
1654  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1655  		goto out;
1656  
1657  	err = -EINVAL;
1658  	if (sk->sk_state != TCP_LISTEN)
1659  		goto out;
1660  
1661  	/* If socket state is TCP_LISTEN it cannot change (for now...),
1662  	 * so that no locks are necessary.
1663  	 */
1664  
1665  	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1666  				&err);
1667  	if (!skb) {
1668  		/* This means receive shutdown. */
1669  		if (err == 0)
1670  			err = -EINVAL;
1671  		goto out;
1672  	}
1673  
1674  	tsk = skb->sk;
1675  	skb_free_datagram(sk, skb);
1676  	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1677  
1678  	/* attach accepted sock to socket */
1679  	unix_state_lock(tsk);
1680  	newsock->state = SS_CONNECTED;
1681  	unix_sock_inherit_flags(sock, newsock);
1682  	sock_graft(tsk, newsock);
1683  	unix_state_unlock(tsk);
1684  	return 0;
1685  
1686  out:
1687  	return err;
1688  }
1689  
1690  
1691  static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1692  {
1693  	struct sock *sk = sock->sk;
1694  	struct unix_address *addr;
1695  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1696  	int err = 0;
1697  
1698  	if (peer) {
1699  		sk = unix_peer_get(sk);
1700  
1701  		err = -ENOTCONN;
1702  		if (!sk)
1703  			goto out;
1704  		err = 0;
1705  	} else {
1706  		sock_hold(sk);
1707  	}
1708  
1709  	addr = smp_load_acquire(&unix_sk(sk)->addr);
1710  	if (!addr) {
1711  		sunaddr->sun_family = AF_UNIX;
1712  		sunaddr->sun_path[0] = 0;
1713  		err = offsetof(struct sockaddr_un, sun_path);
1714  	} else {
1715  		err = addr->len;
1716  		memcpy(sunaddr, addr->name, addr->len);
1717  	}
1718  	sock_put(sk);
1719  out:
1720  	return err;
1721  }
1722  
1723  static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1724  {
1725  	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1726  
1727  	/*
1728  	 * Garbage collection of unix sockets starts by selecting a set of
1729  	 * candidate sockets which have reference only from being in flight
1730  	 * (total_refs == inflight_refs).  This condition is checked once during
1731  	 * the candidate collection phase, and candidates are marked as such, so
1732  	 * that non-candidates can later be ignored.  While inflight_refs is
1733  	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1734  	 * is an instantaneous decision.
1735  	 *
1736  	 * Once a candidate, however, the socket must not be reinstalled into a
1737  	 * file descriptor while the garbage collection is in progress.
1738  	 *
1739  	 * If the above conditions are met, then the directed graph of
1740  	 * candidates (*) does not change while unix_gc_lock is held.
1741  	 *
1742  	 * Any operation that changes the file count through file descriptors
1743  	 * (dup, close, sendmsg) does not change the graph since candidates are
1744  	 * not installed in fds.
1745  	 *
1746  	 * Dequeuing a candidate via recvmsg would install it into an fd, but
1747  	 * that takes unix_gc_lock to decrement the inflight count, so it's
1748  	 * serialized with garbage collection.
1749  	 *
1750  	 * MSG_PEEK is special in that it does not change the inflight count,
1751  	 * yet does install the socket into an fd.  The following lock/unlock
1752  	 * pair is to ensure serialization with garbage collection.  It must be
1753  	 * done between incrementing the file count and installing the file into
1754  	 * an fd.
1755  	 *
1756  	 * If garbage collection starts after the barrier provided by the
1757  	 * lock/unlock, then it will see the elevated refcount and not mark this
1758  	 * as a candidate.  If a garbage collection is already in progress
1759  	 * before the file count was incremented, then the lock/unlock pair will
1760  	 * ensure that garbage collection is finished before progressing to
1761  	 * installing the fd.
1762  	 *
1763  	 * (*) A -> B where B is on the queue of A or B is on the queue of C
1764  	 * which is on the queue of listening socket A.
1765  	 */
1766  	spin_lock(&unix_gc_lock);
1767  	spin_unlock(&unix_gc_lock);
1768  }
1769  
1770  static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1771  {
1772  	int err = 0;
1773  
1774  	UNIXCB(skb).pid  = get_pid(scm->pid);
1775  	UNIXCB(skb).uid = scm->creds.uid;
1776  	UNIXCB(skb).gid = scm->creds.gid;
1777  	UNIXCB(skb).fp = NULL;
1778  	unix_get_secdata(scm, skb);
1779  	if (scm->fp && send_fds)
1780  		err = unix_attach_fds(scm, skb);
1781  
1782  	skb->destructor = unix_destruct_scm;
1783  	return err;
1784  }
1785  
1786  static bool unix_passcred_enabled(const struct socket *sock,
1787  				  const struct sock *other)
1788  {
1789  	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1790  	       !other->sk_socket ||
1791  	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1792  }
1793  
1794  /*
1795   * Some apps rely on write() giving SCM_CREDENTIALS.
1796   * We include credentials if source or destination socket
1797   * asserted SOCK_PASSCRED.
1798   */
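/* Illustrative userspace sketch of the receiving side: opt in with
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *
 * and read the sender's struct ucred from the SCM_CREDENTIALS control
 * message returned by recvmsg().
 */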
1799  static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1800  			    const struct sock *other)
1801  {
1802  	if (UNIXCB(skb).pid)
1803  		return;
1804  	if (unix_passcred_enabled(sock, other)) {
1805  		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1806  		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1807  	}
1808  }
1809  
1810  static int maybe_init_creds(struct scm_cookie *scm,
1811  			    struct socket *socket,
1812  			    const struct sock *other)
1813  {
1814  	int err;
1815  	struct msghdr msg = { .msg_controllen = 0 };
1816  
1817  	err = scm_send(socket, &msg, scm, false);
1818  	if (err)
1819  		return err;
1820  
1821  	if (unix_passcred_enabled(socket, other)) {
1822  		scm->pid = get_pid(task_tgid(current));
1823  		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1824  	}
1825  	return err;
1826  }
1827  
1828  static bool unix_skb_scm_eq(struct sk_buff *skb,
1829  			    struct scm_cookie *scm)
1830  {
1831  	return UNIXCB(skb).pid == scm->pid &&
1832  	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1833  	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1834  	       unix_secdata_eq(scm, skb);
1835  }
1836  
1837  static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1838  {
1839  	struct scm_fp_list *fp = UNIXCB(skb).fp;
1840  	struct unix_sock *u = unix_sk(sk);
1841  
1842  	if (unlikely(fp && fp->count))
1843  		atomic_add(fp->count, &u->scm_stat.nr_fds);
1844  }
1845  
1846  static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1847  {
1848  	struct scm_fp_list *fp = UNIXCB(skb).fp;
1849  	struct unix_sock *u = unix_sk(sk);
1850  
1851  	if (unlikely(fp && fp->count))
1852  		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1853  }
1854  
1855  /*
1856   *	Send AF_UNIX data.
1857   */
1858  
1859  static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1860  			      size_t len)
1861  {
1862  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1863  	struct sock *sk = sock->sk, *other = NULL;
1864  	struct unix_sock *u = unix_sk(sk);
1865  	struct scm_cookie scm;
1866  	struct sk_buff *skb;
1867  	int data_len = 0;
1868  	int sk_locked;
1869  	long timeo;
1870  	int err;
1871  
1872  	wait_for_unix_gc();
1873  	err = scm_send(sock, msg, &scm, false);
1874  	if (err < 0)
1875  		return err;
1876  
1877  	err = -EOPNOTSUPP;
1878  	if (msg->msg_flags&MSG_OOB)
1879  		goto out;
1880  
1881  	if (msg->msg_namelen) {
1882  		err = unix_validate_addr(sunaddr, msg->msg_namelen);
1883  		if (err)
1884  			goto out;
1885  	} else {
1886  		sunaddr = NULL;
1887  		err = -ENOTCONN;
1888  		other = unix_peer_get(sk);
1889  		if (!other)
1890  			goto out;
1891  	}
1892  
1893  	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1894  		err = unix_autobind(sk);
1895  		if (err)
1896  			goto out;
1897  	}
1898  
1899  	err = -EMSGSIZE;
1900  	if (len > sk->sk_sndbuf - 32)
1901  		goto out;
1902  
1903  	if (len > SKB_MAX_ALLOC) {
1904  		data_len = min_t(size_t,
1905  				 len - SKB_MAX_ALLOC,
1906  				 MAX_SKB_FRAGS * PAGE_SIZE);
1907  		data_len = PAGE_ALIGN(data_len);
1908  
1909  		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1910  	}
1911  
1912  	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1913  				   msg->msg_flags & MSG_DONTWAIT, &err,
1914  				   PAGE_ALLOC_COSTLY_ORDER);
1915  	if (skb == NULL)
1916  		goto out;
1917  
1918  	err = unix_scm_to_skb(&scm, skb, true);
1919  	if (err < 0)
1920  		goto out_free;
1921  
1922  	skb_put(skb, len - data_len);
1923  	skb->data_len = data_len;
1924  	skb->len = len;
1925  	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1926  	if (err)
1927  		goto out_free;
1928  
1929  	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1930  
1931  restart:
1932  	if (!other) {
1933  		err = -ECONNRESET;
1934  		if (sunaddr == NULL)
1935  			goto out_free;
1936  
1937  		other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1938  					sk->sk_type);
1939  		if (IS_ERR(other)) {
1940  			err = PTR_ERR(other);
1941  			other = NULL;
1942  			goto out_free;
1943  		}
1944  	}
1945  
1946  	if (sk_filter(other, skb) < 0) {
1947  		/* Toss the packet but do not return any error to the sender */
1948  		err = len;
1949  		goto out_free;
1950  	}
1951  
1952  	sk_locked = 0;
1953  	unix_state_lock(other);
1954  restart_locked:
1955  	err = -EPERM;
1956  	if (!unix_may_send(sk, other))
1957  		goto out_unlock;
1958  
1959  	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1960  		/*
1961  		 *	Check with 1003.1g - what error should a
1962  		 *	datagram send to a dead peer return?
1963  		 */
1964  		unix_state_unlock(other);
1965  		sock_put(other);
1966  
1967  		if (!sk_locked)
1968  			unix_state_lock(sk);
1969  
1970  		err = 0;
1971  		if (unix_peer(sk) == other) {
1972  			unix_peer(sk) = NULL;
1973  			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1974  
1975  			unix_state_unlock(sk);
1976  
1977  			sk->sk_state = TCP_CLOSE;
1978  			unix_dgram_disconnected(sk, other);
1979  			sock_put(other);
1980  			err = -ECONNREFUSED;
1981  		} else {
1982  			unix_state_unlock(sk);
1983  		}
1984  
1985  		other = NULL;
1986  		if (err)
1987  			goto out_free;
1988  		goto restart;
1989  	}
1990  
1991  	err = -EPIPE;
1992  	if (other->sk_shutdown & RCV_SHUTDOWN)
1993  		goto out_unlock;
1994  
1995  	if (sk->sk_type != SOCK_SEQPACKET) {
1996  		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1997  		if (err)
1998  			goto out_unlock;
1999  	}
2000  
2001  	/* other == sk && unix_peer(other) != sk can happen if
2002  	 * - unix_peer(sk) == NULL and the destination address is bound to sk
2003  	 * - unix_peer(sk) == sk at the time of the get, but it was
2004  	 *   disconnected before the lock was taken
2005  	 */
2005  	if (other != sk &&
2006  	    unlikely(unix_peer(other) != sk &&
2007  	    unix_recvq_full_lockless(other))) {
2008  		if (timeo) {
2009  			timeo = unix_wait_for_peer(other, timeo);
2010  
2011  			err = sock_intr_errno(timeo);
2012  			if (signal_pending(current))
2013  				goto out_free;
2014  
2015  			goto restart;
2016  		}
2017  
2018  		if (!sk_locked) {
2019  			unix_state_unlock(other);
2020  			unix_state_double_lock(sk, other);
2021  		}
2022  
2023  		if (unix_peer(sk) != other ||
2024  		    unix_dgram_peer_wake_me(sk, other)) {
2025  			err = -EAGAIN;
2026  			sk_locked = 1;
2027  			goto out_unlock;
2028  		}
2029  
2030  		if (!sk_locked) {
2031  			sk_locked = 1;
2032  			goto restart_locked;
2033  		}
2034  	}
2035  
2036  	if (unlikely(sk_locked))
2037  		unix_state_unlock(sk);
2038  
2039  	if (sock_flag(other, SOCK_RCVTSTAMP))
2040  		__net_timestamp(skb);
2041  	maybe_add_creds(skb, sock, other);
2042  	scm_stat_add(other, skb);
2043  	skb_queue_tail(&other->sk_receive_queue, skb);
2044  	unix_state_unlock(other);
2045  	other->sk_data_ready(other);
2046  	sock_put(other);
2047  	scm_destroy(&scm);
2048  	return len;
2049  
2050  out_unlock:
2051  	if (sk_locked)
2052  		unix_state_unlock(sk);
2053  	unix_state_unlock(other);
2054  out_free:
2055  	kfree_skb(skb);
2056  out:
2057  	if (other)
2058  		sock_put(other);
2059  	scm_destroy(&scm);
2060  	return err;
2061  }
2062  
2063  /* We use paged skbs for stream sockets, limiting occupancy to 32768
2064   * bytes, with a minimum of a full page.
2065   */
2066  #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
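
/*
 * For illustration, assuming 4 KiB pages: get_order(32768) == 3, so
 * UNIX_SKB_FRAGS_SZ == 4096 << 3 == 32768 and the paged part of a stream
 * skb is capped at eight pages.  With 64 KiB pages get_order(32768) == 0
 * and the macro evaluates to one full 64 KiB page - the "minimum of a
 * full page" mentioned above.
 */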
2067  
2068  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2069  static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
2070  {
2071  	struct unix_sock *ousk = unix_sk(other);
2072  	struct sk_buff *skb;
2073  	int err = 0;
2074  
2075  	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2076  
2077  	if (!skb)
2078  		return err;
2079  
2080  	skb_put(skb, 1);
2081  	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2082  
2083  	if (err) {
2084  		kfree_skb(skb);
2085  		return err;
2086  	}
2087  
2088  	unix_state_lock(other);
2089  
2090  	if (sock_flag(other, SOCK_DEAD) ||
2091  	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2092  		unix_state_unlock(other);
2093  		kfree_skb(skb);
2094  		return -EPIPE;
2095  	}
2096  
2097  	maybe_add_creds(skb, sock, other);
2098  	skb_get(skb);
2099  
2100  	if (ousk->oob_skb)
2101  		consume_skb(ousk->oob_skb);
2102  
2103  	WRITE_ONCE(ousk->oob_skb, skb);
2104  
2105  	scm_stat_add(other, skb);
2106  	skb_queue_tail(&other->sk_receive_queue, skb);
2107  	sk_send_sigurg(other);
2108  	unix_state_unlock(other);
2109  	other->sk_data_ready(other);
2110  
2111  	return err;
2112  }
2113  #endif
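
/*
 * Illustrative userspace sketch (assumes a kernel built with
 * CONFIG_AF_UNIX_OOB; variable names are for this example only, error
 * handling omitted): the last byte of a MSG_OOB send is queued by
 * queue_oob() above and can be fetched ahead of the normal byte stream.
 *
 *	#include <sys/socket.h>
 *
 *	int sv[2];
 *	char c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	send(sv[0], "x", 1, MSG_OOB);
 *	recv(sv[1], &c, 1, MSG_OOB);	// c == 'x'; the receiver also gets
 *					// SIGURG if it asked via F_SETOWN
 */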
2114  
2115  static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2116  			       size_t len)
2117  {
2118  	struct sock *sk = sock->sk;
2119  	struct sock *other = NULL;
2120  	int err, size;
2121  	struct sk_buff *skb;
2122  	int sent = 0;
2123  	struct scm_cookie scm;
2124  	bool fds_sent = false;
2125  	int data_len;
2126  
2127  	wait_for_unix_gc();
2128  	err = scm_send(sock, msg, &scm, false);
2129  	if (err < 0)
2130  		return err;
2131  
2132  	err = -EOPNOTSUPP;
2133  	if (msg->msg_flags & MSG_OOB) {
2134  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2135  		if (len)
2136  			len--;
2137  		else
2138  #endif
2139  			goto out_err;
2140  	}
2141  
2142  	if (msg->msg_namelen) {
2143  		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2144  		goto out_err;
2145  	} else {
2146  		err = -ENOTCONN;
2147  		other = unix_peer(sk);
2148  		if (!other)
2149  			goto out_err;
2150  	}
2151  
2152  	if (sk->sk_shutdown & SEND_SHUTDOWN)
2153  		goto pipe_err;
2154  
2155  	while (sent < len) {
2156  		size = len - sent;
2157  
2158  		/* Keep two messages in the pipe so it schedules better */
2159  		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2160  
2161  		/* allow fallback to order-0 allocations */
2162  		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2163  
2164  		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2165  
2166  		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2167  
2168  		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2169  					   msg->msg_flags & MSG_DONTWAIT, &err,
2170  					   get_order(UNIX_SKB_FRAGS_SZ));
2171  		if (!skb)
2172  			goto out_err;
2173  
2174  		/* Only send the fds in the first buffer */
2175  		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2176  		if (err < 0) {
2177  			kfree_skb(skb);
2178  			goto out_err;
2179  		}
2180  		fds_sent = true;
2181  
2182  		skb_put(skb, size - data_len);
2183  		skb->data_len = data_len;
2184  		skb->len = size;
2185  		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2186  		if (err) {
2187  			kfree_skb(skb);
2188  			goto out_err;
2189  		}
2190  
2191  		unix_state_lock(other);
2192  
2193  		if (sock_flag(other, SOCK_DEAD) ||
2194  		    (other->sk_shutdown & RCV_SHUTDOWN))
2195  			goto pipe_err_free;
2196  
2197  		maybe_add_creds(skb, sock, other);
2198  		scm_stat_add(other, skb);
2199  		skb_queue_tail(&other->sk_receive_queue, skb);
2200  		unix_state_unlock(other);
2201  		other->sk_data_ready(other);
2202  		sent += size;
2203  	}
2204  
2205  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2206  	if (msg->msg_flags & MSG_OOB) {
2207  		err = queue_oob(sock, msg, other);
2208  		if (err)
2209  			goto out_err;
2210  		sent++;
2211  	}
2212  #endif
2213  
2214  	scm_destroy(&scm);
2215  
2216  	return sent;
2217  
2218  pipe_err_free:
2219  	unix_state_unlock(other);
2220  	kfree_skb(skb);
2221  pipe_err:
2222  	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2223  		send_sig(SIGPIPE, current, 0);
2224  	err = -EPIPE;
2225  out_err:
2226  	scm_destroy(&scm);
2227  	return sent ? : err;
2228  }
2229  
2230  static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2231  				    int offset, size_t size, int flags)
2232  {
2233  	int err;
2234  	bool send_sigpipe = false;
2235  	bool init_scm = true;
2236  	struct scm_cookie scm;
2237  	struct sock *other, *sk = socket->sk;
2238  	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2239  
2240  	if (flags & MSG_OOB)
2241  		return -EOPNOTSUPP;
2242  
2243  	other = unix_peer(sk);
2244  	if (!other || sk->sk_state != TCP_ESTABLISHED)
2245  		return -ENOTCONN;
2246  
2247  	if (false) {
2248  alloc_skb:
2249  		unix_state_unlock(other);
2250  		mutex_unlock(&unix_sk(other)->iolock);
2251  		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2252  					      &err, 0);
2253  		if (!newskb)
2254  			goto err;
2255  	}
2256  
2257  	/* we must acquire iolock as we modify already present
2258  	 * skbs in the sk_receive_queue and mess with skb->len
2259  	 */
2260  	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2261  	if (err) {
2262  		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2263  		goto err;
2264  	}
2265  
2266  	if (sk->sk_shutdown & SEND_SHUTDOWN) {
2267  		err = -EPIPE;
2268  		send_sigpipe = true;
2269  		goto err_unlock;
2270  	}
2271  
2272  	unix_state_lock(other);
2273  
2274  	if (sock_flag(other, SOCK_DEAD) ||
2275  	    other->sk_shutdown & RCV_SHUTDOWN) {
2276  		err = -EPIPE;
2277  		send_sigpipe = true;
2278  		goto err_state_unlock;
2279  	}
2280  
2281  	if (init_scm) {
2282  		err = maybe_init_creds(&scm, socket, other);
2283  		if (err)
2284  			goto err_state_unlock;
2285  		init_scm = false;
2286  	}
2287  
2288  	skb = skb_peek_tail(&other->sk_receive_queue);
2289  	if (tail && tail == skb) {
2290  		skb = newskb;
2291  	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2292  		if (newskb) {
2293  			skb = newskb;
2294  		} else {
2295  			tail = skb;
2296  			goto alloc_skb;
2297  		}
2298  	} else if (newskb) {
2299  		/* This is the fast path; strictly we would not need the
2300  		 * newskb check here, since calling consume_skb() with
2301  		 * newskb == NULL does no harm.
2302  		 */
2303  		consume_skb(newskb);
2304  		newskb = NULL;
2305  	}
2306  
2307  	if (skb_append_pagefrags(skb, page, offset, size)) {
2308  		tail = skb;
2309  		goto alloc_skb;
2310  	}
2311  
2312  	skb->len += size;
2313  	skb->data_len += size;
2314  	skb->truesize += size;
2315  	refcount_add(size, &sk->sk_wmem_alloc);
2316  
2317  	if (newskb) {
2318  		err = unix_scm_to_skb(&scm, skb, false);
2319  		if (err)
2320  			goto err_state_unlock;
2321  		spin_lock(&other->sk_receive_queue.lock);
2322  		__skb_queue_tail(&other->sk_receive_queue, newskb);
2323  		spin_unlock(&other->sk_receive_queue.lock);
2324  	}
2325  
2326  	unix_state_unlock(other);
2327  	mutex_unlock(&unix_sk(other)->iolock);
2328  
2329  	other->sk_data_ready(other);
2330  	scm_destroy(&scm);
2331  	return size;
2332  
2333  err_state_unlock:
2334  	unix_state_unlock(other);
2335  err_unlock:
2336  	mutex_unlock(&unix_sk(other)->iolock);
2337  err:
2338  	kfree_skb(newskb);
2339  	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2340  		send_sig(SIGPIPE, current, 0);
2341  	if (!init_scm)
2342  		scm_destroy(&scm);
2343  	return err;
2344  }
2345  
2346  static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2347  				  size_t len)
2348  {
2349  	int err;
2350  	struct sock *sk = sock->sk;
2351  
2352  	err = sock_error(sk);
2353  	if (err)
2354  		return err;
2355  
2356  	if (sk->sk_state != TCP_ESTABLISHED)
2357  		return -ENOTCONN;
2358  
2359  	if (msg->msg_namelen)
2360  		msg->msg_namelen = 0;
2361  
2362  	return unix_dgram_sendmsg(sock, msg, len);
2363  }
2364  
2365  static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2366  				  size_t size, int flags)
2367  {
2368  	struct sock *sk = sock->sk;
2369  
2370  	if (sk->sk_state != TCP_ESTABLISHED)
2371  		return -ENOTCONN;
2372  
2373  	return unix_dgram_recvmsg(sock, msg, size, flags);
2374  }
2375  
2376  static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2377  {
2378  	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2379  
2380  	if (addr) {
2381  		msg->msg_namelen = addr->len;
2382  		memcpy(msg->msg_name, addr->name, addr->len);
2383  	}
2384  }
2385  
2386  int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2387  			 int flags)
2388  {
2389  	struct scm_cookie scm;
2390  	struct socket *sock = sk->sk_socket;
2391  	struct unix_sock *u = unix_sk(sk);
2392  	struct sk_buff *skb, *last;
2393  	long timeo;
2394  	int skip;
2395  	int err;
2396  
2397  	err = -EOPNOTSUPP;
2398  	if (flags&MSG_OOB)
2399  		goto out;
2400  
2401  	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2402  
2403  	do {
2404  		mutex_lock(&u->iolock);
2405  
2406  		skip = sk_peek_offset(sk, flags);
2407  		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2408  					      &skip, &err, &last);
2409  		if (skb) {
2410  			if (!(flags & MSG_PEEK))
2411  				scm_stat_del(sk, skb);
2412  			break;
2413  		}
2414  
2415  		mutex_unlock(&u->iolock);
2416  
2417  		if (err != -EAGAIN)
2418  			break;
2419  	} while (timeo &&
2420  		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2421  					      &err, &timeo, last));
2422  
2423  	if (!skb) { /* implies iolock unlocked */
2424  		unix_state_lock(sk);
2425  		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2426  		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2427  		    (sk->sk_shutdown & RCV_SHUTDOWN))
2428  			err = 0;
2429  		unix_state_unlock(sk);
2430  		goto out;
2431  	}
2432  
2433  	if (wq_has_sleeper(&u->peer_wait))
2434  		wake_up_interruptible_sync_poll(&u->peer_wait,
2435  						EPOLLOUT | EPOLLWRNORM |
2436  						EPOLLWRBAND);
2437  
2438  	if (msg->msg_name)
2439  		unix_copy_addr(msg, skb->sk);
2440  
2441  	if (size > skb->len - skip)
2442  		size = skb->len - skip;
2443  	else if (size < skb->len - skip)
2444  		msg->msg_flags |= MSG_TRUNC;
2445  
2446  	err = skb_copy_datagram_msg(skb, skip, msg, size);
2447  	if (err)
2448  		goto out_free;
2449  
2450  	if (sock_flag(sk, SOCK_RCVTSTAMP))
2451  		__sock_recv_timestamp(msg, sk, skb);
2452  
2453  	memset(&scm, 0, sizeof(scm));
2454  
2455  	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2456  	unix_set_secdata(&scm, skb);
2457  
2458  	if (!(flags & MSG_PEEK)) {
2459  		if (UNIXCB(skb).fp)
2460  			unix_detach_fds(&scm, skb);
2461  
2462  		sk_peek_offset_bwd(sk, skb->len);
2463  	} else {
2464  		/* It is questionable: on PEEK we could:
2465  		   - not return fds - good, but too simple 8)
2466  		   - return fds, and not return them on a later read (the old
2467  		     strategy, apparently wrong)
2468  		   - clone fds (I chose it for now; it is the most universal
2469  		     solution)
2470  
2471  		   POSIX 1003.1g does not actually define this clearly
2472  		   at all.  POSIX 1003.1g doesn't define a lot of things
2473  		   clearly, however!
2474  
2475  		*/
2476  
2477  		sk_peek_offset_fwd(sk, size);
2478  
2479  		if (UNIXCB(skb).fp)
2480  			unix_peek_fds(&scm, skb);
2481  	}
2482  	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2483  
2484  	scm_recv(sock, msg, &scm, flags);
2485  
2486  out_free:
2487  	skb_free_datagram(sk, skb);
2488  	mutex_unlock(&u->iolock);
2489  out:
2490  	return err;
2491  }
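
/*
 * Illustrative userspace sketch (names assumed, error handling omitted):
 * because the MSG_TRUNC return value computed above is the full datagram
 * length, a receiver can size its buffer before actually consuming the
 * message.
 *
 *	#include <stdlib.h>
 *	#include <sys/socket.h>
 *
 *	ssize_t need = recv(sock, NULL, 0, MSG_PEEK | MSG_TRUNC);
 *	char *buf = malloc(need);
 *
 *	recv(sock, buf, need, 0);	// consumes the datagram this time
 */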
2492  
2493  static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2494  			      int flags)
2495  {
2496  	struct sock *sk = sock->sk;
2497  
2498  #ifdef CONFIG_BPF_SYSCALL
2499  	const struct proto *prot = READ_ONCE(sk->sk_prot);
2500  
2501  	if (prot != &unix_dgram_proto)
2502  		return prot->recvmsg(sk, msg, size, flags, NULL);
2503  #endif
2504  	return __unix_dgram_recvmsg(sk, msg, size, flags);
2505  }
2506  
2507  static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2508  {
2509  	int copied = 0;
2510  
2511  	while (1) {
2512  		struct unix_sock *u = unix_sk(sk);
2513  		struct sk_buff *skb;
2514  		int used, err;
2515  
2516  		mutex_lock(&u->iolock);
2517  		skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2518  		mutex_unlock(&u->iolock);
2519  		if (!skb)
2520  			return err;
2521  
2522  		used = recv_actor(sk, skb);
2523  		if (used <= 0) {
2524  			if (!copied)
2525  				copied = used;
2526  			kfree_skb(skb);
2527  			break;
2528  		} else if (used <= skb->len) {
2529  			copied += used;
2530  		}
2531  
2532  		kfree_skb(skb);
2533  		break;
2534  	}
2535  
2536  	return copied;
2537  }
2538  
2539  /*
2540   *	Sleep until more data has arrived, but check for races.
2541   */
2542  static long unix_stream_data_wait(struct sock *sk, long timeo,
2543  				  struct sk_buff *last, unsigned int last_len,
2544  				  bool freezable)
2545  {
2546  	struct sk_buff *tail;
2547  	DEFINE_WAIT(wait);
2548  
2549  	unix_state_lock(sk);
2550  
2551  	for (;;) {
2552  		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2553  
2554  		tail = skb_peek_tail(&sk->sk_receive_queue);
2555  		if (tail != last ||
2556  		    (tail && tail->len != last_len) ||
2557  		    sk->sk_err ||
2558  		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2559  		    signal_pending(current) ||
2560  		    !timeo)
2561  			break;
2562  
2563  		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2564  		unix_state_unlock(sk);
2565  		if (freezable)
2566  			timeo = freezable_schedule_timeout(timeo);
2567  		else
2568  			timeo = schedule_timeout(timeo);
2569  		unix_state_lock(sk);
2570  
2571  		if (sock_flag(sk, SOCK_DEAD))
2572  			break;
2573  
2574  		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2575  	}
2576  
2577  	finish_wait(sk_sleep(sk), &wait);
2578  	unix_state_unlock(sk);
2579  	return timeo;
2580  }
2581  
2582  static unsigned int unix_skb_len(const struct sk_buff *skb)
2583  {
2584  	return skb->len - UNIXCB(skb).consumed;
2585  }
2586  
2587  struct unix_stream_read_state {
2588  	int (*recv_actor)(struct sk_buff *, int, int,
2589  			  struct unix_stream_read_state *);
2590  	struct socket *socket;
2591  	struct msghdr *msg;
2592  	struct pipe_inode_info *pipe;
2593  	size_t size;
2594  	int flags;
2595  	unsigned int splice_flags;
2596  };
2597  
2598  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2599  static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2600  {
2601  	struct socket *sock = state->socket;
2602  	struct sock *sk = sock->sk;
2603  	struct unix_sock *u = unix_sk(sk);
2604  	int chunk = 1;
2605  	struct sk_buff *oob_skb;
2606  
2607  	mutex_lock(&u->iolock);
2608  	unix_state_lock(sk);
2609  
2610  	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2611  		unix_state_unlock(sk);
2612  		mutex_unlock(&u->iolock);
2613  		return -EINVAL;
2614  	}
2615  
2616  	oob_skb = u->oob_skb;
2617  
2618  	if (!(state->flags & MSG_PEEK))
2619  		WRITE_ONCE(u->oob_skb, NULL);
2620  
2621  	unix_state_unlock(sk);
2622  
2623  	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2624  
2625  	if (!(state->flags & MSG_PEEK)) {
2626  		UNIXCB(oob_skb).consumed += 1;
2627  		kfree_skb(oob_skb);
2628  	}
2629  
2630  	mutex_unlock(&u->iolock);
2631  
2632  	if (chunk < 0)
2633  		return -EFAULT;
2634  
2635  	state->msg->msg_flags |= MSG_OOB;
2636  	return 1;
2637  }
2638  
2639  static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2640  				  int flags, int copied)
2641  {
2642  	struct unix_sock *u = unix_sk(sk);
2643  
2644  	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2645  		skb_unlink(skb, &sk->sk_receive_queue);
2646  		consume_skb(skb);
2647  		skb = NULL;
2648  	} else {
2649  		if (skb == u->oob_skb) {
2650  			if (copied) {
2651  				skb = NULL;
2652  			} else if (sock_flag(sk, SOCK_URGINLINE)) {
2653  				if (!(flags & MSG_PEEK)) {
2654  					WRITE_ONCE(u->oob_skb, NULL);
2655  					consume_skb(skb);
2656  				}
2657  			} else if (!(flags & MSG_PEEK)) {
2658  				skb_unlink(skb, &sk->sk_receive_queue);
2659  				consume_skb(skb);
2660  				skb = skb_peek(&sk->sk_receive_queue);
2661  			}
2662  		}
2663  	}
2664  	return skb;
2665  }
2666  #endif
2667  
2668  static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2669  {
2670  	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2671  		return -ENOTCONN;
2672  
2673  	return unix_read_skb(sk, recv_actor);
2674  }
2675  
2676  static int unix_stream_read_generic(struct unix_stream_read_state *state,
2677  				    bool freezable)
2678  {
2679  	struct scm_cookie scm;
2680  	struct socket *sock = state->socket;
2681  	struct sock *sk = sock->sk;
2682  	struct unix_sock *u = unix_sk(sk);
2683  	int copied = 0;
2684  	int flags = state->flags;
2685  	int noblock = flags & MSG_DONTWAIT;
2686  	bool check_creds = false;
2687  	int target;
2688  	int err = 0;
2689  	long timeo;
2690  	int skip;
2691  	size_t size = state->size;
2692  	unsigned int last_len;
2693  
2694  	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2695  		err = -EINVAL;
2696  		goto out;
2697  	}
2698  
2699  	if (unlikely(flags & MSG_OOB)) {
2700  		err = -EOPNOTSUPP;
2701  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2702  		err = unix_stream_recv_urg(state);
2703  #endif
2704  		goto out;
2705  	}
2706  
2707  	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2708  	timeo = sock_rcvtimeo(sk, noblock);
2709  
2710  	memset(&scm, 0, sizeof(scm));
2711  
2712  	/* Lock the socket to prevent queue disordering
2713  	 * while we sleep copying data out to the message.
2714  	 */
2715  	mutex_lock(&u->iolock);
2716  
2717  	skip = max(sk_peek_offset(sk, flags), 0);
2718  
2719  	do {
2720  		int chunk;
2721  		bool drop_skb;
2722  		struct sk_buff *skb, *last;
2723  
2724  redo:
2725  		unix_state_lock(sk);
2726  		if (sock_flag(sk, SOCK_DEAD)) {
2727  			err = -ECONNRESET;
2728  			goto unlock;
2729  		}
2730  		last = skb = skb_peek(&sk->sk_receive_queue);
2731  		last_len = last ? last->len : 0;
2732  
2733  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2734  		if (skb) {
2735  			skb = manage_oob(skb, sk, flags, copied);
2736  			if (!skb) {
2737  				unix_state_unlock(sk);
2738  				if (copied)
2739  					break;
2740  				goto redo;
2741  			}
2742  		}
2743  #endif
2744  again:
2745  		if (skb == NULL) {
2746  			if (copied >= target)
2747  				goto unlock;
2748  
2749  			/*
2750  			 *	POSIX 1003.1g mandates this order.
2751  			 */
2752  
2753  			err = sock_error(sk);
2754  			if (err)
2755  				goto unlock;
2756  			if (sk->sk_shutdown & RCV_SHUTDOWN)
2757  				goto unlock;
2758  
2759  			unix_state_unlock(sk);
2760  			if (!timeo) {
2761  				err = -EAGAIN;
2762  				break;
2763  			}
2764  
2765  			mutex_unlock(&u->iolock);
2766  
2767  			timeo = unix_stream_data_wait(sk, timeo, last,
2768  						      last_len, freezable);
2769  
2770  			if (signal_pending(current)) {
2771  				err = sock_intr_errno(timeo);
2772  				scm_destroy(&scm);
2773  				goto out;
2774  			}
2775  
2776  			mutex_lock(&u->iolock);
2777  			goto redo;
2778  unlock:
2779  			unix_state_unlock(sk);
2780  			break;
2781  		}
2782  
2783  		while (skip >= unix_skb_len(skb)) {
2784  			skip -= unix_skb_len(skb);
2785  			last = skb;
2786  			last_len = skb->len;
2787  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2788  			if (!skb)
2789  				goto again;
2790  		}
2791  
2792  		unix_state_unlock(sk);
2793  
2794  		if (check_creds) {
2795  			/* Never glue messages from different writers */
2796  			if (!unix_skb_scm_eq(skb, &scm))
2797  				break;
2798  		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2799  			/* Copy credentials */
2800  			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2801  			unix_set_secdata(&scm, skb);
2802  			check_creds = true;
2803  		}
2804  
2805  		/* Copy address just once */
2806  		if (state->msg && state->msg->msg_name) {
2807  			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2808  					 state->msg->msg_name);
2809  			unix_copy_addr(state->msg, skb->sk);
2810  			sunaddr = NULL;
2811  		}
2812  
2813  		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2814  		skb_get(skb);
2815  		chunk = state->recv_actor(skb, skip, chunk, state);
2816  		drop_skb = !unix_skb_len(skb);
2817  		/* skb is only safe to use if !drop_skb */
2818  		consume_skb(skb);
2819  		if (chunk < 0) {
2820  			if (copied == 0)
2821  				copied = -EFAULT;
2822  			break;
2823  		}
2824  		copied += chunk;
2825  		size -= chunk;
2826  
2827  		if (drop_skb) {
2828  			/* The skb was fully consumed by a concurrent reader;
2829  			 * we must not expect anything more from it and should
2830  			 * treat it as invalid - we can be sure it has been
2831  			 * dropped from the socket queue.
2832  			 *
2833  			 * Report a short read instead.
2834  			 */
2835  			err = 0;
2836  			break;
2837  		}
2838  
2839  		/* Mark read part of skb as used */
2840  		if (!(flags & MSG_PEEK)) {
2841  			UNIXCB(skb).consumed += chunk;
2842  
2843  			sk_peek_offset_bwd(sk, chunk);
2844  
2845  			if (UNIXCB(skb).fp) {
2846  				scm_stat_del(sk, skb);
2847  				unix_detach_fds(&scm, skb);
2848  			}
2849  
2850  			if (unix_skb_len(skb))
2851  				break;
2852  
2853  			skb_unlink(skb, &sk->sk_receive_queue);
2854  			consume_skb(skb);
2855  
2856  			if (scm.fp)
2857  				break;
2858  		} else {
2859  			/* It is questionable, see note in unix_dgram_recvmsg.
2860  			 */
2861  			if (UNIXCB(skb).fp)
2862  				unix_peek_fds(&scm, skb);
2863  
2864  			sk_peek_offset_fwd(sk, chunk);
2865  
2866  			if (UNIXCB(skb).fp)
2867  				break;
2868  
2869  			skip = 0;
2870  			last = skb;
2871  			last_len = skb->len;
2872  			unix_state_lock(sk);
2873  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2874  			if (skb)
2875  				goto again;
2876  			unix_state_unlock(sk);
2877  			break;
2878  		}
2879  	} while (size);
2880  
2881  	mutex_unlock(&u->iolock);
2882  	if (state->msg)
2883  		scm_recv(sock, state->msg, &scm, flags);
2884  	else
2885  		scm_destroy(&scm);
2886  out:
2887  	return copied ? : err;
2888  }
2889  
2890  static int unix_stream_read_actor(struct sk_buff *skb,
2891  				  int skip, int chunk,
2892  				  struct unix_stream_read_state *state)
2893  {
2894  	int ret;
2895  
2896  	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2897  				    state->msg, chunk);
2898  	return ret ?: chunk;
2899  }
2900  
2901  int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2902  			  size_t size, int flags)
2903  {
2904  	struct unix_stream_read_state state = {
2905  		.recv_actor = unix_stream_read_actor,
2906  		.socket = sk->sk_socket,
2907  		.msg = msg,
2908  		.size = size,
2909  		.flags = flags
2910  	};
2911  
2912  	return unix_stream_read_generic(&state, true);
2913  }
2914  
2915  static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2916  			       size_t size, int flags)
2917  {
2918  	struct unix_stream_read_state state = {
2919  		.recv_actor = unix_stream_read_actor,
2920  		.socket = sock,
2921  		.msg = msg,
2922  		.size = size,
2923  		.flags = flags
2924  	};
2925  
2926  #ifdef CONFIG_BPF_SYSCALL
2927  	struct sock *sk = sock->sk;
2928  	const struct proto *prot = READ_ONCE(sk->sk_prot);
2929  
2930  	if (prot != &unix_stream_proto)
2931  		return prot->recvmsg(sk, msg, size, flags, NULL);
2932  #endif
2933  	return unix_stream_read_generic(&state, true);
2934  }
2935  
2936  static int unix_stream_splice_actor(struct sk_buff *skb,
2937  				    int skip, int chunk,
2938  				    struct unix_stream_read_state *state)
2939  {
2940  	return skb_splice_bits(skb, state->socket->sk,
2941  			       UNIXCB(skb).consumed + skip,
2942  			       state->pipe, chunk, state->splice_flags);
2943  }
2944  
2945  static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2946  				       struct pipe_inode_info *pipe,
2947  				       size_t size, unsigned int flags)
2948  {
2949  	struct unix_stream_read_state state = {
2950  		.recv_actor = unix_stream_splice_actor,
2951  		.socket = sock,
2952  		.pipe = pipe,
2953  		.size = size,
2954  		.splice_flags = flags,
2955  	};
2956  
2957  	if (unlikely(*ppos))
2958  		return -ESPIPE;
2959  
2960  	if (sock->file->f_flags & O_NONBLOCK ||
2961  	    flags & SPLICE_F_NONBLOCK)
2962  		state.flags = MSG_DONTWAIT;
2963  
2964  	return unix_stream_read_generic(&state, false);
2965  }
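
/*
 * Illustrative userspace sketch (needs _GNU_SOURCE for splice(); names
 * assumed, error handling omitted): bytes queued on a connected AF_UNIX
 * stream socket can be spliced into a pipe without a userspace copy,
 * which is the path unix_stream_splice_read() above implements.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int pipefd[2];
 *
 *	pipe(pipefd);
 *	splice(sock, NULL, pipefd[1], NULL, 4096, SPLICE_F_NONBLOCK);
 */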
2966  
2967  static int unix_shutdown(struct socket *sock, int mode)
2968  {
2969  	struct sock *sk = sock->sk;
2970  	struct sock *other;
2971  
2972  	if (mode < SHUT_RD || mode > SHUT_RDWR)
2973  		return -EINVAL;
2974  	/* This maps:
2975  	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2976  	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2977  	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2978  	 */
2979  	++mode;
2980  
2981  	unix_state_lock(sk);
2982  	sk->sk_shutdown |= mode;
2983  	other = unix_peer(sk);
2984  	if (other)
2985  		sock_hold(other);
2986  	unix_state_unlock(sk);
2987  	sk->sk_state_change(sk);
2988  
2989  	if (other &&
2990  		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2991  
2992  		int peer_mode = 0;
2993  		const struct proto *prot = READ_ONCE(other->sk_prot);
2994  
2995  		if (prot->unhash)
2996  			prot->unhash(other);
2997  		if (mode&RCV_SHUTDOWN)
2998  			peer_mode |= SEND_SHUTDOWN;
2999  		if (mode&SEND_SHUTDOWN)
3000  			peer_mode |= RCV_SHUTDOWN;
3001  		unix_state_lock(other);
3002  		other->sk_shutdown |= peer_mode;
3003  		unix_state_unlock(other);
3004  		other->sk_state_change(other);
3005  		if (peer_mode == SHUTDOWN_MASK)
3006  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3007  		else if (peer_mode & RCV_SHUTDOWN)
3008  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3009  	}
3010  	if (other)
3011  		sock_put(other);
3012  
3013  	return 0;
3014  }
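
/*
 * Illustrative userspace sketch of the shutdown mapping above (names
 * assumed, error handling omitted): shutting down the write side of one
 * end gives the peer end-of-file on read, while the opposite direction
 * keeps working.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int sv[2];
 *	char c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	shutdown(sv[0], SHUT_WR);	// peer gets RCV_SHUTDOWN
 *	read(sv[1], &c, 1);		// returns 0: end of file
 *	write(sv[1], "x", 1);		// still succeeds towards sv[0]
 */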
3015  
3016  long unix_inq_len(struct sock *sk)
3017  {
3018  	struct sk_buff *skb;
3019  	long amount = 0;
3020  
3021  	if (sk->sk_state == TCP_LISTEN)
3022  		return -EINVAL;
3023  
3024  	spin_lock(&sk->sk_receive_queue.lock);
3025  	if (sk->sk_type == SOCK_STREAM ||
3026  	    sk->sk_type == SOCK_SEQPACKET) {
3027  		skb_queue_walk(&sk->sk_receive_queue, skb)
3028  			amount += unix_skb_len(skb);
3029  	} else {
3030  		skb = skb_peek(&sk->sk_receive_queue);
3031  		if (skb)
3032  			amount = skb->len;
3033  	}
3034  	spin_unlock(&sk->sk_receive_queue.lock);
3035  
3036  	return amount;
3037  }
3038  EXPORT_SYMBOL_GPL(unix_inq_len);
3039  
3040  long unix_outq_len(struct sock *sk)
3041  {
3042  	return sk_wmem_alloc_get(sk);
3043  }
3044  EXPORT_SYMBOL_GPL(unix_outq_len);
3045  
3046  static int unix_open_file(struct sock *sk)
3047  {
3048  	struct path path;
3049  	struct file *f;
3050  	int fd;
3051  
3052  	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3053  		return -EPERM;
3054  
3055  	if (!smp_load_acquire(&unix_sk(sk)->addr))
3056  		return -ENOENT;
3057  
3058  	path = unix_sk(sk)->path;
3059  	if (!path.dentry)
3060  		return -ENOENT;
3061  
3062  	path_get(&path);
3063  
3064  	fd = get_unused_fd_flags(O_CLOEXEC);
3065  	if (fd < 0)
3066  		goto out;
3067  
3068  	f = dentry_open(&path, O_PATH, current_cred());
3069  	if (IS_ERR(f)) {
3070  		put_unused_fd(fd);
3071  		fd = PTR_ERR(f);
3072  		goto out;
3073  	}
3074  
3075  	fd_install(fd, f);
3076  out:
3077  	path_put(&path);
3078  
3079  	return fd;
3080  }
3081  
3082  static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3083  {
3084  	struct sock *sk = sock->sk;
3085  	long amount = 0;
3086  	int err;
3087  
3088  	switch (cmd) {
3089  	case SIOCOUTQ:
3090  		amount = unix_outq_len(sk);
3091  		err = put_user(amount, (int __user *)arg);
3092  		break;
3093  	case SIOCINQ:
3094  		amount = unix_inq_len(sk);
3095  		if (amount < 0)
3096  			err = amount;
3097  		else
3098  			err = put_user(amount, (int __user *)arg);
3099  		break;
3100  	case SIOCUNIXFILE:
3101  		err = unix_open_file(sk);
3102  		break;
3103  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3104  	case SIOCATMARK:
3105  		{
3106  			struct sk_buff *skb;
3107  			int answ = 0;
3108  
3109  			skb = skb_peek(&sk->sk_receive_queue);
3110  			if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3111  				answ = 1;
3112  			err = put_user(answ, (int __user *)arg);
3113  		}
3114  		break;
3115  #endif
3116  	default:
3117  		err = -ENOIOCTLCMD;
3118  		break;
3119  	}
3120  	return err;
3121  }
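
/*
 * Illustrative userspace sketch (names assumed, error handling omitted):
 * SIOCINQ/SIOCOUTQ report the unread and unsent payload handled by
 * unix_inq_len()/unix_outq_len() above, and SIOCATMARK (with
 * CONFIG_AF_UNIX_OOB) reports whether the next byte to read is the
 * out-of-band mark.
 *
 *	#include <linux/sockios.h>
 *	#include <sys/ioctl.h>
 *
 *	int queued, pending, at_mark;
 *
 *	ioctl(sock, SIOCINQ, &queued);		// bytes waiting to be read
 *	ioctl(sock, SIOCOUTQ, &pending);	// bytes not yet consumed by the peer
 *	ioctl(sock, SIOCATMARK, &at_mark);	// 1 if the head skb is the OOB skb
 */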
3122  
3123  #ifdef CONFIG_COMPAT
3124  static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3125  {
3126  	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3127  }
3128  #endif
3129  
3130  static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3131  {
3132  	struct sock *sk = sock->sk;
3133  	__poll_t mask;
3134  
3135  	sock_poll_wait(file, sock, wait);
3136  	mask = 0;
3137  
3138  	/* exceptional events? */
3139  	if (sk->sk_err)
3140  		mask |= EPOLLERR;
3141  	if (sk->sk_shutdown == SHUTDOWN_MASK)
3142  		mask |= EPOLLHUP;
3143  	if (sk->sk_shutdown & RCV_SHUTDOWN)
3144  		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3145  
3146  	/* readable? */
3147  	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3148  		mask |= EPOLLIN | EPOLLRDNORM;
3149  	if (sk_is_readable(sk))
3150  		mask |= EPOLLIN | EPOLLRDNORM;
3151  #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3152  	if (READ_ONCE(unix_sk(sk)->oob_skb))
3153  		mask |= EPOLLPRI;
3154  #endif
3155  
3156  	/* Connection-based sockets need to check for termination and startup */
3157  	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3158  	    sk->sk_state == TCP_CLOSE)
3159  		mask |= EPOLLHUP;
3160  
3161  	/*
3162  	 * We set the socket writable also when the other side has shut down
3163  	 * the connection, to prevent stuck sockets.
3164  	 */
3165  	if (unix_writable(sk))
3166  		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3167  
3168  	return mask;
3169  }
3170  
3171  static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3172  				    poll_table *wait)
3173  {
3174  	struct sock *sk = sock->sk, *other;
3175  	unsigned int writable;
3176  	__poll_t mask;
3177  
3178  	sock_poll_wait(file, sock, wait);
3179  	mask = 0;
3180  
3181  	/* exceptional events? */
3182  	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3183  		mask |= EPOLLERR |
3184  			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3185  
3186  	if (sk->sk_shutdown & RCV_SHUTDOWN)
3187  		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3188  	if (sk->sk_shutdown == SHUTDOWN_MASK)
3189  		mask |= EPOLLHUP;
3190  
3191  	/* readable? */
3192  	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3193  		mask |= EPOLLIN | EPOLLRDNORM;
3194  	if (sk_is_readable(sk))
3195  		mask |= EPOLLIN | EPOLLRDNORM;
3196  
3197  	/* Connection-based sockets need to check for termination and startup */
3198  	if (sk->sk_type == SOCK_SEQPACKET) {
3199  		if (sk->sk_state == TCP_CLOSE)
3200  			mask |= EPOLLHUP;
3201  		/* connection hasn't started yet? */
3202  		if (sk->sk_state == TCP_SYN_SENT)
3203  			return mask;
3204  	}
3205  
3206  	/* No write status requested, avoid expensive OUT tests. */
3207  	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3208  		return mask;
3209  
3210  	writable = unix_writable(sk);
3211  	if (writable) {
3212  		unix_state_lock(sk);
3213  
3214  		other = unix_peer(sk);
3215  		if (other && unix_peer(other) != sk &&
3216  		    unix_recvq_full_lockless(other) &&
3217  		    unix_dgram_peer_wake_me(sk, other))
3218  			writable = 0;
3219  
3220  		unix_state_unlock(sk);
3221  	}
3222  
3223  	if (writable)
3224  		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3225  	else
3226  		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3227  
3228  	return mask;
3229  }
3230  
3231  #ifdef CONFIG_PROC_FS
3232  
3233  #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3234  
3235  #define get_bucket(x) ((x) >> BUCKET_SPACE)
3236  #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3237  #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3238  
3239  static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3240  {
3241  	unsigned long offset = get_offset(*pos);
3242  	unsigned long bucket = get_bucket(*pos);
3243  	unsigned long count = 0;
3244  	struct sock *sk;
3245  
3246  	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3247  	     sk; sk = sk_next(sk)) {
3248  		if (++count == offset)
3249  			break;
3250  	}
3251  
3252  	return sk;
3253  }
3254  
3255  static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3256  {
3257  	unsigned long bucket = get_bucket(*pos);
3258  	struct net *net = seq_file_net(seq);
3259  	struct sock *sk;
3260  
3261  	while (bucket < UNIX_HASH_SIZE) {
3262  		spin_lock(&net->unx.table.locks[bucket]);
3263  
3264  		sk = unix_from_bucket(seq, pos);
3265  		if (sk)
3266  			return sk;
3267  
3268  		spin_unlock(&net->unx.table.locks[bucket]);
3269  
3270  		*pos = set_bucket_offset(++bucket, 1);
3271  	}
3272  
3273  	return NULL;
3274  }
3275  
3276  static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3277  				  loff_t *pos)
3278  {
3279  	unsigned long bucket = get_bucket(*pos);
3280  
3281  	sk = sk_next(sk);
3282  	if (sk)
3283  		return sk;
3284  
3286  	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3287  
3288  	*pos = set_bucket_offset(++bucket, 1);
3289  
3290  	return unix_get_first(seq, pos);
3291  }
3292  
3293  static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3294  {
3295  	if (!*pos)
3296  		return SEQ_START_TOKEN;
3297  
3298  	return unix_get_first(seq, pos);
3299  }
3300  
3301  static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3302  {
3303  	++*pos;
3304  
3305  	if (v == SEQ_START_TOKEN)
3306  		return unix_get_first(seq, pos);
3307  
3308  	return unix_get_next(seq, v, pos);
3309  }
3310  
3311  static void unix_seq_stop(struct seq_file *seq, void *v)
3312  {
3313  	struct sock *sk = v;
3314  
3315  	if (sk)
3316  		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3317  }
3318  
3319  static int unix_seq_show(struct seq_file *seq, void *v)
3320  {
3321  
3322  	if (v == SEQ_START_TOKEN)
3323  		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3324  			 "Inode Path\n");
3325  	else {
3326  		struct sock *s = v;
3327  		struct unix_sock *u = unix_sk(s);
3328  		unix_state_lock(s);
3329  
3330  		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3331  			s,
3332  			refcount_read(&s->sk_refcnt),
3333  			0,
3334  			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3335  			s->sk_type,
3336  			s->sk_socket ?
3337  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3338  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3339  			sock_i_ino(s));
3340  
3341  		if (u->addr) {	/* under a hash table lock here */
3342  			int i, len;
3343  			seq_putc(seq, ' ');
3344  
3345  			i = 0;
3346  			len = u->addr->len -
3347  				offsetof(struct sockaddr_un, sun_path);
3348  			if (u->addr->name->sun_path[0]) {
3349  				len--;
3350  			} else {
3351  				seq_putc(seq, '@');
3352  				i++;
3353  			}
3354  			for ( ; i < len; i++)
3355  				seq_putc(seq, u->addr->name->sun_path[i] ?:
3356  					 '@');
3357  		}
3358  		unix_state_unlock(s);
3359  		seq_putc(seq, '\n');
3360  	}
3361  
3362  	return 0;
3363  }
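
/*
 * For illustration, one line of /proc/net/unix produced by the format
 * string above could look like this (values invented for the example):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	0000000000000000: 00000002 00000000 00010000 0001 01 12345 /run/example.sock
 *
 * Type 0001 is SOCK_STREAM, Flags 00010000 is __SO_ACCEPTCON for a
 * listener, and St 01 is the SS_UNCONNECTED value chosen by the ternary
 * above for a listening socket.
 */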
3364  
3365  static const struct seq_operations unix_seq_ops = {
3366  	.start  = unix_seq_start,
3367  	.next   = unix_seq_next,
3368  	.stop   = unix_seq_stop,
3369  	.show   = unix_seq_show,
3370  };
3371  
3372  #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
3373  struct bpf_unix_iter_state {
3374  	struct seq_net_private p;
3375  	unsigned int cur_sk;
3376  	unsigned int end_sk;
3377  	unsigned int max_sk;
3378  	struct sock **batch;
3379  	bool st_bucket_done;
3380  };
3381  
3382  struct bpf_iter__unix {
3383  	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3384  	__bpf_md_ptr(struct unix_sock *, unix_sk);
3385  	uid_t uid __aligned(8);
3386  };
3387  
3388  static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3389  			      struct unix_sock *unix_sk, uid_t uid)
3390  {
3391  	struct bpf_iter__unix ctx;
3392  
3393  	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3394  	ctx.meta = meta;
3395  	ctx.unix_sk = unix_sk;
3396  	ctx.uid = uid;
3397  	return bpf_iter_run_prog(prog, &ctx);
3398  }
3399  
3400  static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3402  {
3403  	struct bpf_unix_iter_state *iter = seq->private;
3404  	unsigned int expected = 1;
3405  	struct sock *sk;
3406  
3407  	sock_hold(start_sk);
3408  	iter->batch[iter->end_sk++] = start_sk;
3409  
3410  	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3411  		if (iter->end_sk < iter->max_sk) {
3412  			sock_hold(sk);
3413  			iter->batch[iter->end_sk++] = sk;
3414  		}
3415  
3416  		expected++;
3417  	}
3418  
3419  	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3420  
3421  	return expected;
3422  }
3423  
3424  static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3425  {
3426  	while (iter->cur_sk < iter->end_sk)
3427  		sock_put(iter->batch[iter->cur_sk++]);
3428  }
3429  
3430  static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3431  				       unsigned int new_batch_sz)
3432  {
3433  	struct sock **new_batch;
3434  
3435  	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3436  			     GFP_USER | __GFP_NOWARN);
3437  	if (!new_batch)
3438  		return -ENOMEM;
3439  
3440  	bpf_iter_unix_put_batch(iter);
3441  	kvfree(iter->batch);
3442  	iter->batch = new_batch;
3443  	iter->max_sk = new_batch_sz;
3444  
3445  	return 0;
3446  }
3447  
3448  static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3449  					loff_t *pos)
3450  {
3451  	struct bpf_unix_iter_state *iter = seq->private;
3452  	unsigned int expected;
3453  	bool resized = false;
3454  	struct sock *sk;
3455  
3456  	if (iter->st_bucket_done)
3457  		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3458  
3459  again:
3460  	/* Get a new batch */
3461  	iter->cur_sk = 0;
3462  	iter->end_sk = 0;
3463  
3464  	sk = unix_get_first(seq, pos);
3465  	if (!sk)
3466  		return NULL; /* Done */
3467  
3468  	expected = bpf_iter_unix_hold_batch(seq, sk);
3469  
3470  	if (iter->end_sk == expected) {
3471  		iter->st_bucket_done = true;
3472  		return sk;
3473  	}
3474  
3475  	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3476  		resized = true;
3477  		goto again;
3478  	}
3479  
3480  	return sk;
3481  }
3482  
3483  static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3484  {
3485  	if (!*pos)
3486  		return SEQ_START_TOKEN;
3487  
3488  	/* bpf iter does not support lseek, so it always
3489  	 * continues from where it was stop()-ped.
3490  	 */
3491  	return bpf_iter_unix_batch(seq, pos);
3492  }
3493  
3494  static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3495  {
3496  	struct bpf_unix_iter_state *iter = seq->private;
3497  	struct sock *sk;
3498  
3499  	/* Whenever seq_next() is called, the sk at iter->cur_sk has
3500  	 * already been shown by seq_show(), so advance to the next
3501  	 * sk in the batch.
3502  	 */
3503  	if (iter->cur_sk < iter->end_sk)
3504  		sock_put(iter->batch[iter->cur_sk++]);
3505  
3506  	++*pos;
3507  
3508  	if (iter->cur_sk < iter->end_sk)
3509  		sk = iter->batch[iter->cur_sk];
3510  	else
3511  		sk = bpf_iter_unix_batch(seq, pos);
3512  
3513  	return sk;
3514  }
3515  
3516  static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3517  {
3518  	struct bpf_iter_meta meta;
3519  	struct bpf_prog *prog;
3520  	struct sock *sk = v;
3521  	uid_t uid;
3522  	bool slow;
3523  	int ret;
3524  
3525  	if (v == SEQ_START_TOKEN)
3526  		return 0;
3527  
3528  	slow = lock_sock_fast(sk);
3529  
3530  	if (unlikely(sk_unhashed(sk))) {
3531  		ret = SEQ_SKIP;
3532  		goto unlock;
3533  	}
3534  
3535  	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3536  	meta.seq = seq;
3537  	prog = bpf_iter_get_info(&meta, false);
3538  	ret = unix_prog_seq_show(prog, &meta, v, uid);
3539  unlock:
3540  	unlock_sock_fast(sk, slow);
3541  	return ret;
3542  }
3543  
3544  static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3545  {
3546  	struct bpf_unix_iter_state *iter = seq->private;
3547  	struct bpf_iter_meta meta;
3548  	struct bpf_prog *prog;
3549  
3550  	if (!v) {
3551  		meta.seq = seq;
3552  		prog = bpf_iter_get_info(&meta, true);
3553  		if (prog)
3554  			(void)unix_prog_seq_show(prog, &meta, v, 0);
3555  	}
3556  
3557  	if (iter->cur_sk < iter->end_sk)
3558  		bpf_iter_unix_put_batch(iter);
3559  }
3560  
3561  static const struct seq_operations bpf_iter_unix_seq_ops = {
3562  	.start	= bpf_iter_unix_seq_start,
3563  	.next	= bpf_iter_unix_seq_next,
3564  	.stop	= bpf_iter_unix_seq_stop,
3565  	.show	= bpf_iter_unix_seq_show,
3566  };
3567  #endif
3568  #endif
3569  
3570  static const struct net_proto_family unix_family_ops = {
3571  	.family = PF_UNIX,
3572  	.create = unix_create,
3573  	.owner	= THIS_MODULE,
3574  };
3575  
3576  
3577  static int __net_init unix_net_init(struct net *net)
3578  {
3579  	int i;
3580  
3581  	net->unx.sysctl_max_dgram_qlen = 10;
3582  	if (unix_sysctl_register(net))
3583  		goto out;
3584  
3585  #ifdef CONFIG_PROC_FS
3586  	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3587  			     sizeof(struct seq_net_private)))
3588  		goto err_sysctl;
3589  #endif
3590  
3591  	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3592  					      sizeof(spinlock_t), GFP_KERNEL);
3593  	if (!net->unx.table.locks)
3594  		goto err_proc;
3595  
3596  	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3597  						sizeof(struct hlist_head),
3598  						GFP_KERNEL);
3599  	if (!net->unx.table.buckets)
3600  		goto free_locks;
3601  
3602  	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3603  		spin_lock_init(&net->unx.table.locks[i]);
3604  		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3605  	}
3606  
3607  	return 0;
3608  
3609  free_locks:
3610  	kvfree(net->unx.table.locks);
3611  err_proc:
3612  #ifdef CONFIG_PROC_FS
3613  	remove_proc_entry("unix", net->proc_net);
3614  err_sysctl:
3615  #endif
3616  	unix_sysctl_unregister(net);
3617  out:
3618  	return -ENOMEM;
3619  }
3620  
3621  static void __net_exit unix_net_exit(struct net *net)
3622  {
3623  	kvfree(net->unx.table.buckets);
3624  	kvfree(net->unx.table.locks);
3625  	unix_sysctl_unregister(net);
3626  	remove_proc_entry("unix", net->proc_net);
3627  }
3628  
3629  static struct pernet_operations unix_net_ops = {
3630  	.init = unix_net_init,
3631  	.exit = unix_net_exit,
3632  };
3633  
3634  #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3635  DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3636  		     struct unix_sock *unix_sk, uid_t uid)
3637  
3638  #define INIT_BATCH_SZ 16
3639  
3640  static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3641  {
3642  	struct bpf_unix_iter_state *iter = priv_data;
3643  	int err;
3644  
3645  	err = bpf_iter_init_seq_net(priv_data, aux);
3646  	if (err)
3647  		return err;
3648  
3649  	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3650  	if (err) {
3651  		bpf_iter_fini_seq_net(priv_data);
3652  		return err;
3653  	}
3654  
3655  	return 0;
3656  }
3657  
3658  static void bpf_iter_fini_unix(void *priv_data)
3659  {
3660  	struct bpf_unix_iter_state *iter = priv_data;
3661  
3662  	bpf_iter_fini_seq_net(priv_data);
3663  	kvfree(iter->batch);
3664  }
3665  
3666  static const struct bpf_iter_seq_info unix_seq_info = {
3667  	.seq_ops		= &bpf_iter_unix_seq_ops,
3668  	.init_seq_private	= bpf_iter_init_unix,
3669  	.fini_seq_private	= bpf_iter_fini_unix,
3670  	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3671  };
3672  
3673  static const struct bpf_func_proto *
3674  bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3675  			     const struct bpf_prog *prog)
3676  {
3677  	switch (func_id) {
3678  	case BPF_FUNC_setsockopt:
3679  		return &bpf_sk_setsockopt_proto;
3680  	case BPF_FUNC_getsockopt:
3681  		return &bpf_sk_getsockopt_proto;
3682  	default:
3683  		return NULL;
3684  	}
3685  }
3686  
3687  static struct bpf_iter_reg unix_reg_info = {
3688  	.target			= "unix",
3689  	.ctx_arg_info_size	= 1,
3690  	.ctx_arg_info		= {
3691  		{ offsetof(struct bpf_iter__unix, unix_sk),
3692  		  PTR_TO_BTF_ID_OR_NULL },
3693  	},
3694  	.get_func_proto         = bpf_iter_unix_get_func_proto,
3695  	.seq_info		= &unix_seq_info,
3696  };
3697  
3698  static void __init bpf_iter_register(void)
3699  {
3700  	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3701  	if (bpf_iter_reg_target(&unix_reg_info))
3702  		pr_warn("Warning: could not register bpf iterator unix\n");
3703  }
3704  #endif
3705  
3706  static int __init af_unix_init(void)
3707  {
3708  	int i, rc = -1;
3709  
3710  	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3711  
3712  	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3713  		spin_lock_init(&bsd_socket_locks[i]);
3714  		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3715  	}
3716  
3717  	rc = proto_register(&unix_dgram_proto, 1);
3718  	if (rc != 0) {
3719  		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3720  		goto out;
3721  	}
3722  
3723  	rc = proto_register(&unix_stream_proto, 1);
3724  	if (rc != 0) {
3725  		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3726  		goto out;
3727  	}
3728  
3729  	sock_register(&unix_family_ops);
3730  	register_pernet_subsys(&unix_net_ops);
3731  	unix_bpf_build_proto();
3732  
3733  #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3734  	bpf_iter_register();
3735  #endif
3736  
3737  out:
3738  	return rc;
3739  }
3740  
3741  static void __exit af_unix_exit(void)
3742  {
3743  	sock_unregister(PF_UNIX);
3744  	proto_unregister(&unix_dgram_proto);
3745  	proto_unregister(&unix_stream_proto);
3746  	unregister_pernet_subsys(&unix_net_ops);
3747  }
3748  
3749  /* Earlier than device_initcall() so that other drivers invoking
3750     request_module() don't end up in a loop when modprobe tries
3751     to use a UNIX socket. But later than subsys_initcall() because
3752     we depend on stuff initialised there. */
3753  fs_initcall(af_unix_init);
3754  module_exit(af_unix_exit);
3755  
3756  MODULE_LICENSE("GPL");
3757  MODULE_ALIAS_NETPROTO(PF_UNIX);
3758