xref: /linux/net/unix/af_unix.c (revision 4eb7ae7a301d3586c3351e81d5c3cfe2304a1a6a)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					is been reached. This won't break
33  *					old apps and it will avoid huge amount
34  *					of socks hashed (this for unix_gc()
35  *					performances reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/termios.h>
93 #include <linux/sockios.h>
94 #include <linux/net.h>
95 #include <linux/in.h>
96 #include <linux/fs.h>
97 #include <linux/slab.h>
98 #include <linux/uaccess.h>
99 #include <linux/skbuff.h>
100 #include <linux/netdevice.h>
101 #include <net/net_namespace.h>
102 #include <net/sock.h>
103 #include <net/tcp_states.h>
104 #include <net/af_unix.h>
105 #include <linux/proc_fs.h>
106 #include <linux/seq_file.h>
107 #include <net/scm.h>
108 #include <linux/init.h>
109 #include <linux/poll.h>
110 #include <linux/rtnetlink.h>
111 #include <linux/mount.h>
112 #include <net/checksum.h>
113 #include <linux/security.h>
114 #include <linux/freezer.h>
115 #include <linux/file.h>
116 
117 #include "scm.h"
118 
119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
120 EXPORT_SYMBOL_GPL(unix_socket_table);
121 DEFINE_SPINLOCK(unix_table_lock);
122 EXPORT_SYMBOL_GPL(unix_table_lock);
123 static atomic_long_t unix_nr_socks;
124 
125 
126 static struct hlist_head *unix_sockets_unbound(void *addr)
127 {
128 	unsigned long hash = (unsigned long)addr;
129 
130 	hash ^= hash >> 16;
131 	hash ^= hash >> 8;
132 	hash %= UNIX_HASH_SIZE;
133 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
134 }
135 
136 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
137 
138 #ifdef CONFIG_SECURITY_NETWORK
139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
140 {
141 	UNIXCB(skb).secid = scm->secid;
142 }
143 
144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
145 {
146 	scm->secid = UNIXCB(skb).secid;
147 }
148 
149 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
150 {
151 	return (scm->secid == UNIXCB(skb).secid);
152 }
153 #else
154 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
155 { }
156 
157 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
158 { }
159 
160 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
161 {
162 	return true;
163 }
164 #endif /* CONFIG_SECURITY_NETWORK */
165 
166 /*
167  *  SMP locking strategy:
168  *    hash table is protected with spinlock unix_table_lock
169  *    each socket state is protected by separate spin lock.
170  */
171 
172 static inline unsigned int unix_hash_fold(__wsum n)
173 {
174 	unsigned int hash = (__force unsigned int)csum_fold(n);
175 
176 	hash ^= hash>>8;
177 	return hash&(UNIX_HASH_SIZE-1);
178 }
179 
180 #define unix_peer(sk) (unix_sk(sk)->peer)
181 
182 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
183 {
184 	return unix_peer(osk) == sk;
185 }
186 
187 static inline int unix_may_send(struct sock *sk, struct sock *osk)
188 {
189 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
190 }
191 
192 static inline int unix_recvq_full(struct sock const *sk)
193 {
194 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
195 }
196 
197 struct sock *unix_peer_get(struct sock *s)
198 {
199 	struct sock *peer;
200 
201 	unix_state_lock(s);
202 	peer = unix_peer(s);
203 	if (peer)
204 		sock_hold(peer);
205 	unix_state_unlock(s);
206 	return peer;
207 }
208 EXPORT_SYMBOL_GPL(unix_peer_get);
209 
210 static inline void unix_release_addr(struct unix_address *addr)
211 {
212 	if (refcount_dec_and_test(&addr->refcnt))
213 		kfree(addr);
214 }
215 
216 /*
217  *	Check unix socket name:
218  *		- should be not zero length.
219  *	        - if started by not zero, should be NULL terminated (FS object)
220  *		- if started by zero, it is abstract name.
221  */
222 
223 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
224 {
225 	*hashp = 0;
226 
227 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
228 		return -EINVAL;
229 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
230 		return -EINVAL;
231 	if (sunaddr->sun_path[0]) {
232 		/*
233 		 * This may look like an off by one error but it is a bit more
234 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
235 		 * sun_path[108] doesn't as such exist.  However in kernel space
236 		 * we are guaranteed that it is a valid memory location in our
237 		 * kernel address buffer.
238 		 */
239 		((char *)sunaddr)[len] = 0;
240 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
241 		return len;
242 	}
243 
244 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
245 	return len;
246 }
247 
248 static void __unix_remove_socket(struct sock *sk)
249 {
250 	sk_del_node_init(sk);
251 }
252 
253 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
254 {
255 	WARN_ON(!sk_unhashed(sk));
256 	sk_add_node(sk, list);
257 }
258 
259 static inline void unix_remove_socket(struct sock *sk)
260 {
261 	spin_lock(&unix_table_lock);
262 	__unix_remove_socket(sk);
263 	spin_unlock(&unix_table_lock);
264 }
265 
266 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
267 {
268 	spin_lock(&unix_table_lock);
269 	__unix_insert_socket(list, sk);
270 	spin_unlock(&unix_table_lock);
271 }
272 
273 static struct sock *__unix_find_socket_byname(struct net *net,
274 					      struct sockaddr_un *sunname,
275 					      int len, int type, unsigned int hash)
276 {
277 	struct sock *s;
278 
279 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
280 		struct unix_sock *u = unix_sk(s);
281 
282 		if (!net_eq(sock_net(s), net))
283 			continue;
284 
285 		if (u->addr->len == len &&
286 		    !memcmp(u->addr->name, sunname, len))
287 			return s;
288 	}
289 	return NULL;
290 }
291 
292 static inline struct sock *unix_find_socket_byname(struct net *net,
293 						   struct sockaddr_un *sunname,
294 						   int len, int type,
295 						   unsigned int hash)
296 {
297 	struct sock *s;
298 
299 	spin_lock(&unix_table_lock);
300 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
301 	if (s)
302 		sock_hold(s);
303 	spin_unlock(&unix_table_lock);
304 	return s;
305 }
306 
307 static struct sock *unix_find_socket_byinode(struct inode *i)
308 {
309 	struct sock *s;
310 
311 	spin_lock(&unix_table_lock);
312 	sk_for_each(s,
313 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
314 		struct dentry *dentry = unix_sk(s)->path.dentry;
315 
316 		if (dentry && d_backing_inode(dentry) == i) {
317 			sock_hold(s);
318 			goto found;
319 		}
320 	}
321 	s = NULL;
322 found:
323 	spin_unlock(&unix_table_lock);
324 	return s;
325 }
326 
327 /* Support code for asymmetrically connected dgram sockets
328  *
329  * If a datagram socket is connected to a socket not itself connected
330  * to the first socket (eg, /dev/log), clients may only enqueue more
331  * messages if the present receive queue of the server socket is not
332  * "too large". This means there's a second writeability condition
333  * poll and sendmsg need to test. The dgram recv code will do a wake
334  * up on the peer_wait wait queue of a socket upon reception of a
335  * datagram which needs to be propagated to sleeping would-be writers
336  * since these might not have sent anything so far. This can't be
337  * accomplished via poll_wait because the lifetime of the server
338  * socket might be less than that of its clients if these break their
339  * association with it or if the server socket is closed while clients
340  * are still connected to it and there's no way to inform "a polling
341  * implementation" that it should let go of a certain wait queue
342  *
343  * In order to propagate a wake up, a wait_queue_entry_t of the client
344  * socket is enqueued on the peer_wait queue of the server socket
345  * whose wake function does a wake_up on the ordinary client socket
346  * wait queue. This connection is established whenever a write (or
347  * poll for write) hit the flow control condition and broken when the
348  * association to the server socket is dissolved or after a wake up
349  * was relayed.
350  */
351 
352 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
353 				      void *key)
354 {
355 	struct unix_sock *u;
356 	wait_queue_head_t *u_sleep;
357 
358 	u = container_of(q, struct unix_sock, peer_wake);
359 
360 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
361 			    q);
362 	u->peer_wake.private = NULL;
363 
364 	/* relaying can only happen while the wq still exists */
365 	u_sleep = sk_sleep(&u->sk);
366 	if (u_sleep)
367 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
368 
369 	return 0;
370 }
371 
372 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
373 {
374 	struct unix_sock *u, *u_other;
375 	int rc;
376 
377 	u = unix_sk(sk);
378 	u_other = unix_sk(other);
379 	rc = 0;
380 	spin_lock(&u_other->peer_wait.lock);
381 
382 	if (!u->peer_wake.private) {
383 		u->peer_wake.private = other;
384 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
385 
386 		rc = 1;
387 	}
388 
389 	spin_unlock(&u_other->peer_wait.lock);
390 	return rc;
391 }
392 
393 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
394 					    struct sock *other)
395 {
396 	struct unix_sock *u, *u_other;
397 
398 	u = unix_sk(sk);
399 	u_other = unix_sk(other);
400 	spin_lock(&u_other->peer_wait.lock);
401 
402 	if (u->peer_wake.private == other) {
403 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
404 		u->peer_wake.private = NULL;
405 	}
406 
407 	spin_unlock(&u_other->peer_wait.lock);
408 }
409 
410 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
411 						   struct sock *other)
412 {
413 	unix_dgram_peer_wake_disconnect(sk, other);
414 	wake_up_interruptible_poll(sk_sleep(sk),
415 				   EPOLLOUT |
416 				   EPOLLWRNORM |
417 				   EPOLLWRBAND);
418 }
419 
420 /* preconditions:
421  *	- unix_peer(sk) == other
422  *	- association is stable
423  */
424 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
425 {
426 	int connected;
427 
428 	connected = unix_dgram_peer_wake_connect(sk, other);
429 
430 	/* If other is SOCK_DEAD, we want to make sure we signal
431 	 * POLLOUT, such that a subsequent write() can get a
432 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
433 	 * to other and its full, we will hang waiting for POLLOUT.
434 	 */
435 	if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
436 		return 1;
437 
438 	if (connected)
439 		unix_dgram_peer_wake_disconnect(sk, other);
440 
441 	return 0;
442 }
443 
444 static int unix_writable(const struct sock *sk)
445 {
446 	return sk->sk_state != TCP_LISTEN &&
447 	       (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
448 }
449 
450 static void unix_write_space(struct sock *sk)
451 {
452 	struct socket_wq *wq;
453 
454 	rcu_read_lock();
455 	if (unix_writable(sk)) {
456 		wq = rcu_dereference(sk->sk_wq);
457 		if (skwq_has_sleeper(wq))
458 			wake_up_interruptible_sync_poll(&wq->wait,
459 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
460 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
461 	}
462 	rcu_read_unlock();
463 }
464 
465 /* When dgram socket disconnects (or changes its peer), we clear its receive
466  * queue of packets arrived from previous peer. First, it allows to do
467  * flow control based only on wmem_alloc; second, sk connected to peer
468  * may receive messages only from that peer. */
469 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
470 {
471 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
472 		skb_queue_purge(&sk->sk_receive_queue);
473 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
474 
475 		/* If one link of bidirectional dgram pipe is disconnected,
476 		 * we signal error. Messages are lost. Do not make this,
477 		 * when peer was not connected to us.
478 		 */
479 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
480 			other->sk_err = ECONNRESET;
481 			other->sk_error_report(other);
482 		}
483 	}
484 }
485 
486 static void unix_sock_destructor(struct sock *sk)
487 {
488 	struct unix_sock *u = unix_sk(sk);
489 
490 	skb_queue_purge(&sk->sk_receive_queue);
491 
492 	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
493 	WARN_ON(!sk_unhashed(sk));
494 	WARN_ON(sk->sk_socket);
495 	if (!sock_flag(sk, SOCK_DEAD)) {
496 		pr_info("Attempt to release alive unix socket: %p\n", sk);
497 		return;
498 	}
499 
500 	if (u->addr)
501 		unix_release_addr(u->addr);
502 
503 	atomic_long_dec(&unix_nr_socks);
504 	local_bh_disable();
505 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
506 	local_bh_enable();
507 #ifdef UNIX_REFCNT_DEBUG
508 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
509 		atomic_long_read(&unix_nr_socks));
510 #endif
511 }
512 
513 static void unix_release_sock(struct sock *sk, int embrion)
514 {
515 	struct unix_sock *u = unix_sk(sk);
516 	struct path path;
517 	struct sock *skpair;
518 	struct sk_buff *skb;
519 	int state;
520 
521 	unix_remove_socket(sk);
522 
523 	/* Clear state */
524 	unix_state_lock(sk);
525 	sock_orphan(sk);
526 	sk->sk_shutdown = SHUTDOWN_MASK;
527 	path	     = u->path;
528 	u->path.dentry = NULL;
529 	u->path.mnt = NULL;
530 	state = sk->sk_state;
531 	sk->sk_state = TCP_CLOSE;
532 	unix_state_unlock(sk);
533 
534 	wake_up_interruptible_all(&u->peer_wait);
535 
536 	skpair = unix_peer(sk);
537 
538 	if (skpair != NULL) {
539 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
540 			unix_state_lock(skpair);
541 			/* No more writes */
542 			skpair->sk_shutdown = SHUTDOWN_MASK;
543 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
544 				skpair->sk_err = ECONNRESET;
545 			unix_state_unlock(skpair);
546 			skpair->sk_state_change(skpair);
547 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
548 		}
549 
550 		unix_dgram_peer_wake_disconnect(sk, skpair);
551 		sock_put(skpair); /* It may now die */
552 		unix_peer(sk) = NULL;
553 	}
554 
555 	/* Try to flush out this socket. Throw out buffers at least */
556 
557 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
558 		if (state == TCP_LISTEN)
559 			unix_release_sock(skb->sk, 1);
560 		/* passed fds are erased in the kfree_skb hook	      */
561 		UNIXCB(skb).consumed = skb->len;
562 		kfree_skb(skb);
563 	}
564 
565 	if (path.dentry)
566 		path_put(&path);
567 
568 	sock_put(sk);
569 
570 	/* ---- Socket is dead now and most probably destroyed ---- */
571 
572 	/*
573 	 * Fixme: BSD difference: In BSD all sockets connected to us get
574 	 *	  ECONNRESET and we die on the spot. In Linux we behave
575 	 *	  like files and pipes do and wait for the last
576 	 *	  dereference.
577 	 *
578 	 * Can't we simply set sock->err?
579 	 *
580 	 *	  What the above comment does talk about? --ANK(980817)
581 	 */
582 
583 	if (unix_tot_inflight)
584 		unix_gc();		/* Garbage collect fds */
585 }
586 
587 static void init_peercred(struct sock *sk)
588 {
589 	put_pid(sk->sk_peer_pid);
590 	if (sk->sk_peer_cred)
591 		put_cred(sk->sk_peer_cred);
592 	sk->sk_peer_pid  = get_pid(task_tgid(current));
593 	sk->sk_peer_cred = get_current_cred();
594 }
595 
596 static void copy_peercred(struct sock *sk, struct sock *peersk)
597 {
598 	put_pid(sk->sk_peer_pid);
599 	if (sk->sk_peer_cred)
600 		put_cred(sk->sk_peer_cred);
601 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
602 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
603 }
604 
605 static int unix_listen(struct socket *sock, int backlog)
606 {
607 	int err;
608 	struct sock *sk = sock->sk;
609 	struct unix_sock *u = unix_sk(sk);
610 	struct pid *old_pid = NULL;
611 
612 	err = -EOPNOTSUPP;
613 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
614 		goto out;	/* Only stream/seqpacket sockets accept */
615 	err = -EINVAL;
616 	if (!u->addr)
617 		goto out;	/* No listens on an unbound socket */
618 	unix_state_lock(sk);
619 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
620 		goto out_unlock;
621 	if (backlog > sk->sk_max_ack_backlog)
622 		wake_up_interruptible_all(&u->peer_wait);
623 	sk->sk_max_ack_backlog	= backlog;
624 	sk->sk_state		= TCP_LISTEN;
625 	/* set credentials so connect can copy them */
626 	init_peercred(sk);
627 	err = 0;
628 
629 out_unlock:
630 	unix_state_unlock(sk);
631 	put_pid(old_pid);
632 out:
633 	return err;
634 }
635 
636 static int unix_release(struct socket *);
637 static int unix_bind(struct socket *, struct sockaddr *, int);
638 static int unix_stream_connect(struct socket *, struct sockaddr *,
639 			       int addr_len, int flags);
640 static int unix_socketpair(struct socket *, struct socket *);
641 static int unix_accept(struct socket *, struct socket *, int, bool);
642 static int unix_getname(struct socket *, struct sockaddr *, int);
643 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
644 static __poll_t unix_dgram_poll(struct file *, struct socket *,
645 				    poll_table *);
646 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
647 #ifdef CONFIG_COMPAT
648 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
649 #endif
650 static int unix_shutdown(struct socket *, int);
651 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
652 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
653 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
654 				    size_t size, int flags);
655 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
656 				       struct pipe_inode_info *, size_t size,
657 				       unsigned int flags);
658 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
659 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
660 static int unix_dgram_connect(struct socket *, struct sockaddr *,
661 			      int, int);
662 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
663 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
664 				  int);
665 
666 static int unix_set_peek_off(struct sock *sk, int val)
667 {
668 	struct unix_sock *u = unix_sk(sk);
669 
670 	if (mutex_lock_interruptible(&u->iolock))
671 		return -EINTR;
672 
673 	sk->sk_peek_off = val;
674 	mutex_unlock(&u->iolock);
675 
676 	return 0;
677 }
678 
679 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
680 {
681 	struct sock *sk = sock->sk;
682 	struct unix_sock *u;
683 
684 	if (sk) {
685 		u = unix_sk(sock->sk);
686 		seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds));
687 	}
688 }
689 
690 static const struct proto_ops unix_stream_ops = {
691 	.family =	PF_UNIX,
692 	.owner =	THIS_MODULE,
693 	.release =	unix_release,
694 	.bind =		unix_bind,
695 	.connect =	unix_stream_connect,
696 	.socketpair =	unix_socketpair,
697 	.accept =	unix_accept,
698 	.getname =	unix_getname,
699 	.poll =		unix_poll,
700 	.ioctl =	unix_ioctl,
701 #ifdef CONFIG_COMPAT
702 	.compat_ioctl =	unix_compat_ioctl,
703 #endif
704 	.listen =	unix_listen,
705 	.shutdown =	unix_shutdown,
706 	.setsockopt =	sock_no_setsockopt,
707 	.getsockopt =	sock_no_getsockopt,
708 	.sendmsg =	unix_stream_sendmsg,
709 	.recvmsg =	unix_stream_recvmsg,
710 	.mmap =		sock_no_mmap,
711 	.sendpage =	unix_stream_sendpage,
712 	.splice_read =	unix_stream_splice_read,
713 	.set_peek_off =	unix_set_peek_off,
714 	.show_fdinfo =	unix_show_fdinfo,
715 };
716 
717 static const struct proto_ops unix_dgram_ops = {
718 	.family =	PF_UNIX,
719 	.owner =	THIS_MODULE,
720 	.release =	unix_release,
721 	.bind =		unix_bind,
722 	.connect =	unix_dgram_connect,
723 	.socketpair =	unix_socketpair,
724 	.accept =	sock_no_accept,
725 	.getname =	unix_getname,
726 	.poll =		unix_dgram_poll,
727 	.ioctl =	unix_ioctl,
728 #ifdef CONFIG_COMPAT
729 	.compat_ioctl =	unix_compat_ioctl,
730 #endif
731 	.listen =	sock_no_listen,
732 	.shutdown =	unix_shutdown,
733 	.setsockopt =	sock_no_setsockopt,
734 	.getsockopt =	sock_no_getsockopt,
735 	.sendmsg =	unix_dgram_sendmsg,
736 	.recvmsg =	unix_dgram_recvmsg,
737 	.mmap =		sock_no_mmap,
738 	.sendpage =	sock_no_sendpage,
739 	.set_peek_off =	unix_set_peek_off,
740 	.show_fdinfo =	unix_show_fdinfo,
741 };
742 
743 static const struct proto_ops unix_seqpacket_ops = {
744 	.family =	PF_UNIX,
745 	.owner =	THIS_MODULE,
746 	.release =	unix_release,
747 	.bind =		unix_bind,
748 	.connect =	unix_stream_connect,
749 	.socketpair =	unix_socketpair,
750 	.accept =	unix_accept,
751 	.getname =	unix_getname,
752 	.poll =		unix_dgram_poll,
753 	.ioctl =	unix_ioctl,
754 #ifdef CONFIG_COMPAT
755 	.compat_ioctl =	unix_compat_ioctl,
756 #endif
757 	.listen =	unix_listen,
758 	.shutdown =	unix_shutdown,
759 	.setsockopt =	sock_no_setsockopt,
760 	.getsockopt =	sock_no_getsockopt,
761 	.sendmsg =	unix_seqpacket_sendmsg,
762 	.recvmsg =	unix_seqpacket_recvmsg,
763 	.mmap =		sock_no_mmap,
764 	.sendpage =	sock_no_sendpage,
765 	.set_peek_off =	unix_set_peek_off,
766 	.show_fdinfo =	unix_show_fdinfo,
767 };
768 
769 static struct proto unix_proto = {
770 	.name			= "UNIX",
771 	.owner			= THIS_MODULE,
772 	.obj_size		= sizeof(struct unix_sock),
773 };
774 
775 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
776 {
777 	struct sock *sk = NULL;
778 	struct unix_sock *u;
779 
780 	atomic_long_inc(&unix_nr_socks);
781 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
782 		goto out;
783 
784 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
785 	if (!sk)
786 		goto out;
787 
788 	sock_init_data(sock, sk);
789 
790 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
791 	sk->sk_write_space	= unix_write_space;
792 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
793 	sk->sk_destruct		= unix_sock_destructor;
794 	u	  = unix_sk(sk);
795 	u->path.dentry = NULL;
796 	u->path.mnt = NULL;
797 	spin_lock_init(&u->lock);
798 	atomic_long_set(&u->inflight, 0);
799 	INIT_LIST_HEAD(&u->link);
800 	mutex_init(&u->iolock); /* single task reading lock */
801 	mutex_init(&u->bindlock); /* single task binding lock */
802 	init_waitqueue_head(&u->peer_wait);
803 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
804 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
805 	unix_insert_socket(unix_sockets_unbound(sk), sk);
806 out:
807 	if (sk == NULL)
808 		atomic_long_dec(&unix_nr_socks);
809 	else {
810 		local_bh_disable();
811 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
812 		local_bh_enable();
813 	}
814 	return sk;
815 }
816 
817 static int unix_create(struct net *net, struct socket *sock, int protocol,
818 		       int kern)
819 {
820 	if (protocol && protocol != PF_UNIX)
821 		return -EPROTONOSUPPORT;
822 
823 	sock->state = SS_UNCONNECTED;
824 
825 	switch (sock->type) {
826 	case SOCK_STREAM:
827 		sock->ops = &unix_stream_ops;
828 		break;
829 		/*
830 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
831 		 *	nothing uses it.
832 		 */
833 	case SOCK_RAW:
834 		sock->type = SOCK_DGRAM;
835 		/* fall through */
836 	case SOCK_DGRAM:
837 		sock->ops = &unix_dgram_ops;
838 		break;
839 	case SOCK_SEQPACKET:
840 		sock->ops = &unix_seqpacket_ops;
841 		break;
842 	default:
843 		return -ESOCKTNOSUPPORT;
844 	}
845 
846 	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
847 }
848 
849 static int unix_release(struct socket *sock)
850 {
851 	struct sock *sk = sock->sk;
852 
853 	if (!sk)
854 		return 0;
855 
856 	unix_release_sock(sk, 0);
857 	sock->sk = NULL;
858 
859 	return 0;
860 }
861 
862 static int unix_autobind(struct socket *sock)
863 {
864 	struct sock *sk = sock->sk;
865 	struct net *net = sock_net(sk);
866 	struct unix_sock *u = unix_sk(sk);
867 	static u32 ordernum = 1;
868 	struct unix_address *addr;
869 	int err;
870 	unsigned int retries = 0;
871 
872 	err = mutex_lock_interruptible(&u->bindlock);
873 	if (err)
874 		return err;
875 
876 	err = 0;
877 	if (u->addr)
878 		goto out;
879 
880 	err = -ENOMEM;
881 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
882 	if (!addr)
883 		goto out;
884 
885 	addr->name->sun_family = AF_UNIX;
886 	refcount_set(&addr->refcnt, 1);
887 
888 retry:
889 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
890 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
891 
892 	spin_lock(&unix_table_lock);
893 	ordernum = (ordernum+1)&0xFFFFF;
894 
895 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
896 				      addr->hash)) {
897 		spin_unlock(&unix_table_lock);
898 		/*
899 		 * __unix_find_socket_byname() may take long time if many names
900 		 * are already in use.
901 		 */
902 		cond_resched();
903 		/* Give up if all names seems to be in use. */
904 		if (retries++ == 0xFFFFF) {
905 			err = -ENOSPC;
906 			kfree(addr);
907 			goto out;
908 		}
909 		goto retry;
910 	}
911 	addr->hash ^= sk->sk_type;
912 
913 	__unix_remove_socket(sk);
914 	smp_store_release(&u->addr, addr);
915 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
916 	spin_unlock(&unix_table_lock);
917 	err = 0;
918 
919 out:	mutex_unlock(&u->bindlock);
920 	return err;
921 }
922 
923 static struct sock *unix_find_other(struct net *net,
924 				    struct sockaddr_un *sunname, int len,
925 				    int type, unsigned int hash, int *error)
926 {
927 	struct sock *u;
928 	struct path path;
929 	int err = 0;
930 
931 	if (sunname->sun_path[0]) {
932 		struct inode *inode;
933 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
934 		if (err)
935 			goto fail;
936 		inode = d_backing_inode(path.dentry);
937 		err = inode_permission(inode, MAY_WRITE);
938 		if (err)
939 			goto put_fail;
940 
941 		err = -ECONNREFUSED;
942 		if (!S_ISSOCK(inode->i_mode))
943 			goto put_fail;
944 		u = unix_find_socket_byinode(inode);
945 		if (!u)
946 			goto put_fail;
947 
948 		if (u->sk_type == type)
949 			touch_atime(&path);
950 
951 		path_put(&path);
952 
953 		err = -EPROTOTYPE;
954 		if (u->sk_type != type) {
955 			sock_put(u);
956 			goto fail;
957 		}
958 	} else {
959 		err = -ECONNREFUSED;
960 		u = unix_find_socket_byname(net, sunname, len, type, hash);
961 		if (u) {
962 			struct dentry *dentry;
963 			dentry = unix_sk(u)->path.dentry;
964 			if (dentry)
965 				touch_atime(&unix_sk(u)->path);
966 		} else
967 			goto fail;
968 	}
969 	return u;
970 
971 put_fail:
972 	path_put(&path);
973 fail:
974 	*error = err;
975 	return NULL;
976 }
977 
978 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
979 {
980 	struct dentry *dentry;
981 	struct path path;
982 	int err = 0;
983 	/*
984 	 * Get the parent directory, calculate the hash for last
985 	 * component.
986 	 */
987 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
988 	err = PTR_ERR(dentry);
989 	if (IS_ERR(dentry))
990 		return err;
991 
992 	/*
993 	 * All right, let's create it.
994 	 */
995 	err = security_path_mknod(&path, dentry, mode, 0);
996 	if (!err) {
997 		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
998 		if (!err) {
999 			res->mnt = mntget(path.mnt);
1000 			res->dentry = dget(dentry);
1001 		}
1002 	}
1003 	done_path_create(&path, dentry);
1004 	return err;
1005 }
1006 
1007 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1008 {
1009 	struct sock *sk = sock->sk;
1010 	struct net *net = sock_net(sk);
1011 	struct unix_sock *u = unix_sk(sk);
1012 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1013 	char *sun_path = sunaddr->sun_path;
1014 	int err;
1015 	unsigned int hash;
1016 	struct unix_address *addr;
1017 	struct hlist_head *list;
1018 	struct path path = { };
1019 
1020 	err = -EINVAL;
1021 	if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1022 	    sunaddr->sun_family != AF_UNIX)
1023 		goto out;
1024 
1025 	if (addr_len == sizeof(short)) {
1026 		err = unix_autobind(sock);
1027 		goto out;
1028 	}
1029 
1030 	err = unix_mkname(sunaddr, addr_len, &hash);
1031 	if (err < 0)
1032 		goto out;
1033 	addr_len = err;
1034 
1035 	if (sun_path[0]) {
1036 		umode_t mode = S_IFSOCK |
1037 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
1038 		err = unix_mknod(sun_path, mode, &path);
1039 		if (err) {
1040 			if (err == -EEXIST)
1041 				err = -EADDRINUSE;
1042 			goto out;
1043 		}
1044 	}
1045 
1046 	err = mutex_lock_interruptible(&u->bindlock);
1047 	if (err)
1048 		goto out_put;
1049 
1050 	err = -EINVAL;
1051 	if (u->addr)
1052 		goto out_up;
1053 
1054 	err = -ENOMEM;
1055 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1056 	if (!addr)
1057 		goto out_up;
1058 
1059 	memcpy(addr->name, sunaddr, addr_len);
1060 	addr->len = addr_len;
1061 	addr->hash = hash ^ sk->sk_type;
1062 	refcount_set(&addr->refcnt, 1);
1063 
1064 	if (sun_path[0]) {
1065 		addr->hash = UNIX_HASH_SIZE;
1066 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1067 		spin_lock(&unix_table_lock);
1068 		u->path = path;
1069 		list = &unix_socket_table[hash];
1070 	} else {
1071 		spin_lock(&unix_table_lock);
1072 		err = -EADDRINUSE;
1073 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
1074 					      sk->sk_type, hash)) {
1075 			unix_release_addr(addr);
1076 			goto out_unlock;
1077 		}
1078 
1079 		list = &unix_socket_table[addr->hash];
1080 	}
1081 
1082 	err = 0;
1083 	__unix_remove_socket(sk);
1084 	smp_store_release(&u->addr, addr);
1085 	__unix_insert_socket(list, sk);
1086 
1087 out_unlock:
1088 	spin_unlock(&unix_table_lock);
1089 out_up:
1090 	mutex_unlock(&u->bindlock);
1091 out_put:
1092 	if (err)
1093 		path_put(&path);
1094 out:
1095 	return err;
1096 }
1097 
1098 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1099 {
1100 	if (unlikely(sk1 == sk2) || !sk2) {
1101 		unix_state_lock(sk1);
1102 		return;
1103 	}
1104 	if (sk1 < sk2) {
1105 		unix_state_lock(sk1);
1106 		unix_state_lock_nested(sk2);
1107 	} else {
1108 		unix_state_lock(sk2);
1109 		unix_state_lock_nested(sk1);
1110 	}
1111 }
1112 
1113 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1114 {
1115 	if (unlikely(sk1 == sk2) || !sk2) {
1116 		unix_state_unlock(sk1);
1117 		return;
1118 	}
1119 	unix_state_unlock(sk1);
1120 	unix_state_unlock(sk2);
1121 }
1122 
1123 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1124 			      int alen, int flags)
1125 {
1126 	struct sock *sk = sock->sk;
1127 	struct net *net = sock_net(sk);
1128 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1129 	struct sock *other;
1130 	unsigned int hash;
1131 	int err;
1132 
1133 	err = -EINVAL;
1134 	if (alen < offsetofend(struct sockaddr, sa_family))
1135 		goto out;
1136 
1137 	if (addr->sa_family != AF_UNSPEC) {
1138 		err = unix_mkname(sunaddr, alen, &hash);
1139 		if (err < 0)
1140 			goto out;
1141 		alen = err;
1142 
1143 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1144 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1145 			goto out;
1146 
1147 restart:
1148 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1149 		if (!other)
1150 			goto out;
1151 
1152 		unix_state_double_lock(sk, other);
1153 
1154 		/* Apparently VFS overslept socket death. Retry. */
1155 		if (sock_flag(other, SOCK_DEAD)) {
1156 			unix_state_double_unlock(sk, other);
1157 			sock_put(other);
1158 			goto restart;
1159 		}
1160 
1161 		err = -EPERM;
1162 		if (!unix_may_send(sk, other))
1163 			goto out_unlock;
1164 
1165 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1166 		if (err)
1167 			goto out_unlock;
1168 
1169 	} else {
1170 		/*
1171 		 *	1003.1g breaking connected state with AF_UNSPEC
1172 		 */
1173 		other = NULL;
1174 		unix_state_double_lock(sk, other);
1175 	}
1176 
1177 	/*
1178 	 * If it was connected, reconnect.
1179 	 */
1180 	if (unix_peer(sk)) {
1181 		struct sock *old_peer = unix_peer(sk);
1182 		unix_peer(sk) = other;
1183 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1184 
1185 		unix_state_double_unlock(sk, other);
1186 
1187 		if (other != old_peer)
1188 			unix_dgram_disconnected(sk, old_peer);
1189 		sock_put(old_peer);
1190 	} else {
1191 		unix_peer(sk) = other;
1192 		unix_state_double_unlock(sk, other);
1193 	}
1194 	return 0;
1195 
1196 out_unlock:
1197 	unix_state_double_unlock(sk, other);
1198 	sock_put(other);
1199 out:
1200 	return err;
1201 }
1202 
1203 static long unix_wait_for_peer(struct sock *other, long timeo)
1204 {
1205 	struct unix_sock *u = unix_sk(other);
1206 	int sched;
1207 	DEFINE_WAIT(wait);
1208 
1209 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1210 
1211 	sched = !sock_flag(other, SOCK_DEAD) &&
1212 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1213 		unix_recvq_full(other);
1214 
1215 	unix_state_unlock(other);
1216 
1217 	if (sched)
1218 		timeo = schedule_timeout(timeo);
1219 
1220 	finish_wait(&u->peer_wait, &wait);
1221 	return timeo;
1222 }
1223 
1224 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1225 			       int addr_len, int flags)
1226 {
1227 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1228 	struct sock *sk = sock->sk;
1229 	struct net *net = sock_net(sk);
1230 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1231 	struct sock *newsk = NULL;
1232 	struct sock *other = NULL;
1233 	struct sk_buff *skb = NULL;
1234 	unsigned int hash;
1235 	int st;
1236 	int err;
1237 	long timeo;
1238 
1239 	err = unix_mkname(sunaddr, addr_len, &hash);
1240 	if (err < 0)
1241 		goto out;
1242 	addr_len = err;
1243 
1244 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1245 	    (err = unix_autobind(sock)) != 0)
1246 		goto out;
1247 
1248 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1249 
1250 	/* First of all allocate resources.
1251 	   If we will make it after state is locked,
1252 	   we will have to recheck all again in any case.
1253 	 */
1254 
1255 	err = -ENOMEM;
1256 
1257 	/* create new sock for complete connection */
1258 	newsk = unix_create1(sock_net(sk), NULL, 0);
1259 	if (newsk == NULL)
1260 		goto out;
1261 
1262 	/* Allocate skb for sending to listening sock */
1263 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1264 	if (skb == NULL)
1265 		goto out;
1266 
1267 restart:
1268 	/*  Find listening sock. */
1269 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1270 	if (!other)
1271 		goto out;
1272 
1273 	/* Latch state of peer */
1274 	unix_state_lock(other);
1275 
1276 	/* Apparently VFS overslept socket death. Retry. */
1277 	if (sock_flag(other, SOCK_DEAD)) {
1278 		unix_state_unlock(other);
1279 		sock_put(other);
1280 		goto restart;
1281 	}
1282 
1283 	err = -ECONNREFUSED;
1284 	if (other->sk_state != TCP_LISTEN)
1285 		goto out_unlock;
1286 	if (other->sk_shutdown & RCV_SHUTDOWN)
1287 		goto out_unlock;
1288 
1289 	if (unix_recvq_full(other)) {
1290 		err = -EAGAIN;
1291 		if (!timeo)
1292 			goto out_unlock;
1293 
1294 		timeo = unix_wait_for_peer(other, timeo);
1295 
1296 		err = sock_intr_errno(timeo);
1297 		if (signal_pending(current))
1298 			goto out;
1299 		sock_put(other);
1300 		goto restart;
1301 	}
1302 
1303 	/* Latch our state.
1304 
1305 	   It is tricky place. We need to grab our state lock and cannot
1306 	   drop lock on peer. It is dangerous because deadlock is
1307 	   possible. Connect to self case and simultaneous
1308 	   attempt to connect are eliminated by checking socket
1309 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1310 	   check this before attempt to grab lock.
1311 
1312 	   Well, and we have to recheck the state after socket locked.
1313 	 */
1314 	st = sk->sk_state;
1315 
1316 	switch (st) {
1317 	case TCP_CLOSE:
1318 		/* This is ok... continue with connect */
1319 		break;
1320 	case TCP_ESTABLISHED:
1321 		/* Socket is already connected */
1322 		err = -EISCONN;
1323 		goto out_unlock;
1324 	default:
1325 		err = -EINVAL;
1326 		goto out_unlock;
1327 	}
1328 
1329 	unix_state_lock_nested(sk);
1330 
1331 	if (sk->sk_state != st) {
1332 		unix_state_unlock(sk);
1333 		unix_state_unlock(other);
1334 		sock_put(other);
1335 		goto restart;
1336 	}
1337 
1338 	err = security_unix_stream_connect(sk, other, newsk);
1339 	if (err) {
1340 		unix_state_unlock(sk);
1341 		goto out_unlock;
1342 	}
1343 
1344 	/* The way is open! Fastly set all the necessary fields... */
1345 
1346 	sock_hold(sk);
1347 	unix_peer(newsk)	= sk;
1348 	newsk->sk_state		= TCP_ESTABLISHED;
1349 	newsk->sk_type		= sk->sk_type;
1350 	init_peercred(newsk);
1351 	newu = unix_sk(newsk);
1352 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1353 	otheru = unix_sk(other);
1354 
1355 	/* copy address information from listening to new sock
1356 	 *
1357 	 * The contents of *(otheru->addr) and otheru->path
1358 	 * are seen fully set up here, since we have found
1359 	 * otheru in hash under unix_table_lock.  Insertion
1360 	 * into the hash chain we'd found it in had been done
1361 	 * in an earlier critical area protected by unix_table_lock,
1362 	 * the same one where we'd set *(otheru->addr) contents,
1363 	 * as well as otheru->path and otheru->addr itself.
1364 	 *
1365 	 * Using smp_store_release() here to set newu->addr
1366 	 * is enough to make those stores, as well as stores
1367 	 * to newu->path visible to anyone who gets newu->addr
1368 	 * by smp_load_acquire().  IOW, the same warranties
1369 	 * as for unix_sock instances bound in unix_bind() or
1370 	 * in unix_autobind().
1371 	 */
1372 	if (otheru->path.dentry) {
1373 		path_get(&otheru->path);
1374 		newu->path = otheru->path;
1375 	}
1376 	refcount_inc(&otheru->addr->refcnt);
1377 	smp_store_release(&newu->addr, otheru->addr);
1378 
1379 	/* Set credentials */
1380 	copy_peercred(sk, other);
1381 
1382 	sock->state	= SS_CONNECTED;
1383 	sk->sk_state	= TCP_ESTABLISHED;
1384 	sock_hold(newsk);
1385 
1386 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1387 	unix_peer(sk)	= newsk;
1388 
1389 	unix_state_unlock(sk);
1390 
1391 	/* take ten and and send info to listening sock */
1392 	spin_lock(&other->sk_receive_queue.lock);
1393 	__skb_queue_tail(&other->sk_receive_queue, skb);
1394 	spin_unlock(&other->sk_receive_queue.lock);
1395 	unix_state_unlock(other);
1396 	other->sk_data_ready(other);
1397 	sock_put(other);
1398 	return 0;
1399 
1400 out_unlock:
1401 	if (other)
1402 		unix_state_unlock(other);
1403 
1404 out:
1405 	kfree_skb(skb);
1406 	if (newsk)
1407 		unix_release_sock(newsk, 0);
1408 	if (other)
1409 		sock_put(other);
1410 	return err;
1411 }
1412 
1413 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1414 {
1415 	struct sock *ska = socka->sk, *skb = sockb->sk;
1416 
1417 	/* Join our sockets back to back */
1418 	sock_hold(ska);
1419 	sock_hold(skb);
1420 	unix_peer(ska) = skb;
1421 	unix_peer(skb) = ska;
1422 	init_peercred(ska);
1423 	init_peercred(skb);
1424 
1425 	if (ska->sk_type != SOCK_DGRAM) {
1426 		ska->sk_state = TCP_ESTABLISHED;
1427 		skb->sk_state = TCP_ESTABLISHED;
1428 		socka->state  = SS_CONNECTED;
1429 		sockb->state  = SS_CONNECTED;
1430 	}
1431 	return 0;
1432 }
1433 
1434 static void unix_sock_inherit_flags(const struct socket *old,
1435 				    struct socket *new)
1436 {
1437 	if (test_bit(SOCK_PASSCRED, &old->flags))
1438 		set_bit(SOCK_PASSCRED, &new->flags);
1439 	if (test_bit(SOCK_PASSSEC, &old->flags))
1440 		set_bit(SOCK_PASSSEC, &new->flags);
1441 }
1442 
1443 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1444 		       bool kern)
1445 {
1446 	struct sock *sk = sock->sk;
1447 	struct sock *tsk;
1448 	struct sk_buff *skb;
1449 	int err;
1450 
1451 	err = -EOPNOTSUPP;
1452 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1453 		goto out;
1454 
1455 	err = -EINVAL;
1456 	if (sk->sk_state != TCP_LISTEN)
1457 		goto out;
1458 
1459 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1460 	 * so that no locks are necessary.
1461 	 */
1462 
1463 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1464 	if (!skb) {
1465 		/* This means receive shutdown. */
1466 		if (err == 0)
1467 			err = -EINVAL;
1468 		goto out;
1469 	}
1470 
1471 	tsk = skb->sk;
1472 	skb_free_datagram(sk, skb);
1473 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1474 
1475 	/* attach accepted sock to socket */
1476 	unix_state_lock(tsk);
1477 	newsock->state = SS_CONNECTED;
1478 	unix_sock_inherit_flags(sock, newsock);
1479 	sock_graft(tsk, newsock);
1480 	unix_state_unlock(tsk);
1481 	return 0;
1482 
1483 out:
1484 	return err;
1485 }
1486 
1487 
1488 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1489 {
1490 	struct sock *sk = sock->sk;
1491 	struct unix_address *addr;
1492 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1493 	int err = 0;
1494 
1495 	if (peer) {
1496 		sk = unix_peer_get(sk);
1497 
1498 		err = -ENOTCONN;
1499 		if (!sk)
1500 			goto out;
1501 		err = 0;
1502 	} else {
1503 		sock_hold(sk);
1504 	}
1505 
1506 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1507 	if (!addr) {
1508 		sunaddr->sun_family = AF_UNIX;
1509 		sunaddr->sun_path[0] = 0;
1510 		err = sizeof(short);
1511 	} else {
1512 		err = addr->len;
1513 		memcpy(sunaddr, addr->name, addr->len);
1514 	}
1515 	sock_put(sk);
1516 out:
1517 	return err;
1518 }
1519 
1520 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1521 {
1522 	int err = 0;
1523 
1524 	UNIXCB(skb).pid  = get_pid(scm->pid);
1525 	UNIXCB(skb).uid = scm->creds.uid;
1526 	UNIXCB(skb).gid = scm->creds.gid;
1527 	UNIXCB(skb).fp = NULL;
1528 	unix_get_secdata(scm, skb);
1529 	if (scm->fp && send_fds)
1530 		err = unix_attach_fds(scm, skb);
1531 
1532 	skb->destructor = unix_destruct_scm;
1533 	return err;
1534 }
1535 
1536 static bool unix_passcred_enabled(const struct socket *sock,
1537 				  const struct sock *other)
1538 {
1539 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1540 	       !other->sk_socket ||
1541 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1542 }
1543 
1544 /*
1545  * Some apps rely on write() giving SCM_CREDENTIALS
1546  * We include credentials if source or destination socket
1547  * asserted SOCK_PASSCRED.
1548  */
1549 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1550 			    const struct sock *other)
1551 {
1552 	if (UNIXCB(skb).pid)
1553 		return;
1554 	if (unix_passcred_enabled(sock, other)) {
1555 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1556 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1557 	}
1558 }
1559 
1560 static int maybe_init_creds(struct scm_cookie *scm,
1561 			    struct socket *socket,
1562 			    const struct sock *other)
1563 {
1564 	int err;
1565 	struct msghdr msg = { .msg_controllen = 0 };
1566 
1567 	err = scm_send(socket, &msg, scm, false);
1568 	if (err)
1569 		return err;
1570 
1571 	if (unix_passcred_enabled(socket, other)) {
1572 		scm->pid = get_pid(task_tgid(current));
1573 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1574 	}
1575 	return err;
1576 }
1577 
1578 static bool unix_skb_scm_eq(struct sk_buff *skb,
1579 			    struct scm_cookie *scm)
1580 {
1581 	const struct unix_skb_parms *u = &UNIXCB(skb);
1582 
1583 	return u->pid == scm->pid &&
1584 	       uid_eq(u->uid, scm->creds.uid) &&
1585 	       gid_eq(u->gid, scm->creds.gid) &&
1586 	       unix_secdata_eq(scm, skb);
1587 }
1588 
1589 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1590 {
1591 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1592 	struct unix_sock *u = unix_sk(sk);
1593 
1594 	lockdep_assert_held(&sk->sk_receive_queue.lock);
1595 
1596 	if (unlikely(fp && fp->count))
1597 		u->scm_stat.nr_fds += fp->count;
1598 }
1599 
1600 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1601 {
1602 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1603 	struct unix_sock *u = unix_sk(sk);
1604 
1605 	lockdep_assert_held(&sk->sk_receive_queue.lock);
1606 
1607 	if (unlikely(fp && fp->count))
1608 		u->scm_stat.nr_fds -= fp->count;
1609 }
1610 
1611 /*
1612  *	Send AF_UNIX data.
1613  */
1614 
1615 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1616 			      size_t len)
1617 {
1618 	struct sock *sk = sock->sk;
1619 	struct net *net = sock_net(sk);
1620 	struct unix_sock *u = unix_sk(sk);
1621 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1622 	struct sock *other = NULL;
1623 	int namelen = 0; /* fake GCC */
1624 	int err;
1625 	unsigned int hash;
1626 	struct sk_buff *skb;
1627 	long timeo;
1628 	struct scm_cookie scm;
1629 	int data_len = 0;
1630 	int sk_locked;
1631 
1632 	wait_for_unix_gc();
1633 	err = scm_send(sock, msg, &scm, false);
1634 	if (err < 0)
1635 		return err;
1636 
1637 	err = -EOPNOTSUPP;
1638 	if (msg->msg_flags&MSG_OOB)
1639 		goto out;
1640 
1641 	if (msg->msg_namelen) {
1642 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1643 		if (err < 0)
1644 			goto out;
1645 		namelen = err;
1646 	} else {
1647 		sunaddr = NULL;
1648 		err = -ENOTCONN;
1649 		other = unix_peer_get(sk);
1650 		if (!other)
1651 			goto out;
1652 	}
1653 
1654 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1655 	    && (err = unix_autobind(sock)) != 0)
1656 		goto out;
1657 
1658 	err = -EMSGSIZE;
1659 	if (len > sk->sk_sndbuf - 32)
1660 		goto out;
1661 
1662 	if (len > SKB_MAX_ALLOC) {
1663 		data_len = min_t(size_t,
1664 				 len - SKB_MAX_ALLOC,
1665 				 MAX_SKB_FRAGS * PAGE_SIZE);
1666 		data_len = PAGE_ALIGN(data_len);
1667 
1668 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1669 	}
1670 
1671 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1672 				   msg->msg_flags & MSG_DONTWAIT, &err,
1673 				   PAGE_ALLOC_COSTLY_ORDER);
1674 	if (skb == NULL)
1675 		goto out;
1676 
1677 	err = unix_scm_to_skb(&scm, skb, true);
1678 	if (err < 0)
1679 		goto out_free;
1680 
1681 	skb_put(skb, len - data_len);
1682 	skb->data_len = data_len;
1683 	skb->len = len;
1684 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1685 	if (err)
1686 		goto out_free;
1687 
1688 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1689 
1690 restart:
1691 	if (!other) {
1692 		err = -ECONNRESET;
1693 		if (sunaddr == NULL)
1694 			goto out_free;
1695 
1696 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1697 					hash, &err);
1698 		if (other == NULL)
1699 			goto out_free;
1700 	}
1701 
1702 	if (sk_filter(other, skb) < 0) {
1703 		/* Toss the packet but do not return any error to the sender */
1704 		err = len;
1705 		goto out_free;
1706 	}
1707 
1708 	sk_locked = 0;
1709 	unix_state_lock(other);
1710 restart_locked:
1711 	err = -EPERM;
1712 	if (!unix_may_send(sk, other))
1713 		goto out_unlock;
1714 
1715 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1716 		/*
1717 		 *	Check with 1003.1g - what should
1718 		 *	datagram error
1719 		 */
1720 		unix_state_unlock(other);
1721 		sock_put(other);
1722 
1723 		if (!sk_locked)
1724 			unix_state_lock(sk);
1725 
1726 		err = 0;
1727 		if (unix_peer(sk) == other) {
1728 			unix_peer(sk) = NULL;
1729 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1730 
1731 			unix_state_unlock(sk);
1732 
1733 			unix_dgram_disconnected(sk, other);
1734 			sock_put(other);
1735 			err = -ECONNREFUSED;
1736 		} else {
1737 			unix_state_unlock(sk);
1738 		}
1739 
1740 		other = NULL;
1741 		if (err)
1742 			goto out_free;
1743 		goto restart;
1744 	}
1745 
1746 	err = -EPIPE;
1747 	if (other->sk_shutdown & RCV_SHUTDOWN)
1748 		goto out_unlock;
1749 
1750 	if (sk->sk_type != SOCK_SEQPACKET) {
1751 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1752 		if (err)
1753 			goto out_unlock;
1754 	}
1755 
1756 	/* other == sk && unix_peer(other) != sk if
1757 	 * - unix_peer(sk) == NULL, destination address bound to sk
1758 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
1759 	 */
1760 	if (other != sk &&
1761 	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
1762 		if (timeo) {
1763 			timeo = unix_wait_for_peer(other, timeo);
1764 
1765 			err = sock_intr_errno(timeo);
1766 			if (signal_pending(current))
1767 				goto out_free;
1768 
1769 			goto restart;
1770 		}
1771 
1772 		if (!sk_locked) {
1773 			unix_state_unlock(other);
1774 			unix_state_double_lock(sk, other);
1775 		}
1776 
1777 		if (unix_peer(sk) != other ||
1778 		    unix_dgram_peer_wake_me(sk, other)) {
1779 			err = -EAGAIN;
1780 			sk_locked = 1;
1781 			goto out_unlock;
1782 		}
1783 
1784 		if (!sk_locked) {
1785 			sk_locked = 1;
1786 			goto restart_locked;
1787 		}
1788 	}
1789 
1790 	if (unlikely(sk_locked))
1791 		unix_state_unlock(sk);
1792 
1793 	if (sock_flag(other, SOCK_RCVTSTAMP))
1794 		__net_timestamp(skb);
1795 	maybe_add_creds(skb, sock, other);
1796 	spin_lock(&other->sk_receive_queue.lock);
1797 	scm_stat_add(other, skb);
1798 	__skb_queue_tail(&other->sk_receive_queue, skb);
1799 	spin_unlock(&other->sk_receive_queue.lock);
1800 	unix_state_unlock(other);
1801 	other->sk_data_ready(other);
1802 	sock_put(other);
1803 	scm_destroy(&scm);
1804 	return len;
1805 
1806 out_unlock:
1807 	if (sk_locked)
1808 		unix_state_unlock(sk);
1809 	unix_state_unlock(other);
1810 out_free:
1811 	kfree_skb(skb);
1812 out:
1813 	if (other)
1814 		sock_put(other);
1815 	scm_destroy(&scm);
1816 	return err;
1817 }
1818 
1819 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1820  * bytes, and a minimum of a full page.
1821  */
1822 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1823 
1824 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1825 			       size_t len)
1826 {
1827 	struct sock *sk = sock->sk;
1828 	struct sock *other = NULL;
1829 	int err, size;
1830 	struct sk_buff *skb;
1831 	int sent = 0;
1832 	struct scm_cookie scm;
1833 	bool fds_sent = false;
1834 	int data_len;
1835 
1836 	wait_for_unix_gc();
1837 	err = scm_send(sock, msg, &scm, false);
1838 	if (err < 0)
1839 		return err;
1840 
1841 	err = -EOPNOTSUPP;
1842 	if (msg->msg_flags&MSG_OOB)
1843 		goto out_err;
1844 
1845 	if (msg->msg_namelen) {
1846 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1847 		goto out_err;
1848 	} else {
1849 		err = -ENOTCONN;
1850 		other = unix_peer(sk);
1851 		if (!other)
1852 			goto out_err;
1853 	}
1854 
1855 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1856 		goto pipe_err;
1857 
1858 	while (sent < len) {
1859 		size = len - sent;
1860 
1861 		/* Keep two messages in the pipe so it schedules better */
1862 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1863 
1864 		/* allow fallback to order-0 allocations */
1865 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1866 
1867 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1868 
1869 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1870 
1871 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1872 					   msg->msg_flags & MSG_DONTWAIT, &err,
1873 					   get_order(UNIX_SKB_FRAGS_SZ));
1874 		if (!skb)
1875 			goto out_err;
1876 
1877 		/* Only send the fds in the first buffer */
1878 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1879 		if (err < 0) {
1880 			kfree_skb(skb);
1881 			goto out_err;
1882 		}
1883 		fds_sent = true;
1884 
1885 		skb_put(skb, size - data_len);
1886 		skb->data_len = data_len;
1887 		skb->len = size;
1888 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1889 		if (err) {
1890 			kfree_skb(skb);
1891 			goto out_err;
1892 		}
1893 
1894 		unix_state_lock(other);
1895 
1896 		if (sock_flag(other, SOCK_DEAD) ||
1897 		    (other->sk_shutdown & RCV_SHUTDOWN))
1898 			goto pipe_err_free;
1899 
1900 		maybe_add_creds(skb, sock, other);
1901 		spin_lock(&other->sk_receive_queue.lock);
1902 		scm_stat_add(other, skb);
1903 		__skb_queue_tail(&other->sk_receive_queue, skb);
1904 		spin_unlock(&other->sk_receive_queue.lock);
1905 		unix_state_unlock(other);
1906 		other->sk_data_ready(other);
1907 		sent += size;
1908 	}
1909 
1910 	scm_destroy(&scm);
1911 
1912 	return sent;
1913 
1914 pipe_err_free:
1915 	unix_state_unlock(other);
1916 	kfree_skb(skb);
1917 pipe_err:
1918 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1919 		send_sig(SIGPIPE, current, 0);
1920 	err = -EPIPE;
1921 out_err:
1922 	scm_destroy(&scm);
1923 	return sent ? : err;
1924 }
1925 
1926 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1927 				    int offset, size_t size, int flags)
1928 {
1929 	int err;
1930 	bool send_sigpipe = false;
1931 	bool init_scm = true;
1932 	struct scm_cookie scm;
1933 	struct sock *other, *sk = socket->sk;
1934 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1935 
1936 	if (flags & MSG_OOB)
1937 		return -EOPNOTSUPP;
1938 
1939 	other = unix_peer(sk);
1940 	if (!other || sk->sk_state != TCP_ESTABLISHED)
1941 		return -ENOTCONN;
1942 
1943 	if (false) {
1944 alloc_skb:
1945 		unix_state_unlock(other);
1946 		mutex_unlock(&unix_sk(other)->iolock);
1947 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1948 					      &err, 0);
1949 		if (!newskb)
1950 			goto err;
1951 	}
1952 
1953 	/* we must acquire iolock as we modify already present
1954 	 * skbs in the sk_receive_queue and mess with skb->len
1955 	 */
1956 	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
1957 	if (err) {
1958 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1959 		goto err;
1960 	}
1961 
1962 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
1963 		err = -EPIPE;
1964 		send_sigpipe = true;
1965 		goto err_unlock;
1966 	}
1967 
1968 	unix_state_lock(other);
1969 
1970 	if (sock_flag(other, SOCK_DEAD) ||
1971 	    other->sk_shutdown & RCV_SHUTDOWN) {
1972 		err = -EPIPE;
1973 		send_sigpipe = true;
1974 		goto err_state_unlock;
1975 	}
1976 
1977 	if (init_scm) {
1978 		err = maybe_init_creds(&scm, socket, other);
1979 		if (err)
1980 			goto err_state_unlock;
1981 		init_scm = false;
1982 	}
1983 
1984 	skb = skb_peek_tail(&other->sk_receive_queue);
1985 	if (tail && tail == skb) {
1986 		skb = newskb;
1987 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
1988 		if (newskb) {
1989 			skb = newskb;
1990 		} else {
1991 			tail = skb;
1992 			goto alloc_skb;
1993 		}
1994 	} else if (newskb) {
1995 		/* this is fast path, we don't necessarily need to
1996 		 * call to kfree_skb even though with newskb == NULL
1997 		 * this - does no harm
1998 		 */
1999 		consume_skb(newskb);
2000 		newskb = NULL;
2001 	}
2002 
2003 	if (skb_append_pagefrags(skb, page, offset, size)) {
2004 		tail = skb;
2005 		goto alloc_skb;
2006 	}
2007 
2008 	skb->len += size;
2009 	skb->data_len += size;
2010 	skb->truesize += size;
2011 	refcount_add(size, &sk->sk_wmem_alloc);
2012 
2013 	if (newskb) {
2014 		err = unix_scm_to_skb(&scm, skb, false);
2015 		if (err)
2016 			goto err_state_unlock;
2017 		spin_lock(&other->sk_receive_queue.lock);
2018 		__skb_queue_tail(&other->sk_receive_queue, newskb);
2019 		spin_unlock(&other->sk_receive_queue.lock);
2020 	}
2021 
2022 	unix_state_unlock(other);
2023 	mutex_unlock(&unix_sk(other)->iolock);
2024 
2025 	other->sk_data_ready(other);
2026 	scm_destroy(&scm);
2027 	return size;
2028 
2029 err_state_unlock:
2030 	unix_state_unlock(other);
2031 err_unlock:
2032 	mutex_unlock(&unix_sk(other)->iolock);
2033 err:
2034 	kfree_skb(newskb);
2035 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2036 		send_sig(SIGPIPE, current, 0);
2037 	if (!init_scm)
2038 		scm_destroy(&scm);
2039 	return err;
2040 }
2041 
2042 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2043 				  size_t len)
2044 {
2045 	int err;
2046 	struct sock *sk = sock->sk;
2047 
2048 	err = sock_error(sk);
2049 	if (err)
2050 		return err;
2051 
2052 	if (sk->sk_state != TCP_ESTABLISHED)
2053 		return -ENOTCONN;
2054 
2055 	if (msg->msg_namelen)
2056 		msg->msg_namelen = 0;
2057 
2058 	return unix_dgram_sendmsg(sock, msg, len);
2059 }
2060 
2061 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2062 				  size_t size, int flags)
2063 {
2064 	struct sock *sk = sock->sk;
2065 
2066 	if (sk->sk_state != TCP_ESTABLISHED)
2067 		return -ENOTCONN;
2068 
2069 	return unix_dgram_recvmsg(sock, msg, size, flags);
2070 }
2071 
2072 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2073 {
2074 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2075 
2076 	if (addr) {
2077 		msg->msg_namelen = addr->len;
2078 		memcpy(msg->msg_name, addr->name, addr->len);
2079 	}
2080 }
2081 
2082 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2083 			      size_t size, int flags)
2084 {
2085 	struct scm_cookie scm;
2086 	struct sock *sk = sock->sk;
2087 	struct unix_sock *u = unix_sk(sk);
2088 	struct sk_buff *skb, *last;
2089 	long timeo;
2090 	int skip;
2091 	int err;
2092 
2093 	err = -EOPNOTSUPP;
2094 	if (flags&MSG_OOB)
2095 		goto out;
2096 
2097 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2098 
2099 	do {
2100 		mutex_lock(&u->iolock);
2101 
2102 		skip = sk_peek_offset(sk, flags);
2103 		skb = __skb_try_recv_datagram(sk, flags, scm_stat_del,
2104 					      &skip, &err, &last);
2105 		if (skb)
2106 			break;
2107 
2108 		mutex_unlock(&u->iolock);
2109 
2110 		if (err != -EAGAIN)
2111 			break;
2112 	} while (timeo &&
2113 		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2114 
2115 	if (!skb) { /* implies iolock unlocked */
2116 		unix_state_lock(sk);
2117 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2118 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2119 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2120 			err = 0;
2121 		unix_state_unlock(sk);
2122 		goto out;
2123 	}
2124 
2125 	if (wq_has_sleeper(&u->peer_wait))
2126 		wake_up_interruptible_sync_poll(&u->peer_wait,
2127 						EPOLLOUT | EPOLLWRNORM |
2128 						EPOLLWRBAND);
2129 
2130 	if (msg->msg_name)
2131 		unix_copy_addr(msg, skb->sk);
2132 
2133 	if (size > skb->len - skip)
2134 		size = skb->len - skip;
2135 	else if (size < skb->len - skip)
2136 		msg->msg_flags |= MSG_TRUNC;
2137 
2138 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2139 	if (err)
2140 		goto out_free;
2141 
2142 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2143 		__sock_recv_timestamp(msg, sk, skb);
2144 
2145 	memset(&scm, 0, sizeof(scm));
2146 
2147 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2148 	unix_set_secdata(&scm, skb);
2149 
2150 	if (!(flags & MSG_PEEK)) {
2151 		if (UNIXCB(skb).fp)
2152 			unix_detach_fds(&scm, skb);
2153 
2154 		sk_peek_offset_bwd(sk, skb->len);
2155 	} else {
2156 		/* It is questionable: on PEEK we could:
2157 		   - do not return fds - good, but too simple 8)
2158 		   - return fds, and do not return them on read (old strategy,
2159 		     apparently wrong)
2160 		   - clone fds (I chose it for now, it is the most universal
2161 		     solution)
2162 
2163 		   POSIX 1003.1g does not actually define this clearly
2164 		   at all. POSIX 1003.1g doesn't define a lot of things
2165 		   clearly however!
2166 
2167 		*/
2168 
2169 		sk_peek_offset_fwd(sk, size);
2170 
2171 		if (UNIXCB(skb).fp)
2172 			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2173 	}
2174 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2175 
2176 	scm_recv(sock, msg, &scm, flags);
2177 
2178 out_free:
2179 	skb_free_datagram(sk, skb);
2180 	mutex_unlock(&u->iolock);
2181 out:
2182 	return err;
2183 }
2184 
2185 /*
2186  *	Sleep until more data has arrived. But check for races..
2187  */
2188 static long unix_stream_data_wait(struct sock *sk, long timeo,
2189 				  struct sk_buff *last, unsigned int last_len,
2190 				  bool freezable)
2191 {
2192 	struct sk_buff *tail;
2193 	DEFINE_WAIT(wait);
2194 
2195 	unix_state_lock(sk);
2196 
2197 	for (;;) {
2198 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2199 
2200 		tail = skb_peek_tail(&sk->sk_receive_queue);
2201 		if (tail != last ||
2202 		    (tail && tail->len != last_len) ||
2203 		    sk->sk_err ||
2204 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2205 		    signal_pending(current) ||
2206 		    !timeo)
2207 			break;
2208 
2209 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2210 		unix_state_unlock(sk);
2211 		if (freezable)
2212 			timeo = freezable_schedule_timeout(timeo);
2213 		else
2214 			timeo = schedule_timeout(timeo);
2215 		unix_state_lock(sk);
2216 
2217 		if (sock_flag(sk, SOCK_DEAD))
2218 			break;
2219 
2220 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2221 	}
2222 
2223 	finish_wait(sk_sleep(sk), &wait);
2224 	unix_state_unlock(sk);
2225 	return timeo;
2226 }
2227 
2228 static unsigned int unix_skb_len(const struct sk_buff *skb)
2229 {
2230 	return skb->len - UNIXCB(skb).consumed;
2231 }
2232 
2233 struct unix_stream_read_state {
2234 	int (*recv_actor)(struct sk_buff *, int, int,
2235 			  struct unix_stream_read_state *);
2236 	struct socket *socket;
2237 	struct msghdr *msg;
2238 	struct pipe_inode_info *pipe;
2239 	size_t size;
2240 	int flags;
2241 	unsigned int splice_flags;
2242 };
2243 
2244 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2245 				    bool freezable)
2246 {
2247 	struct scm_cookie scm;
2248 	struct socket *sock = state->socket;
2249 	struct sock *sk = sock->sk;
2250 	struct unix_sock *u = unix_sk(sk);
2251 	int copied = 0;
2252 	int flags = state->flags;
2253 	int noblock = flags & MSG_DONTWAIT;
2254 	bool check_creds = false;
2255 	int target;
2256 	int err = 0;
2257 	long timeo;
2258 	int skip;
2259 	size_t size = state->size;
2260 	unsigned int last_len;
2261 
2262 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2263 		err = -EINVAL;
2264 		goto out;
2265 	}
2266 
2267 	if (unlikely(flags & MSG_OOB)) {
2268 		err = -EOPNOTSUPP;
2269 		goto out;
2270 	}
2271 
2272 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2273 	timeo = sock_rcvtimeo(sk, noblock);
2274 
2275 	memset(&scm, 0, sizeof(scm));
2276 
2277 	/* Lock the socket to prevent queue disordering
2278 	 * while sleeps in memcpy_tomsg
2279 	 */
2280 	mutex_lock(&u->iolock);
2281 
2282 	skip = max(sk_peek_offset(sk, flags), 0);
2283 
2284 	do {
2285 		int chunk;
2286 		bool drop_skb;
2287 		struct sk_buff *skb, *last;
2288 
2289 redo:
2290 		unix_state_lock(sk);
2291 		if (sock_flag(sk, SOCK_DEAD)) {
2292 			err = -ECONNRESET;
2293 			goto unlock;
2294 		}
2295 		last = skb = skb_peek(&sk->sk_receive_queue);
2296 		last_len = last ? last->len : 0;
2297 again:
2298 		if (skb == NULL) {
2299 			if (copied >= target)
2300 				goto unlock;
2301 
2302 			/*
2303 			 *	POSIX 1003.1g mandates this order.
2304 			 */
2305 
2306 			err = sock_error(sk);
2307 			if (err)
2308 				goto unlock;
2309 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2310 				goto unlock;
2311 
2312 			unix_state_unlock(sk);
2313 			if (!timeo) {
2314 				err = -EAGAIN;
2315 				break;
2316 			}
2317 
2318 			mutex_unlock(&u->iolock);
2319 
2320 			timeo = unix_stream_data_wait(sk, timeo, last,
2321 						      last_len, freezable);
2322 
2323 			if (signal_pending(current)) {
2324 				err = sock_intr_errno(timeo);
2325 				scm_destroy(&scm);
2326 				goto out;
2327 			}
2328 
2329 			mutex_lock(&u->iolock);
2330 			goto redo;
2331 unlock:
2332 			unix_state_unlock(sk);
2333 			break;
2334 		}
2335 
2336 		while (skip >= unix_skb_len(skb)) {
2337 			skip -= unix_skb_len(skb);
2338 			last = skb;
2339 			last_len = skb->len;
2340 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2341 			if (!skb)
2342 				goto again;
2343 		}
2344 
2345 		unix_state_unlock(sk);
2346 
2347 		if (check_creds) {
2348 			/* Never glue messages from different writers */
2349 			if (!unix_skb_scm_eq(skb, &scm))
2350 				break;
2351 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2352 			/* Copy credentials */
2353 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2354 			unix_set_secdata(&scm, skb);
2355 			check_creds = true;
2356 		}
2357 
2358 		/* Copy address just once */
2359 		if (state->msg && state->msg->msg_name) {
2360 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2361 					 state->msg->msg_name);
2362 			unix_copy_addr(state->msg, skb->sk);
2363 			sunaddr = NULL;
2364 		}
2365 
2366 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2367 		skb_get(skb);
2368 		chunk = state->recv_actor(skb, skip, chunk, state);
2369 		drop_skb = !unix_skb_len(skb);
2370 		/* skb is only safe to use if !drop_skb */
2371 		consume_skb(skb);
2372 		if (chunk < 0) {
2373 			if (copied == 0)
2374 				copied = -EFAULT;
2375 			break;
2376 		}
2377 		copied += chunk;
2378 		size -= chunk;
2379 
2380 		if (drop_skb) {
2381 			/* the skb was touched by a concurrent reader;
2382 			 * we should not expect anything from this skb
2383 			 * anymore and assume it invalid - we can be
2384 			 * sure it was dropped from the socket queue
2385 			 *
2386 			 * let's report a short read
2387 			 */
2388 			err = 0;
2389 			break;
2390 		}
2391 
2392 		/* Mark read part of skb as used */
2393 		if (!(flags & MSG_PEEK)) {
2394 			UNIXCB(skb).consumed += chunk;
2395 
2396 			sk_peek_offset_bwd(sk, chunk);
2397 
2398 			if (UNIXCB(skb).fp) {
2399 				spin_lock(&sk->sk_receive_queue.lock);
2400 				scm_stat_del(sk, skb);
2401 				spin_unlock(&sk->sk_receive_queue.lock);
2402 				unix_detach_fds(&scm, skb);
2403 			}
2404 
2405 			if (unix_skb_len(skb))
2406 				break;
2407 
2408 			skb_unlink(skb, &sk->sk_receive_queue);
2409 			consume_skb(skb);
2410 
2411 			if (scm.fp)
2412 				break;
2413 		} else {
2414 			/* It is questionable, see note in unix_dgram_recvmsg.
2415 			 */
2416 			if (UNIXCB(skb).fp)
2417 				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2418 
2419 			sk_peek_offset_fwd(sk, chunk);
2420 
2421 			if (UNIXCB(skb).fp)
2422 				break;
2423 
2424 			skip = 0;
2425 			last = skb;
2426 			last_len = skb->len;
2427 			unix_state_lock(sk);
2428 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2429 			if (skb)
2430 				goto again;
2431 			unix_state_unlock(sk);
2432 			break;
2433 		}
2434 	} while (size);
2435 
2436 	mutex_unlock(&u->iolock);
2437 	if (state->msg)
2438 		scm_recv(sock, state->msg, &scm, flags);
2439 	else
2440 		scm_destroy(&scm);
2441 out:
2442 	return copied ? : err;
2443 }
2444 
2445 static int unix_stream_read_actor(struct sk_buff *skb,
2446 				  int skip, int chunk,
2447 				  struct unix_stream_read_state *state)
2448 {
2449 	int ret;
2450 
2451 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2452 				    state->msg, chunk);
2453 	return ret ?: chunk;
2454 }
2455 
2456 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2457 			       size_t size, int flags)
2458 {
2459 	struct unix_stream_read_state state = {
2460 		.recv_actor = unix_stream_read_actor,
2461 		.socket = sock,
2462 		.msg = msg,
2463 		.size = size,
2464 		.flags = flags
2465 	};
2466 
2467 	return unix_stream_read_generic(&state, true);
2468 }
2469 
2470 static int unix_stream_splice_actor(struct sk_buff *skb,
2471 				    int skip, int chunk,
2472 				    struct unix_stream_read_state *state)
2473 {
2474 	return skb_splice_bits(skb, state->socket->sk,
2475 			       UNIXCB(skb).consumed + skip,
2476 			       state->pipe, chunk, state->splice_flags);
2477 }
2478 
2479 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2480 				       struct pipe_inode_info *pipe,
2481 				       size_t size, unsigned int flags)
2482 {
2483 	struct unix_stream_read_state state = {
2484 		.recv_actor = unix_stream_splice_actor,
2485 		.socket = sock,
2486 		.pipe = pipe,
2487 		.size = size,
2488 		.splice_flags = flags,
2489 	};
2490 
2491 	if (unlikely(*ppos))
2492 		return -ESPIPE;
2493 
2494 	if (sock->file->f_flags & O_NONBLOCK ||
2495 	    flags & SPLICE_F_NONBLOCK)
2496 		state.flags = MSG_DONTWAIT;
2497 
2498 	return unix_stream_read_generic(&state, false);
2499 }
2500 
2501 static int unix_shutdown(struct socket *sock, int mode)
2502 {
2503 	struct sock *sk = sock->sk;
2504 	struct sock *other;
2505 
2506 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2507 		return -EINVAL;
2508 	/* This maps:
2509 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2510 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2511 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2512 	 */
2513 	++mode;
2514 
2515 	unix_state_lock(sk);
2516 	sk->sk_shutdown |= mode;
2517 	other = unix_peer(sk);
2518 	if (other)
2519 		sock_hold(other);
2520 	unix_state_unlock(sk);
2521 	sk->sk_state_change(sk);
2522 
2523 	if (other &&
2524 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2525 
2526 		int peer_mode = 0;
2527 
2528 		if (mode&RCV_SHUTDOWN)
2529 			peer_mode |= SEND_SHUTDOWN;
2530 		if (mode&SEND_SHUTDOWN)
2531 			peer_mode |= RCV_SHUTDOWN;
2532 		unix_state_lock(other);
2533 		other->sk_shutdown |= peer_mode;
2534 		unix_state_unlock(other);
2535 		other->sk_state_change(other);
2536 		if (peer_mode == SHUTDOWN_MASK)
2537 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2538 		else if (peer_mode & RCV_SHUTDOWN)
2539 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2540 	}
2541 	if (other)
2542 		sock_put(other);
2543 
2544 	return 0;
2545 }
2546 
2547 long unix_inq_len(struct sock *sk)
2548 {
2549 	struct sk_buff *skb;
2550 	long amount = 0;
2551 
2552 	if (sk->sk_state == TCP_LISTEN)
2553 		return -EINVAL;
2554 
2555 	spin_lock(&sk->sk_receive_queue.lock);
2556 	if (sk->sk_type == SOCK_STREAM ||
2557 	    sk->sk_type == SOCK_SEQPACKET) {
2558 		skb_queue_walk(&sk->sk_receive_queue, skb)
2559 			amount += unix_skb_len(skb);
2560 	} else {
2561 		skb = skb_peek(&sk->sk_receive_queue);
2562 		if (skb)
2563 			amount = skb->len;
2564 	}
2565 	spin_unlock(&sk->sk_receive_queue.lock);
2566 
2567 	return amount;
2568 }
2569 EXPORT_SYMBOL_GPL(unix_inq_len);
2570 
2571 long unix_outq_len(struct sock *sk)
2572 {
2573 	return sk_wmem_alloc_get(sk);
2574 }
2575 EXPORT_SYMBOL_GPL(unix_outq_len);
2576 
2577 static int unix_open_file(struct sock *sk)
2578 {
2579 	struct path path;
2580 	struct file *f;
2581 	int fd;
2582 
2583 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2584 		return -EPERM;
2585 
2586 	if (!smp_load_acquire(&unix_sk(sk)->addr))
2587 		return -ENOENT;
2588 
2589 	path = unix_sk(sk)->path;
2590 	if (!path.dentry)
2591 		return -ENOENT;
2592 
2593 	path_get(&path);
2594 
2595 	fd = get_unused_fd_flags(O_CLOEXEC);
2596 	if (fd < 0)
2597 		goto out;
2598 
2599 	f = dentry_open(&path, O_PATH, current_cred());
2600 	if (IS_ERR(f)) {
2601 		put_unused_fd(fd);
2602 		fd = PTR_ERR(f);
2603 		goto out;
2604 	}
2605 
2606 	fd_install(fd, f);
2607 out:
2608 	path_put(&path);
2609 
2610 	return fd;
2611 }
2612 
2613 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2614 {
2615 	struct sock *sk = sock->sk;
2616 	long amount = 0;
2617 	int err;
2618 
2619 	switch (cmd) {
2620 	case SIOCOUTQ:
2621 		amount = unix_outq_len(sk);
2622 		err = put_user(amount, (int __user *)arg);
2623 		break;
2624 	case SIOCINQ:
2625 		amount = unix_inq_len(sk);
2626 		if (amount < 0)
2627 			err = amount;
2628 		else
2629 			err = put_user(amount, (int __user *)arg);
2630 		break;
2631 	case SIOCUNIXFILE:
2632 		err = unix_open_file(sk);
2633 		break;
2634 	default:
2635 		err = -ENOIOCTLCMD;
2636 		break;
2637 	}
2638 	return err;
2639 }
2640 
2641 #ifdef CONFIG_COMPAT
2642 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2643 {
2644 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
2645 }
2646 #endif
2647 
2648 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2649 {
2650 	struct sock *sk = sock->sk;
2651 	__poll_t mask;
2652 
2653 	sock_poll_wait(file, sock, wait);
2654 	mask = 0;
2655 
2656 	/* exceptional events? */
2657 	if (sk->sk_err)
2658 		mask |= EPOLLERR;
2659 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2660 		mask |= EPOLLHUP;
2661 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2662 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2663 
2664 	/* readable? */
2665 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2666 		mask |= EPOLLIN | EPOLLRDNORM;
2667 
2668 	/* Connection-based need to check for termination and startup */
2669 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2670 	    sk->sk_state == TCP_CLOSE)
2671 		mask |= EPOLLHUP;
2672 
2673 	/*
2674 	 * we set writable also when the other side has shut down the
2675 	 * connection. This prevents stuck sockets.
2676 	 */
2677 	if (unix_writable(sk))
2678 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2679 
2680 	return mask;
2681 }
2682 
2683 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2684 				    poll_table *wait)
2685 {
2686 	struct sock *sk = sock->sk, *other;
2687 	unsigned int writable;
2688 	__poll_t mask;
2689 
2690 	sock_poll_wait(file, sock, wait);
2691 	mask = 0;
2692 
2693 	/* exceptional events? */
2694 	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
2695 		mask |= EPOLLERR |
2696 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
2697 
2698 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2699 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2700 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2701 		mask |= EPOLLHUP;
2702 
2703 	/* readable? */
2704 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2705 		mask |= EPOLLIN | EPOLLRDNORM;
2706 
2707 	/* Connection-based need to check for termination and startup */
2708 	if (sk->sk_type == SOCK_SEQPACKET) {
2709 		if (sk->sk_state == TCP_CLOSE)
2710 			mask |= EPOLLHUP;
2711 		/* connection hasn't started yet? */
2712 		if (sk->sk_state == TCP_SYN_SENT)
2713 			return mask;
2714 	}
2715 
2716 	/* No write status requested, avoid expensive OUT tests. */
2717 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
2718 		return mask;
2719 
2720 	writable = unix_writable(sk);
2721 	if (writable) {
2722 		unix_state_lock(sk);
2723 
2724 		other = unix_peer(sk);
2725 		if (other && unix_peer(other) != sk &&
2726 		    unix_recvq_full(other) &&
2727 		    unix_dgram_peer_wake_me(sk, other))
2728 			writable = 0;
2729 
2730 		unix_state_unlock(sk);
2731 	}
2732 
2733 	if (writable)
2734 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2735 	else
2736 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2737 
2738 	return mask;
2739 }
2740 
2741 #ifdef CONFIG_PROC_FS
2742 
2743 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2744 
2745 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2746 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2747 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2748 
2749 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2750 {
2751 	unsigned long offset = get_offset(*pos);
2752 	unsigned long bucket = get_bucket(*pos);
2753 	struct sock *sk;
2754 	unsigned long count = 0;
2755 
2756 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2757 		if (sock_net(sk) != seq_file_net(seq))
2758 			continue;
2759 		if (++count == offset)
2760 			break;
2761 	}
2762 
2763 	return sk;
2764 }
2765 
2766 static struct sock *unix_next_socket(struct seq_file *seq,
2767 				     struct sock *sk,
2768 				     loff_t *pos)
2769 {
2770 	unsigned long bucket;
2771 
2772 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2773 		sk = sk_next(sk);
2774 		if (!sk)
2775 			goto next_bucket;
2776 		if (sock_net(sk) == seq_file_net(seq))
2777 			return sk;
2778 	}
2779 
2780 	do {
2781 		sk = unix_from_bucket(seq, pos);
2782 		if (sk)
2783 			return sk;
2784 
2785 next_bucket:
2786 		bucket = get_bucket(*pos) + 1;
2787 		*pos = set_bucket_offset(bucket, 1);
2788 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2789 
2790 	return NULL;
2791 }
2792 
2793 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2794 	__acquires(unix_table_lock)
2795 {
2796 	spin_lock(&unix_table_lock);
2797 
2798 	if (!*pos)
2799 		return SEQ_START_TOKEN;
2800 
2801 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2802 		return NULL;
2803 
2804 	return unix_next_socket(seq, NULL, pos);
2805 }
2806 
2807 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2808 {
2809 	++*pos;
2810 	return unix_next_socket(seq, v, pos);
2811 }
2812 
2813 static void unix_seq_stop(struct seq_file *seq, void *v)
2814 	__releases(unix_table_lock)
2815 {
2816 	spin_unlock(&unix_table_lock);
2817 }
2818 
2819 static int unix_seq_show(struct seq_file *seq, void *v)
2820 {
2821 
2822 	if (v == SEQ_START_TOKEN)
2823 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2824 			 "Inode Path\n");
2825 	else {
2826 		struct sock *s = v;
2827 		struct unix_sock *u = unix_sk(s);
2828 		unix_state_lock(s);
2829 
2830 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2831 			s,
2832 			refcount_read(&s->sk_refcnt),
2833 			0,
2834 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2835 			s->sk_type,
2836 			s->sk_socket ?
2837 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2838 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2839 			sock_i_ino(s));
2840 
2841 		if (u->addr) {	// under unix_table_lock here
2842 			int i, len;
2843 			seq_putc(seq, ' ');
2844 
2845 			i = 0;
2846 			len = u->addr->len - sizeof(short);
2847 			if (!UNIX_ABSTRACT(s))
2848 				len--;
2849 			else {
2850 				seq_putc(seq, '@');
2851 				i++;
2852 			}
2853 			for ( ; i < len; i++)
2854 				seq_putc(seq, u->addr->name->sun_path[i] ?:
2855 					 '@');
2856 		}
2857 		unix_state_unlock(s);
2858 		seq_putc(seq, '\n');
2859 	}
2860 
2861 	return 0;
2862 }
2863 
2864 static const struct seq_operations unix_seq_ops = {
2865 	.start  = unix_seq_start,
2866 	.next   = unix_seq_next,
2867 	.stop   = unix_seq_stop,
2868 	.show   = unix_seq_show,
2869 };
2870 #endif
2871 
2872 static const struct net_proto_family unix_family_ops = {
2873 	.family = PF_UNIX,
2874 	.create = unix_create,
2875 	.owner	= THIS_MODULE,
2876 };
2877 
2878 
2879 static int __net_init unix_net_init(struct net *net)
2880 {
2881 	int error = -ENOMEM;
2882 
2883 	net->unx.sysctl_max_dgram_qlen = 10;
2884 	if (unix_sysctl_register(net))
2885 		goto out;
2886 
2887 #ifdef CONFIG_PROC_FS
2888 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
2889 			sizeof(struct seq_net_private))) {
2890 		unix_sysctl_unregister(net);
2891 		goto out;
2892 	}
2893 #endif
2894 	error = 0;
2895 out:
2896 	return error;
2897 }
2898 
2899 static void __net_exit unix_net_exit(struct net *net)
2900 {
2901 	unix_sysctl_unregister(net);
2902 	remove_proc_entry("unix", net->proc_net);
2903 }
2904 
2905 static struct pernet_operations unix_net_ops = {
2906 	.init = unix_net_init,
2907 	.exit = unix_net_exit,
2908 };
2909 
2910 static int __init af_unix_init(void)
2911 {
2912 	int rc = -1;
2913 
2914 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2915 
2916 	rc = proto_register(&unix_proto, 1);
2917 	if (rc != 0) {
2918 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2919 		goto out;
2920 	}
2921 
2922 	sock_register(&unix_family_ops);
2923 	register_pernet_subsys(&unix_net_ops);
2924 out:
2925 	return rc;
2926 }
2927 
2928 static void __exit af_unix_exit(void)
2929 {
2930 	sock_unregister(PF_UNIX);
2931 	proto_unregister(&unix_proto);
2932 	unregister_pernet_subsys(&unix_net_ops);
2933 }
2934 
2935 /* Earlier than device_initcall() so that other drivers invoking
2936    request_module() don't end up in a loop when modprobe tries
2937    to use a UNIX socket. But later than subsys_initcall() because
2938    we depend on stuff initialised there */
2939 fs_initcall(af_unix_init);
2940 module_exit(af_unix_exit);
2941 
2942 MODULE_LICENSE("GPL");
2943 MODULE_ALIAS_NETPROTO(PF_UNIX);
2944