xref: /linux/net/unix/af_unix.c (revision 2a10154abcb75ad0d7b6bfea6210ac743ec60897)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko Eißfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by the above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it will avoid a huge amount
38  *					of socks being hashed (for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skbs queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected() socket to the
57  *		other at the moment one end closes.
58  *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
59  *		mark and a fake inode identifier (nor the BSD first-socket-fstat-twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns a zero-length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - a BSD bug??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix apparently has a connect that forgets to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  starting with a 0 byte, so that this name space does not intersect
80  *		  with BSD names.
81  */
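/*
 * Editorial sketch (userspace, not part of this file; bind() and
 * offsetof() usage assumed from the standard sockets API): binding an
 * abstract name means sun_path starts with a 0 byte and the address
 * length covers exactly the bytes that form the name:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	memcpy(sun.sun_path + 1, "myname", 6);	(sun_path[0] stays 0)
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 6);
 */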
82 
83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
84 
85 #include <linux/module.h>
86 #include <linux/kernel.h>
87 #include <linux/signal.h>
88 #include <linux/sched.h>
89 #include <linux/errno.h>
90 #include <linux/string.h>
91 #include <linux/stat.h>
92 #include <linux/dcache.h>
93 #include <linux/namei.h>
94 #include <linux/socket.h>
95 #include <linux/un.h>
96 #include <linux/fcntl.h>
97 #include <linux/termios.h>
98 #include <linux/sockios.h>
99 #include <linux/net.h>
100 #include <linux/in.h>
101 #include <linux/fs.h>
102 #include <linux/slab.h>
103 #include <asm/uaccess.h>
104 #include <linux/skbuff.h>
105 #include <linux/netdevice.h>
106 #include <net/net_namespace.h>
107 #include <net/sock.h>
108 #include <net/tcp_states.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/rtnetlink.h>
116 #include <linux/mount.h>
117 #include <net/checksum.h>
118 #include <linux/security.h>
119 #include <linux/freezer.h>
120 
121 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
122 EXPORT_SYMBOL_GPL(unix_socket_table);
123 DEFINE_SPINLOCK(unix_table_lock);
124 EXPORT_SYMBOL_GPL(unix_table_lock);
125 static atomic_long_t unix_nr_socks;
126 
127 
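/* Editor's note: unbound sockets are kept in the upper half of
 * unix_socket_table, hashed by the sock's kernel address; bound names
 * occupy the lower UNIX_HASH_SIZE buckets (see unix_bind()).
 */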
128 static struct hlist_head *unix_sockets_unbound(void *addr)
129 {
130 	unsigned long hash = (unsigned long)addr;
131 
132 	hash ^= hash >> 16;
133 	hash ^= hash >> 8;
134 	hash %= UNIX_HASH_SIZE;
135 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
136 }
137 
138 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
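/* Editor's note: a filesystem bind stores addr->hash == UNIX_HASH_SIZE
 * (see unix_bind()), so any smaller hash identifies an abstract name.
 */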
139 
140 #ifdef CONFIG_SECURITY_NETWORK
141 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
142 {
143 	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
144 }
145 
146 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
147 {
148 	scm->secid = *UNIXSID(skb);
149 }
150 #else
151 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
152 { }
153 
154 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
155 { }
156 #endif /* CONFIG_SECURITY_NETWORK */
157 
158 /*
159  *  SMP locking strategy:
160  *    the hash table is protected by the unix_table_lock spinlock;
161  *    each socket's state is protected by its own spin lock.
162  */
163 
164 static inline unsigned int unix_hash_fold(__wsum n)
165 {
166 	unsigned int hash = (__force unsigned int)csum_fold(n);
167 
168 	hash ^= hash>>8;
169 	return hash&(UNIX_HASH_SIZE-1);
170 }
171 
172 #define unix_peer(sk) (unix_sk(sk)->peer)
173 
174 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
175 {
176 	return unix_peer(osk) == sk;
177 }
178 
179 static inline int unix_may_send(struct sock *sk, struct sock *osk)
180 {
181 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
182 }
183 
184 static inline int unix_recvq_full(struct sock const *sk)
185 {
186 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
187 }
188 
189 struct sock *unix_peer_get(struct sock *s)
190 {
191 	struct sock *peer;
192 
193 	unix_state_lock(s);
194 	peer = unix_peer(s);
195 	if (peer)
196 		sock_hold(peer);
197 	unix_state_unlock(s);
198 	return peer;
199 }
200 EXPORT_SYMBOL_GPL(unix_peer_get);
201 
202 static inline void unix_release_addr(struct unix_address *addr)
203 {
204 	if (atomic_dec_and_test(&addr->refcnt))
205 		kfree(addr);
206 }
207 
208 /*
209  *	Check unix socket name:
210  *		- should not be zero length.
211  *		- if it starts with a non-zero byte, it should be NUL terminated (FS object)
212  *		- if it starts with a zero byte, it is an abstract name.
213  */
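/* Editorial example: for sun_path = "/tmp/s" the path is NUL terminated
 * in place and the returned length is recomputed via strlen(); for an
 * abstract name such as { 0, 'x' } the whole len bytes are checksummed
 * into *hashp and len is returned unchanged.
 */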
214 
215 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
216 {
217 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
218 		return -EINVAL;
219 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
220 		return -EINVAL;
221 	if (sunaddr->sun_path[0]) {
222 		/*
223 		 * This may look like an off-by-one error but it is a bit more
224 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
225 		 * sun_path[108] doesn't as such exist.  However, in kernel space
226 		 * we are guaranteed that it is a valid memory location in our
227 		 * kernel address buffer.
228 		 */
229 		((char *)sunaddr)[len] = 0;
230 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
231 		return len;
232 	}
233 
234 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
235 	return len;
236 }
237 
238 static void __unix_remove_socket(struct sock *sk)
239 {
240 	sk_del_node_init(sk);
241 }
242 
243 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
244 {
245 	WARN_ON(!sk_unhashed(sk));
246 	sk_add_node(sk, list);
247 }
248 
249 static inline void unix_remove_socket(struct sock *sk)
250 {
251 	spin_lock(&unix_table_lock);
252 	__unix_remove_socket(sk);
253 	spin_unlock(&unix_table_lock);
254 }
255 
256 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
257 {
258 	spin_lock(&unix_table_lock);
259 	__unix_insert_socket(list, sk);
260 	spin_unlock(&unix_table_lock);
261 }
262 
263 static struct sock *__unix_find_socket_byname(struct net *net,
264 					      struct sockaddr_un *sunname,
265 					      int len, int type, unsigned int hash)
266 {
267 	struct sock *s;
268 
269 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
270 		struct unix_sock *u = unix_sk(s);
271 
272 		if (!net_eq(sock_net(s), net))
273 			continue;
274 
275 		if (u->addr->len == len &&
276 		    !memcmp(u->addr->name, sunname, len))
277 			goto found;
278 	}
279 	s = NULL;
280 found:
281 	return s;
282 }
283 
284 static inline struct sock *unix_find_socket_byname(struct net *net,
285 						   struct sockaddr_un *sunname,
286 						   int len, int type,
287 						   unsigned int hash)
288 {
289 	struct sock *s;
290 
291 	spin_lock(&unix_table_lock);
292 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
293 	if (s)
294 		sock_hold(s);
295 	spin_unlock(&unix_table_lock);
296 	return s;
297 }
298 
299 static struct sock *unix_find_socket_byinode(struct inode *i)
300 {
301 	struct sock *s;
302 
303 	spin_lock(&unix_table_lock);
304 	sk_for_each(s,
305 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
306 		struct dentry *dentry = unix_sk(s)->path.dentry;
307 
308 		if (dentry && d_backing_inode(dentry) == i) {
309 			sock_hold(s);
310 			goto found;
311 		}
312 	}
313 	s = NULL;
314 found:
315 	spin_unlock(&unix_table_lock);
316 	return s;
317 }
318 
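/* Editor's note: a unix socket counts as writable while queued write
 * memory is at most a quarter of sk_sndbuf (wmem_alloc << 2 <= sndbuf).
 */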
319 static inline int unix_writable(struct sock *sk)
320 {
321 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
322 }
323 
324 static void unix_write_space(struct sock *sk)
325 {
326 	struct socket_wq *wq;
327 
328 	rcu_read_lock();
329 	if (unix_writable(sk)) {
330 		wq = rcu_dereference(sk->sk_wq);
331 		if (wq_has_sleeper(wq))
332 			wake_up_interruptible_sync_poll(&wq->wait,
333 				POLLOUT | POLLWRNORM | POLLWRBAND);
334 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
335 	}
336 	rcu_read_unlock();
337 }
338 
339 /* When a dgram socket disconnects (or changes its peer), we clear its receive
340  * queue of packets that arrived from the previous peer. First, this allows
341  * flow control based only on wmem_alloc; second, an sk connected to a peer
342  * may receive messages only from that peer. */
343 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
344 {
345 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
346 		skb_queue_purge(&sk->sk_receive_queue);
347 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
348 
349 		/* If one link of a bidirectional dgram pipe is disconnected,
350 		 * we signal an error. Messages are lost. Do not do this
351 		 * when the peer was not connected to us.
352 		 */
353 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
354 			other->sk_err = ECONNRESET;
355 			other->sk_error_report(other);
356 		}
357 	}
358 }
359 
360 static void unix_sock_destructor(struct sock *sk)
361 {
362 	struct unix_sock *u = unix_sk(sk);
363 
364 	skb_queue_purge(&sk->sk_receive_queue);
365 
366 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
367 	WARN_ON(!sk_unhashed(sk));
368 	WARN_ON(sk->sk_socket);
369 	if (!sock_flag(sk, SOCK_DEAD)) {
370 		pr_info("Attempt to release alive unix socket: %p\n", sk);
371 		return;
372 	}
373 
374 	if (u->addr)
375 		unix_release_addr(u->addr);
376 
377 	atomic_long_dec(&unix_nr_socks);
378 	local_bh_disable();
379 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
380 	local_bh_enable();
381 #ifdef UNIX_REFCNT_DEBUG
382 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
383 		atomic_long_read(&unix_nr_socks));
384 #endif
385 }
386 
387 static void unix_release_sock(struct sock *sk, int embrion)
388 {
389 	struct unix_sock *u = unix_sk(sk);
390 	struct path path;
391 	struct sock *skpair;
392 	struct sk_buff *skb;
393 	int state;
394 
395 	unix_remove_socket(sk);
396 
397 	/* Clear state */
398 	unix_state_lock(sk);
399 	sock_orphan(sk);
400 	sk->sk_shutdown = SHUTDOWN_MASK;
401 	path	     = u->path;
402 	u->path.dentry = NULL;
403 	u->path.mnt = NULL;
404 	state = sk->sk_state;
405 	sk->sk_state = TCP_CLOSE;
406 	unix_state_unlock(sk);
407 
408 	wake_up_interruptible_all(&u->peer_wait);
409 
410 	skpair = unix_peer(sk);
411 
412 	if (skpair != NULL) {
413 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
414 			unix_state_lock(skpair);
415 			/* No more writes */
416 			skpair->sk_shutdown = SHUTDOWN_MASK;
417 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
418 				skpair->sk_err = ECONNRESET;
419 			unix_state_unlock(skpair);
420 			skpair->sk_state_change(skpair);
421 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
422 		}
423 		sock_put(skpair); /* It may now die */
424 		unix_peer(sk) = NULL;
425 	}
426 
427 	/* Try to flush out this socket. Throw out buffers at least */
428 
429 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
430 		if (state == TCP_LISTEN)
431 			unix_release_sock(skb->sk, 1);
432 		/* passed fds are erased in the kfree_skb hook	      */
433 		kfree_skb(skb);
434 	}
435 
436 	if (path.dentry)
437 		path_put(&path);
438 
439 	sock_put(sk);
440 
441 	/* ---- Socket is dead now and most probably destroyed ---- */
442 
443 	/*
444 	 * Fixme: BSD difference: In BSD all sockets connected to us get
445 	 *	  ECONNRESET and we die on the spot. In Linux we behave
446 	 *	  like files and pipes do and wait for the last
447 	 *	  dereference.
448 	 *
449 	 * Can't we simply set sock->err?
450 	 *
451 	 *	  What does the above comment talk about? --ANK(980817)
452 	 */
453 
454 	if (unix_tot_inflight)
455 		unix_gc();		/* Garbage collect fds */
456 }
457 
458 static void init_peercred(struct sock *sk)
459 {
460 	put_pid(sk->sk_peer_pid);
461 	if (sk->sk_peer_cred)
462 		put_cred(sk->sk_peer_cred);
463 	sk->sk_peer_pid  = get_pid(task_tgid(current));
464 	sk->sk_peer_cred = get_current_cred();
465 }
466 
467 static void copy_peercred(struct sock *sk, struct sock *peersk)
468 {
469 	put_pid(sk->sk_peer_pid);
470 	if (sk->sk_peer_cred)
471 		put_cred(sk->sk_peer_cred);
472 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
473 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
474 }
475 
476 static int unix_listen(struct socket *sock, int backlog)
477 {
478 	int err;
479 	struct sock *sk = sock->sk;
480 	struct unix_sock *u = unix_sk(sk);
481 	struct pid *old_pid = NULL;
482 
483 	err = -EOPNOTSUPP;
484 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
485 		goto out;	/* Only stream/seqpacket sockets accept */
486 	err = -EINVAL;
487 	if (!u->addr)
488 		goto out;	/* No listens on an unbound socket */
489 	unix_state_lock(sk);
490 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
491 		goto out_unlock;
492 	if (backlog > sk->sk_max_ack_backlog)
493 		wake_up_interruptible_all(&u->peer_wait);
494 	sk->sk_max_ack_backlog	= backlog;
495 	sk->sk_state		= TCP_LISTEN;
496 	/* set credentials so connect can copy them */
497 	init_peercred(sk);
498 	err = 0;
499 
500 out_unlock:
501 	unix_state_unlock(sk);
502 	put_pid(old_pid);
503 out:
504 	return err;
505 }
506 
507 static int unix_release(struct socket *);
508 static int unix_bind(struct socket *, struct sockaddr *, int);
509 static int unix_stream_connect(struct socket *, struct sockaddr *,
510 			       int addr_len, int flags);
511 static int unix_socketpair(struct socket *, struct socket *);
512 static int unix_accept(struct socket *, struct socket *, int);
513 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
514 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
515 static unsigned int unix_dgram_poll(struct file *, struct socket *,
516 				    poll_table *);
517 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
518 static int unix_shutdown(struct socket *, int);
519 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
520 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
521 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
522 				    size_t size, int flags);
523 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
524 				       struct pipe_inode_info *, size_t size,
525 				       unsigned int flags);
526 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
527 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
528 static int unix_dgram_connect(struct socket *, struct sockaddr *,
529 			      int, int);
530 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
531 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
532 				  int);
533 
534 static int unix_set_peek_off(struct sock *sk, int val)
535 {
536 	struct unix_sock *u = unix_sk(sk);
537 
538 	if (mutex_lock_interruptible(&u->readlock))
539 		return -EINTR;
540 
541 	sk->sk_peek_off = val;
542 	mutex_unlock(&u->readlock);
543 
544 	return 0;
545 }
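/*
 * Editorial sketch (userspace, standard sockets API): this hook is
 * reached through the generic SO_PEEK_OFF option, e.g.
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *
 * after which MSG_PEEK reads advance the offset instead of rereading
 * the same bytes.
 */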
546 
547 
548 static const struct proto_ops unix_stream_ops = {
549 	.family =	PF_UNIX,
550 	.owner =	THIS_MODULE,
551 	.release =	unix_release,
552 	.bind =		unix_bind,
553 	.connect =	unix_stream_connect,
554 	.socketpair =	unix_socketpair,
555 	.accept =	unix_accept,
556 	.getname =	unix_getname,
557 	.poll =		unix_poll,
558 	.ioctl =	unix_ioctl,
559 	.listen =	unix_listen,
560 	.shutdown =	unix_shutdown,
561 	.setsockopt =	sock_no_setsockopt,
562 	.getsockopt =	sock_no_getsockopt,
563 	.sendmsg =	unix_stream_sendmsg,
564 	.recvmsg =	unix_stream_recvmsg,
565 	.mmap =		sock_no_mmap,
566 	.sendpage =	unix_stream_sendpage,
567 	.splice_read =	unix_stream_splice_read,
568 	.set_peek_off =	unix_set_peek_off,
569 };
570 
571 static const struct proto_ops unix_dgram_ops = {
572 	.family =	PF_UNIX,
573 	.owner =	THIS_MODULE,
574 	.release =	unix_release,
575 	.bind =		unix_bind,
576 	.connect =	unix_dgram_connect,
577 	.socketpair =	unix_socketpair,
578 	.accept =	sock_no_accept,
579 	.getname =	unix_getname,
580 	.poll =		unix_dgram_poll,
581 	.ioctl =	unix_ioctl,
582 	.listen =	sock_no_listen,
583 	.shutdown =	unix_shutdown,
584 	.setsockopt =	sock_no_setsockopt,
585 	.getsockopt =	sock_no_getsockopt,
586 	.sendmsg =	unix_dgram_sendmsg,
587 	.recvmsg =	unix_dgram_recvmsg,
588 	.mmap =		sock_no_mmap,
589 	.sendpage =	sock_no_sendpage,
590 	.set_peek_off =	unix_set_peek_off,
591 };
592 
593 static const struct proto_ops unix_seqpacket_ops = {
594 	.family =	PF_UNIX,
595 	.owner =	THIS_MODULE,
596 	.release =	unix_release,
597 	.bind =		unix_bind,
598 	.connect =	unix_stream_connect,
599 	.socketpair =	unix_socketpair,
600 	.accept =	unix_accept,
601 	.getname =	unix_getname,
602 	.poll =		unix_dgram_poll,
603 	.ioctl =	unix_ioctl,
604 	.listen =	unix_listen,
605 	.shutdown =	unix_shutdown,
606 	.setsockopt =	sock_no_setsockopt,
607 	.getsockopt =	sock_no_getsockopt,
608 	.sendmsg =	unix_seqpacket_sendmsg,
609 	.recvmsg =	unix_seqpacket_recvmsg,
610 	.mmap =		sock_no_mmap,
611 	.sendpage =	sock_no_sendpage,
612 	.set_peek_off =	unix_set_peek_off,
613 };
614 
615 static struct proto unix_proto = {
616 	.name			= "UNIX",
617 	.owner			= THIS_MODULE,
618 	.obj_size		= sizeof(struct unix_sock),
619 };
620 
621 /*
622  * AF_UNIX sockets do not interact with hardware, hence they
623  * don't trigger interrupts - so it's safe for them to have
624  * bh-unsafe locking for their sk_receive_queue.lock. Split off
625  * this special lock-class by reinitializing the spinlock key:
626  */
627 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
628 
629 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
630 {
631 	struct sock *sk = NULL;
632 	struct unix_sock *u;
633 
634 	atomic_long_inc(&unix_nr_socks);
635 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
636 		goto out;
637 
638 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
639 	if (!sk)
640 		goto out;
641 
642 	sock_init_data(sock, sk);
643 	lockdep_set_class(&sk->sk_receive_queue.lock,
644 				&af_unix_sk_receive_queue_lock_key);
645 
646 	sk->sk_write_space	= unix_write_space;
647 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
648 	sk->sk_destruct		= unix_sock_destructor;
649 	u	  = unix_sk(sk);
650 	u->path.dentry = NULL;
651 	u->path.mnt = NULL;
652 	spin_lock_init(&u->lock);
653 	atomic_long_set(&u->inflight, 0);
654 	INIT_LIST_HEAD(&u->link);
655 	mutex_init(&u->readlock); /* single task reading lock */
656 	init_waitqueue_head(&u->peer_wait);
657 	unix_insert_socket(unix_sockets_unbound(sk), sk);
658 out:
659 	if (sk == NULL)
660 		atomic_long_dec(&unix_nr_socks);
661 	else {
662 		local_bh_disable();
663 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
664 		local_bh_enable();
665 	}
666 	return sk;
667 }
668 
669 static int unix_create(struct net *net, struct socket *sock, int protocol,
670 		       int kern)
671 {
672 	if (protocol && protocol != PF_UNIX)
673 		return -EPROTONOSUPPORT;
674 
675 	sock->state = SS_UNCONNECTED;
676 
677 	switch (sock->type) {
678 	case SOCK_STREAM:
679 		sock->ops = &unix_stream_ops;
680 		break;
681 		/*
682 		 *	Believe it or not, BSD has AF_UNIX SOCK_RAW, though
683 		 *	nothing uses it.
684 		 */
685 	case SOCK_RAW:
686 		sock->type = SOCK_DGRAM;
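		/* fall through */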
687 	case SOCK_DGRAM:
688 		sock->ops = &unix_dgram_ops;
689 		break;
690 	case SOCK_SEQPACKET:
691 		sock->ops = &unix_seqpacket_ops;
692 		break;
693 	default:
694 		return -ESOCKTNOSUPPORT;
695 	}
696 
697 	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
698 }
699 
700 static int unix_release(struct socket *sock)
701 {
702 	struct sock *sk = sock->sk;
703 
704 	if (!sk)
705 		return 0;
706 
707 	unix_release_sock(sk, 0);
708 	sock->sk = NULL;
709 
710 	return 0;
711 }
712 
713 static int unix_autobind(struct socket *sock)
714 {
715 	struct sock *sk = sock->sk;
716 	struct net *net = sock_net(sk);
717 	struct unix_sock *u = unix_sk(sk);
718 	static u32 ordernum = 1;
719 	struct unix_address *addr;
720 	int err;
721 	unsigned int retries = 0;
722 
723 	err = mutex_lock_interruptible(&u->readlock);
724 	if (err)
725 		return err;
726 
727 	err = 0;
728 	if (u->addr)
729 		goto out;
730 
731 	err = -ENOMEM;
732 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
733 	if (!addr)
734 		goto out;
735 
736 	addr->name->sun_family = AF_UNIX;
737 	atomic_set(&addr->refcnt, 1);
738 
739 retry:
740 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
741 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
742 
743 	spin_lock(&unix_table_lock);
744 	ordernum = (ordernum+1)&0xFFFFF;
745 
746 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
747 				      addr->hash)) {
748 		spin_unlock(&unix_table_lock);
749 		/*
750 		 * __unix_find_socket_byname() may take a long time if many names
751 		 * are already in use.
752 		 */
753 		cond_resched();
754 		/* Give up if all names seem to be in use. */
755 		if (retries++ == 0xFFFFF) {
756 			err = -ENOSPC;
757 			kfree(addr);
758 			goto out;
759 		}
760 		goto retry;
761 	}
762 	addr->hash ^= sk->sk_type;
763 
764 	__unix_remove_socket(sk);
765 	u->addr = addr;
766 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
767 	spin_unlock(&unix_table_lock);
768 	err = 0;
769 
770 out:	mutex_unlock(&u->readlock);
771 	return err;
772 }
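/* Editor's note: autobound names are abstract - a 0 byte followed by
 * five hex digits ("%05x" of ordernum), e.g. ordernum 1 gives "00001".
 */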
773 
774 static struct sock *unix_find_other(struct net *net,
775 				    struct sockaddr_un *sunname, int len,
776 				    int type, unsigned int hash, int *error)
777 {
778 	struct sock *u;
779 	struct path path;
780 	int err = 0;
781 
782 	if (sunname->sun_path[0]) {
783 		struct inode *inode;
784 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
785 		if (err)
786 			goto fail;
787 		inode = d_backing_inode(path.dentry);
788 		err = inode_permission(inode, MAY_WRITE);
789 		if (err)
790 			goto put_fail;
791 
792 		err = -ECONNREFUSED;
793 		if (!S_ISSOCK(inode->i_mode))
794 			goto put_fail;
795 		u = unix_find_socket_byinode(inode);
796 		if (!u)
797 			goto put_fail;
798 
799 		if (u->sk_type == type)
800 			touch_atime(&path);
801 
802 		path_put(&path);
803 
804 		err = -EPROTOTYPE;
805 		if (u->sk_type != type) {
806 			sock_put(u);
807 			goto fail;
808 		}
809 	} else {
810 		err = -ECONNREFUSED;
811 		u = unix_find_socket_byname(net, sunname, len, type, hash);
812 		if (u) {
813 			struct dentry *dentry;
814 			dentry = unix_sk(u)->path.dentry;
815 			if (dentry)
816 				touch_atime(&unix_sk(u)->path);
817 		} else
818 			goto fail;
819 	}
820 	return u;
821 
822 put_fail:
823 	path_put(&path);
824 fail:
825 	*error = err;
826 	return NULL;
827 }
828 
829 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
830 {
831 	struct dentry *dentry;
832 	struct path path;
833 	int err = 0;
834 	/*
835 	 * Get the parent directory and calculate the hash for the last
836 	 * component.
837 	 */
838 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
839 	err = PTR_ERR(dentry);
840 	if (IS_ERR(dentry))
841 		return err;
842 
843 	/*
844 	 * All right, let's create it.
845 	 */
846 	err = security_path_mknod(&path, dentry, mode, 0);
847 	if (!err) {
848 		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
849 		if (!err) {
850 			res->mnt = mntget(path.mnt);
851 			res->dentry = dget(dentry);
852 		}
853 	}
854 	done_path_create(&path, dentry);
855 	return err;
856 }
857 
858 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
859 {
860 	struct sock *sk = sock->sk;
861 	struct net *net = sock_net(sk);
862 	struct unix_sock *u = unix_sk(sk);
863 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
864 	char *sun_path = sunaddr->sun_path;
865 	int err;
866 	unsigned int hash;
867 	struct unix_address *addr;
868 	struct hlist_head *list;
869 
870 	err = -EINVAL;
871 	if (sunaddr->sun_family != AF_UNIX)
872 		goto out;
873 
874 	if (addr_len == sizeof(short)) {
875 		err = unix_autobind(sock);
876 		goto out;
877 	}
878 
879 	err = unix_mkname(sunaddr, addr_len, &hash);
880 	if (err < 0)
881 		goto out;
882 	addr_len = err;
883 
884 	err = mutex_lock_interruptible(&u->readlock);
885 	if (err)
886 		goto out;
887 
888 	err = -EINVAL;
889 	if (u->addr)
890 		goto out_up;
891 
892 	err = -ENOMEM;
893 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
894 	if (!addr)
895 		goto out_up;
896 
897 	memcpy(addr->name, sunaddr, addr_len);
898 	addr->len = addr_len;
899 	addr->hash = hash ^ sk->sk_type;
900 	atomic_set(&addr->refcnt, 1);
901 
902 	if (sun_path[0]) {
903 		struct path path;
904 		umode_t mode = S_IFSOCK |
905 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
906 		err = unix_mknod(sun_path, mode, &path);
907 		if (err) {
908 			if (err == -EEXIST)
909 				err = -EADDRINUSE;
910 			unix_release_addr(addr);
911 			goto out_up;
912 		}
913 		addr->hash = UNIX_HASH_SIZE;
914 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
915 		spin_lock(&unix_table_lock);
916 		u->path = path;
917 		list = &unix_socket_table[hash];
918 	} else {
919 		spin_lock(&unix_table_lock);
920 		err = -EADDRINUSE;
921 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
922 					      sk->sk_type, hash)) {
923 			unix_release_addr(addr);
924 			goto out_unlock;
925 		}
926 
927 		list = &unix_socket_table[addr->hash];
928 	}
929 
930 	err = 0;
931 	__unix_remove_socket(sk);
932 	u->addr = addr;
933 	__unix_insert_socket(list, sk);
934 
935 out_unlock:
936 	spin_unlock(&unix_table_lock);
937 out_up:
938 	mutex_unlock(&u->readlock);
939 out:
940 	return err;
941 }
942 
943 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
944 {
945 	if (unlikely(sk1 == sk2) || !sk2) {
946 		unix_state_lock(sk1);
947 		return;
948 	}
949 	if (sk1 < sk2) {
950 		unix_state_lock(sk1);
951 		unix_state_lock_nested(sk2);
952 	} else {
953 		unix_state_lock(sk2);
954 		unix_state_lock_nested(sk1);
955 	}
956 }
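/* Editor's note: locking the lower-addressed socket first imposes a
 * global lock order, so two tasks double-locking the same pair cannot
 * deadlock against each other.
 */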
957 
958 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
959 {
960 	if (unlikely(sk1 == sk2) || !sk2) {
961 		unix_state_unlock(sk1);
962 		return;
963 	}
964 	unix_state_unlock(sk1);
965 	unix_state_unlock(sk2);
966 }
967 
968 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
969 			      int alen, int flags)
970 {
971 	struct sock *sk = sock->sk;
972 	struct net *net = sock_net(sk);
973 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
974 	struct sock *other;
975 	unsigned int hash;
976 	int err;
977 
978 	if (addr->sa_family != AF_UNSPEC) {
979 		err = unix_mkname(sunaddr, alen, &hash);
980 		if (err < 0)
981 			goto out;
982 		alen = err;
983 
984 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
985 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
986 			goto out;
987 
988 restart:
989 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
990 		if (!other)
991 			goto out;
992 
993 		unix_state_double_lock(sk, other);
994 
995 		/* Apparently VFS overslept socket death. Retry. */
996 		if (sock_flag(other, SOCK_DEAD)) {
997 			unix_state_double_unlock(sk, other);
998 			sock_put(other);
999 			goto restart;
1000 		}
1001 
1002 		err = -EPERM;
1003 		if (!unix_may_send(sk, other))
1004 			goto out_unlock;
1005 
1006 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1007 		if (err)
1008 			goto out_unlock;
1009 
1010 	} else {
1011 		/*
1012 		 *	1003.1g breaking connected state with AF_UNSPEC
1013 		 */
1014 		other = NULL;
1015 		unix_state_double_lock(sk, other);
1016 	}
1017 
1018 	/*
1019 	 * If it was connected, reconnect.
1020 	 */
1021 	if (unix_peer(sk)) {
1022 		struct sock *old_peer = unix_peer(sk);
1023 		unix_peer(sk) = other;
1024 		unix_state_double_unlock(sk, other);
1025 
1026 		if (other != old_peer)
1027 			unix_dgram_disconnected(sk, old_peer);
1028 		sock_put(old_peer);
1029 	} else {
1030 		unix_peer(sk) = other;
1031 		unix_state_double_unlock(sk, other);
1032 	}
1033 	return 0;
1034 
1035 out_unlock:
1036 	unix_state_double_unlock(sk, other);
1037 	sock_put(other);
1038 out:
1039 	return err;
1040 }
1041 
1042 static long unix_wait_for_peer(struct sock *other, long timeo)
1043 {
1044 	struct unix_sock *u = unix_sk(other);
1045 	int sched;
1046 	DEFINE_WAIT(wait);
1047 
1048 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1049 
1050 	sched = !sock_flag(other, SOCK_DEAD) &&
1051 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1052 		unix_recvq_full(other);
1053 
1054 	unix_state_unlock(other);
1055 
1056 	if (sched)
1057 		timeo = schedule_timeout(timeo);
1058 
1059 	finish_wait(&u->peer_wait, &wait);
1060 	return timeo;
1061 }
1062 
1063 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1064 			       int addr_len, int flags)
1065 {
1066 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1067 	struct sock *sk = sock->sk;
1068 	struct net *net = sock_net(sk);
1069 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1070 	struct sock *newsk = NULL;
1071 	struct sock *other = NULL;
1072 	struct sk_buff *skb = NULL;
1073 	unsigned int hash;
1074 	int st;
1075 	int err;
1076 	long timeo;
1077 
1078 	err = unix_mkname(sunaddr, addr_len, &hash);
1079 	if (err < 0)
1080 		goto out;
1081 	addr_len = err;
1082 
1083 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1084 	    (err = unix_autobind(sock)) != 0)
1085 		goto out;
1086 
1087 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1088 
1089 	/* First of all, allocate resources.
1090 	   If we do it after the state is locked,
1091 	   we will have to recheck everything again in any case.
1092 	 */
1093 
1094 	err = -ENOMEM;
1095 
1096 	/* create new sock for complete connection */
1097 	newsk = unix_create1(sock_net(sk), NULL, 0);
1098 	if (newsk == NULL)
1099 		goto out;
1100 
1101 	/* Allocate skb for sending to listening sock */
1102 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1103 	if (skb == NULL)
1104 		goto out;
1105 
1106 restart:
1107 	/*  Find listening sock. */
1108 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1109 	if (!other)
1110 		goto out;
1111 
1112 	/* Latch state of peer */
1113 	unix_state_lock(other);
1114 
1115 	/* Apparently VFS overslept socket death. Retry. */
1116 	if (sock_flag(other, SOCK_DEAD)) {
1117 		unix_state_unlock(other);
1118 		sock_put(other);
1119 		goto restart;
1120 	}
1121 
1122 	err = -ECONNREFUSED;
1123 	if (other->sk_state != TCP_LISTEN)
1124 		goto out_unlock;
1125 	if (other->sk_shutdown & RCV_SHUTDOWN)
1126 		goto out_unlock;
1127 
1128 	if (unix_recvq_full(other)) {
1129 		err = -EAGAIN;
1130 		if (!timeo)
1131 			goto out_unlock;
1132 
1133 		timeo = unix_wait_for_peer(other, timeo);
1134 
1135 		err = sock_intr_errno(timeo);
1136 		if (signal_pending(current))
1137 			goto out;
1138 		sock_put(other);
1139 		goto restart;
1140 	}
1141 
1142 	/* Latch our state.
1143 
1144 	   This is a tricky place. We need to grab our state lock and cannot
1145 	   drop the lock on the peer. It is dangerous because a deadlock is
1146 	   possible. The connect-to-self case and simultaneous
1147 	   attempts to connect are eliminated by checking the socket
1148 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
1149 	   check this before attempting to grab the lock.
1150 
1151 	   Well, and we have to recheck the state after the socket is locked.
1152 	 */
1153 	st = sk->sk_state;
1154 
1155 	switch (st) {
1156 	case TCP_CLOSE:
1157 		/* This is ok... continue with connect */
1158 		break;
1159 	case TCP_ESTABLISHED:
1160 		/* Socket is already connected */
1161 		err = -EISCONN;
1162 		goto out_unlock;
1163 	default:
1164 		err = -EINVAL;
1165 		goto out_unlock;
1166 	}
1167 
1168 	unix_state_lock_nested(sk);
1169 
1170 	if (sk->sk_state != st) {
1171 		unix_state_unlock(sk);
1172 		unix_state_unlock(other);
1173 		sock_put(other);
1174 		goto restart;
1175 	}
1176 
1177 	err = security_unix_stream_connect(sk, other, newsk);
1178 	if (err) {
1179 		unix_state_unlock(sk);
1180 		goto out_unlock;
1181 	}
1182 
1183 	/* The way is open! Quickly set all the necessary fields... */
1184 
1185 	sock_hold(sk);
1186 	unix_peer(newsk)	= sk;
1187 	newsk->sk_state		= TCP_ESTABLISHED;
1188 	newsk->sk_type		= sk->sk_type;
1189 	init_peercred(newsk);
1190 	newu = unix_sk(newsk);
1191 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1192 	otheru = unix_sk(other);
1193 
1194 	/* copy address information from listening to new sock */
1195 	if (otheru->addr) {
1196 		atomic_inc(&otheru->addr->refcnt);
1197 		newu->addr = otheru->addr;
1198 	}
1199 	if (otheru->path.dentry) {
1200 		path_get(&otheru->path);
1201 		newu->path = otheru->path;
1202 	}
1203 
1204 	/* Set credentials */
1205 	copy_peercred(sk, other);
1206 
1207 	sock->state	= SS_CONNECTED;
1208 	sk->sk_state	= TCP_ESTABLISHED;
1209 	sock_hold(newsk);
1210 
1211 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1212 	unix_peer(sk)	= newsk;
1213 
1214 	unix_state_unlock(sk);
1215 
1216 	/* take ten and send info to listening sock */
1217 	spin_lock(&other->sk_receive_queue.lock);
1218 	__skb_queue_tail(&other->sk_receive_queue, skb);
1219 	spin_unlock(&other->sk_receive_queue.lock);
1220 	unix_state_unlock(other);
1221 	other->sk_data_ready(other);
1222 	sock_put(other);
1223 	return 0;
1224 
1225 out_unlock:
1226 	if (other)
1227 		unix_state_unlock(other);
1228 
1229 out:
1230 	kfree_skb(skb);
1231 	if (newsk)
1232 		unix_release_sock(newsk, 0);
1233 	if (other)
1234 		sock_put(other);
1235 	return err;
1236 }
1237 
1238 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1239 {
1240 	struct sock *ska = socka->sk, *skb = sockb->sk;
1241 
1242 	/* Join our sockets back to back */
1243 	sock_hold(ska);
1244 	sock_hold(skb);
1245 	unix_peer(ska) = skb;
1246 	unix_peer(skb) = ska;
1247 	init_peercred(ska);
1248 	init_peercred(skb);
1249 
1250 	if (ska->sk_type != SOCK_DGRAM) {
1251 		ska->sk_state = TCP_ESTABLISHED;
1252 		skb->sk_state = TCP_ESTABLISHED;
1253 		socka->state  = SS_CONNECTED;
1254 		sockb->state  = SS_CONNECTED;
1255 	}
1256 	return 0;
1257 }
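/*
 * Editorial sketch of the caller's view: this is what backs, e.g.,
 * socketpair(AF_UNIX, SOCK_STREAM, 0, sv); on return sv[0] and sv[1]
 * are connected peers with peer credentials already initialized.
 */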
1258 
1259 static void unix_sock_inherit_flags(const struct socket *old,
1260 				    struct socket *new)
1261 {
1262 	if (test_bit(SOCK_PASSCRED, &old->flags))
1263 		set_bit(SOCK_PASSCRED, &new->flags);
1264 	if (test_bit(SOCK_PASSSEC, &old->flags))
1265 		set_bit(SOCK_PASSSEC, &new->flags);
1266 }
1267 
1268 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1269 {
1270 	struct sock *sk = sock->sk;
1271 	struct sock *tsk;
1272 	struct sk_buff *skb;
1273 	int err;
1274 
1275 	err = -EOPNOTSUPP;
1276 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1277 		goto out;
1278 
1279 	err = -EINVAL;
1280 	if (sk->sk_state != TCP_LISTEN)
1281 		goto out;
1282 
1283 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1284 	 * so no locks are necessary.
1285 	 */
1286 
1287 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1288 	if (!skb) {
1289 		/* This means receive shutdown. */
1290 		if (err == 0)
1291 			err = -EINVAL;
1292 		goto out;
1293 	}
1294 
1295 	tsk = skb->sk;
1296 	skb_free_datagram(sk, skb);
1297 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1298 
1299 	/* attach accepted sock to socket */
1300 	unix_state_lock(tsk);
1301 	newsock->state = SS_CONNECTED;
1302 	unix_sock_inherit_flags(sock, newsock);
1303 	sock_graft(tsk, newsock);
1304 	unix_state_unlock(tsk);
1305 	return 0;
1306 
1307 out:
1308 	return err;
1309 }
1310 
1311 
1312 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1313 {
1314 	struct sock *sk = sock->sk;
1315 	struct unix_sock *u;
1316 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1317 	int err = 0;
1318 
1319 	if (peer) {
1320 		sk = unix_peer_get(sk);
1321 
1322 		err = -ENOTCONN;
1323 		if (!sk)
1324 			goto out;
1325 		err = 0;
1326 	} else {
1327 		sock_hold(sk);
1328 	}
1329 
1330 	u = unix_sk(sk);
1331 	unix_state_lock(sk);
1332 	if (!u->addr) {
1333 		sunaddr->sun_family = AF_UNIX;
1334 		sunaddr->sun_path[0] = 0;
1335 		*uaddr_len = sizeof(short);
1336 	} else {
1337 		struct unix_address *addr = u->addr;
1338 
1339 		*uaddr_len = addr->len;
1340 		memcpy(sunaddr, addr->name, *uaddr_len);
1341 	}
1342 	unix_state_unlock(sk);
1343 	sock_put(sk);
1344 out:
1345 	return err;
1346 }
1347 
1348 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1349 {
1350 	int i;
1351 
1352 	scm->fp = UNIXCB(skb).fp;
1353 	UNIXCB(skb).fp = NULL;
1354 
1355 	for (i = scm->fp->count-1; i >= 0; i--)
1356 		unix_notinflight(scm->fp->fp[i]);
1357 }
1358 
1359 static void unix_destruct_scm(struct sk_buff *skb)
1360 {
1361 	struct scm_cookie scm;
1362 	memset(&scm, 0, sizeof(scm));
1363 	scm.pid  = UNIXCB(skb).pid;
1364 	if (UNIXCB(skb).fp)
1365 		unix_detach_fds(&scm, skb);
1366 
1367 	/* Alas, it calls VFS */
1368 	/* So fscking what? fput() has been SMP-safe since last summer */
1369 	scm_destroy(&scm);
1370 	sock_wfree(skb);
1371 }
1372 
1373 #define MAX_RECURSION_LEVEL 4
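/* Editor's note: the recursion level tracks how deeply AF_UNIX sockets
 * are nested inside SCM_RIGHTS messages (sockets passed over sockets);
 * capping it helps bound garbage collector work.
 */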
1374 
1375 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1376 {
1377 	int i;
1378 	unsigned char max_level = 0;
1379 	int unix_sock_count = 0;
1380 
1381 	for (i = scm->fp->count - 1; i >= 0; i--) {
1382 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1383 
1384 		if (sk) {
1385 			unix_sock_count++;
1386 			max_level = max(max_level,
1387 					unix_sk(sk)->recursion_level);
1388 		}
1389 	}
1390 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1391 		return -ETOOMANYREFS;
1392 
1393 	/*
1394 	 * Need to duplicate file references for the sake of garbage
1395 	 * collection.  Otherwise a socket in the fps might become a
1396 	 * candidate for GC while the skb is not yet queued.
1397 	 */
1398 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1399 	if (!UNIXCB(skb).fp)
1400 		return -ENOMEM;
1401 
1402 	if (unix_sock_count) {
1403 		for (i = scm->fp->count - 1; i >= 0; i--)
1404 			unix_inflight(scm->fp->fp[i]);
1405 	}
1406 	return max_level;
1407 }
1408 
1409 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1410 {
1411 	int err = 0;
1412 
1413 	UNIXCB(skb).pid  = get_pid(scm->pid);
1414 	UNIXCB(skb).uid = scm->creds.uid;
1415 	UNIXCB(skb).gid = scm->creds.gid;
1416 	UNIXCB(skb).fp = NULL;
1417 	if (scm->fp && send_fds)
1418 		err = unix_attach_fds(scm, skb);
1419 
1420 	skb->destructor = unix_destruct_scm;
1421 	return err;
1422 }
1423 
1424 /*
1425  * Some apps rely on write() giving SCM_CREDENTIALS.
1426  * We include credentials if the source or destination socket
1427  * asserted SOCK_PASSCRED.
1428  */
1429 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1430 			    const struct sock *other)
1431 {
1432 	if (UNIXCB(skb).pid)
1433 		return;
1434 	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1435 	    !other->sk_socket ||
1436 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1437 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1438 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1439 	}
1440 }
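/*
 * Editorial sketch (userspace, standard cmsg macros): a receiver that
 * enabled SO_PASSCRED picks these credentials up as
 *
 *	struct cmsghdr *c = CMSG_FIRSTHDR(&msg);
 *	if (c && c->cmsg_level == SOL_SOCKET &&
 *	    c->cmsg_type == SCM_CREDENTIALS) {
 *		struct ucred *uc = (struct ucred *)CMSG_DATA(c);
 *		... uc->pid, uc->uid, uc->gid ...
 *	}
 */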
1441 
1442 /*
1443  *	Send AF_UNIX data.
1444  */
1445 
1446 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1447 			      size_t len)
1448 {
1449 	struct sock *sk = sock->sk;
1450 	struct net *net = sock_net(sk);
1451 	struct unix_sock *u = unix_sk(sk);
1452 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1453 	struct sock *other = NULL;
1454 	int namelen = 0; /* silence a GCC warning */
1455 	int err;
1456 	unsigned int hash;
1457 	struct sk_buff *skb;
1458 	long timeo;
1459 	struct scm_cookie scm;
1460 	int max_level;
1461 	int data_len = 0;
1462 
1463 	wait_for_unix_gc();
1464 	err = scm_send(sock, msg, &scm, false);
1465 	if (err < 0)
1466 		return err;
1467 
1468 	err = -EOPNOTSUPP;
1469 	if (msg->msg_flags&MSG_OOB)
1470 		goto out;
1471 
1472 	if (msg->msg_namelen) {
1473 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1474 		if (err < 0)
1475 			goto out;
1476 		namelen = err;
1477 	} else {
1478 		sunaddr = NULL;
1479 		err = -ENOTCONN;
1480 		other = unix_peer_get(sk);
1481 		if (!other)
1482 			goto out;
1483 	}
1484 
1485 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1486 	    && (err = unix_autobind(sock)) != 0)
1487 		goto out;
1488 
1489 	err = -EMSGSIZE;
1490 	if (len > sk->sk_sndbuf - 32)
1491 		goto out;
1492 
1493 	if (len > SKB_MAX_ALLOC) {
1494 		data_len = min_t(size_t,
1495 				 len - SKB_MAX_ALLOC,
1496 				 MAX_SKB_FRAGS * PAGE_SIZE);
1497 		data_len = PAGE_ALIGN(data_len);
1498 
1499 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1500 	}
1501 
1502 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1503 				   msg->msg_flags & MSG_DONTWAIT, &err,
1504 				   PAGE_ALLOC_COSTLY_ORDER);
1505 	if (skb == NULL)
1506 		goto out;
1507 
1508 	err = unix_scm_to_skb(&scm, skb, true);
1509 	if (err < 0)
1510 		goto out_free;
1511 	max_level = err + 1;
1512 	unix_get_secdata(&scm, skb);
1513 
1514 	skb_put(skb, len - data_len);
1515 	skb->data_len = data_len;
1516 	skb->len = len;
1517 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1518 	if (err)
1519 		goto out_free;
1520 
1521 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1522 
1523 restart:
1524 	if (!other) {
1525 		err = -ECONNRESET;
1526 		if (sunaddr == NULL)
1527 			goto out_free;
1528 
1529 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1530 					hash, &err);
1531 		if (other == NULL)
1532 			goto out_free;
1533 	}
1534 
1535 	if (sk_filter(other, skb) < 0) {
1536 		/* Toss the packet but do not return any error to the sender */
1537 		err = len;
1538 		goto out_free;
1539 	}
1540 
1541 	unix_state_lock(other);
1542 	err = -EPERM;
1543 	if (!unix_may_send(sk, other))
1544 		goto out_unlock;
1545 
1546 	if (sock_flag(other, SOCK_DEAD)) {
1547 		/*
1548 		 *	Check with 1003.1g - what should a
1549 		 *	datagram error do here?
1550 		 */
1551 		unix_state_unlock(other);
1552 		sock_put(other);
1553 
1554 		err = 0;
1555 		unix_state_lock(sk);
1556 		if (unix_peer(sk) == other) {
1557 			unix_peer(sk) = NULL;
1558 			unix_state_unlock(sk);
1559 
1560 			unix_dgram_disconnected(sk, other);
1561 			sock_put(other);
1562 			err = -ECONNREFUSED;
1563 		} else {
1564 			unix_state_unlock(sk);
1565 		}
1566 
1567 		other = NULL;
1568 		if (err)
1569 			goto out_free;
1570 		goto restart;
1571 	}
1572 
1573 	err = -EPIPE;
1574 	if (other->sk_shutdown & RCV_SHUTDOWN)
1575 		goto out_unlock;
1576 
1577 	if (sk->sk_type != SOCK_SEQPACKET) {
1578 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1579 		if (err)
1580 			goto out_unlock;
1581 	}
1582 
1583 	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1584 		if (!timeo) {
1585 			err = -EAGAIN;
1586 			goto out_unlock;
1587 		}
1588 
1589 		timeo = unix_wait_for_peer(other, timeo);
1590 
1591 		err = sock_intr_errno(timeo);
1592 		if (signal_pending(current))
1593 			goto out_free;
1594 
1595 		goto restart;
1596 	}
1597 
1598 	if (sock_flag(other, SOCK_RCVTSTAMP))
1599 		__net_timestamp(skb);
1600 	maybe_add_creds(skb, sock, other);
1601 	skb_queue_tail(&other->sk_receive_queue, skb);
1602 	if (max_level > unix_sk(other)->recursion_level)
1603 		unix_sk(other)->recursion_level = max_level;
1604 	unix_state_unlock(other);
1605 	other->sk_data_ready(other);
1606 	sock_put(other);
1607 	scm_destroy(&scm);
1608 	return len;
1609 
1610 out_unlock:
1611 	unix_state_unlock(other);
1612 out_free:
1613 	kfree_skb(skb);
1614 out:
1615 	if (other)
1616 		sock_put(other);
1617 	scm_destroy(&scm);
1618 	return err;
1619 }
1620 
1621 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1622  * bytes, and a minimum of a full page.
1623  */
1624 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
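/* Editor's note: with 4 KiB pages this is exactly 32768 bytes; with
 * larger pages get_order(32768) is 0 and it rounds up to one full page.
 */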
1625 
1626 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1627 			       size_t len)
1628 {
1629 	struct sock *sk = sock->sk;
1630 	struct sock *other = NULL;
1631 	int err, size;
1632 	struct sk_buff *skb;
1633 	int sent = 0;
1634 	struct scm_cookie scm;
1635 	bool fds_sent = false;
1636 	int max_level;
1637 	int data_len;
1638 
1639 	wait_for_unix_gc();
1640 	err = scm_send(sock, msg, &scm, false);
1641 	if (err < 0)
1642 		return err;
1643 
1644 	err = -EOPNOTSUPP;
1645 	if (msg->msg_flags&MSG_OOB)
1646 		goto out_err;
1647 
1648 	if (msg->msg_namelen) {
1649 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1650 		goto out_err;
1651 	} else {
1652 		err = -ENOTCONN;
1653 		other = unix_peer(sk);
1654 		if (!other)
1655 			goto out_err;
1656 	}
1657 
1658 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1659 		goto pipe_err;
1660 
1661 	while (sent < len) {
1662 		size = len - sent;
1663 
1664 		/* Keep two messages in the pipe so it schedules better */
1665 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1666 
1667 		/* allow fallback to order-0 allocations */
1668 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1669 
1670 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1671 
1672 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1673 
1674 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1675 					   msg->msg_flags & MSG_DONTWAIT, &err,
1676 					   get_order(UNIX_SKB_FRAGS_SZ));
1677 		if (!skb)
1678 			goto out_err;
1679 
1680 		/* Only send the fds in the first buffer */
1681 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1682 		if (err < 0) {
1683 			kfree_skb(skb);
1684 			goto out_err;
1685 		}
1686 		max_level = err + 1;
1687 		fds_sent = true;
1688 
1689 		skb_put(skb, size - data_len);
1690 		skb->data_len = data_len;
1691 		skb->len = size;
1692 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1693 		if (err) {
1694 			kfree_skb(skb);
1695 			goto out_err;
1696 		}
1697 
1698 		unix_state_lock(other);
1699 
1700 		if (sock_flag(other, SOCK_DEAD) ||
1701 		    (other->sk_shutdown & RCV_SHUTDOWN))
1702 			goto pipe_err_free;
1703 
1704 		maybe_add_creds(skb, sock, other);
1705 		skb_queue_tail(&other->sk_receive_queue, skb);
1706 		if (max_level > unix_sk(other)->recursion_level)
1707 			unix_sk(other)->recursion_level = max_level;
1708 		unix_state_unlock(other);
1709 		other->sk_data_ready(other);
1710 		sent += size;
1711 	}
1712 
1713 	scm_destroy(&scm);
1714 
1715 	return sent;
1716 
1717 pipe_err_free:
1718 	unix_state_unlock(other);
1719 	kfree_skb(skb);
1720 pipe_err:
1721 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1722 		send_sig(SIGPIPE, current, 0);
1723 	err = -EPIPE;
1724 out_err:
1725 	scm_destroy(&scm);
1726 	return sent ? : err;
1727 }
1728 
1729 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1730 				    int offset, size_t size, int flags)
1731 {
1732 	int err = 0;
1733 	bool send_sigpipe = true;
1734 	struct sock *other, *sk = socket->sk;
1735 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1736 
1737 	if (flags & MSG_OOB)
1738 		return -EOPNOTSUPP;
1739 
1740 	other = unix_peer(sk);
1741 	if (!other || sk->sk_state != TCP_ESTABLISHED)
1742 		return -ENOTCONN;
1743 
1744 	if (false) {
1745 alloc_skb:
1746 		unix_state_unlock(other);
1747 		mutex_unlock(&unix_sk(other)->readlock);
1748 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1749 					      &err, 0);
1750 		if (!newskb)
1751 			return err;
1752 	}
1753 
1754 	/* we must acquire the readlock as we modify already-present
1755 	 * skbs in the sk_receive_queue and mess with skb->len
1756 	 */
1757 	err = mutex_lock_interruptible(&unix_sk(other)->readlock);
1758 	if (err) {
1759 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1760 		send_sigpipe = false;
1761 		goto err;
1762 	}
1763 
1764 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
1765 		err = -EPIPE;
1766 		goto err_unlock;
1767 	}
1768 
1769 	unix_state_lock(other);
1770 
1771 	if (sock_flag(other, SOCK_DEAD) ||
1772 	    other->sk_shutdown & RCV_SHUTDOWN) {
1773 		err = -EPIPE;
1774 		goto err_state_unlock;
1775 	}
1776 
1777 	skb = skb_peek_tail(&other->sk_receive_queue);
1778 	if (tail && tail == skb) {
1779 		skb = newskb;
1780 	} else if (!skb) {
1781 		if (newskb)
1782 			skb = newskb;
1783 		else
1784 			goto alloc_skb;
1785 	} else if (newskb) {
1786 		/* this is the fast path; consume_skb() would do no
1787 		 * harm even if newskb were NULL, so there is no need
1788 		 * for a separate kfree_skb() call
1789 		 */
1790 		consume_skb(newskb);
1791 	}
1792 
1793 	if (skb_append_pagefrags(skb, page, offset, size)) {
1794 		tail = skb;
1795 		goto alloc_skb;
1796 	}
1797 
1798 	skb->len += size;
1799 	skb->data_len += size;
1800 	skb->truesize += size;
1801 	atomic_add(size, &sk->sk_wmem_alloc);
1802 
1803 	if (newskb)
1804 		__skb_queue_tail(&other->sk_receive_queue, newskb);
1805 
1806 	unix_state_unlock(other);
1807 	mutex_unlock(&unix_sk(other)->readlock);
1808 
1809 	other->sk_data_ready(other);
1810 
1811 	return size;
1812 
1813 err_state_unlock:
1814 	unix_state_unlock(other);
1815 err_unlock:
1816 	mutex_unlock(&unix_sk(other)->readlock);
1817 err:
1818 	kfree_skb(newskb);
1819 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
1820 		send_sig(SIGPIPE, current, 0);
1821 	return err;
1822 }
1823 
1824 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
1825 				  size_t len)
1826 {
1827 	int err;
1828 	struct sock *sk = sock->sk;
1829 
1830 	err = sock_error(sk);
1831 	if (err)
1832 		return err;
1833 
1834 	if (sk->sk_state != TCP_ESTABLISHED)
1835 		return -ENOTCONN;
1836 
1837 	if (msg->msg_namelen)
1838 		msg->msg_namelen = 0;
1839 
1840 	return unix_dgram_sendmsg(sock, msg, len);
1841 }
1842 
1843 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
1844 				  size_t size, int flags)
1845 {
1846 	struct sock *sk = sock->sk;
1847 
1848 	if (sk->sk_state != TCP_ESTABLISHED)
1849 		return -ENOTCONN;
1850 
1851 	return unix_dgram_recvmsg(sock, msg, size, flags);
1852 }
1853 
1854 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1855 {
1856 	struct unix_sock *u = unix_sk(sk);
1857 
1858 	if (u->addr) {
1859 		msg->msg_namelen = u->addr->len;
1860 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1861 	}
1862 }
1863 
1864 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
1865 			      size_t size, int flags)
1866 {
1867 	struct scm_cookie scm;
1868 	struct sock *sk = sock->sk;
1869 	struct unix_sock *u = unix_sk(sk);
1870 	int noblock = flags & MSG_DONTWAIT;
1871 	struct sk_buff *skb;
1872 	int err;
1873 	int peeked, skip;
1874 
1875 	err = -EOPNOTSUPP;
1876 	if (flags&MSG_OOB)
1877 		goto out;
1878 
1879 	err = mutex_lock_interruptible(&u->readlock);
1880 	if (unlikely(err)) {
1881 		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
1882 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1883 		 */
1884 		err = noblock ? -EAGAIN : -ERESTARTSYS;
1885 		goto out;
1886 	}
1887 
1888 	skip = sk_peek_offset(sk, flags);
1889 
1890 	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1891 	if (!skb) {
1892 		unix_state_lock(sk);
1893 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1894 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1895 		    (sk->sk_shutdown & RCV_SHUTDOWN))
1896 			err = 0;
1897 		unix_state_unlock(sk);
1898 		goto out_unlock;
1899 	}
1900 
1901 	wake_up_interruptible_sync_poll(&u->peer_wait,
1902 					POLLOUT | POLLWRNORM | POLLWRBAND);
1903 
1904 	if (msg->msg_name)
1905 		unix_copy_addr(msg, skb->sk);
1906 
1907 	if (size > skb->len - skip)
1908 		size = skb->len - skip;
1909 	else if (size < skb->len - skip)
1910 		msg->msg_flags |= MSG_TRUNC;
1911 
1912 	err = skb_copy_datagram_msg(skb, skip, msg, size);
1913 	if (err)
1914 		goto out_free;
1915 
1916 	if (sock_flag(sk, SOCK_RCVTSTAMP))
1917 		__sock_recv_timestamp(msg, sk, skb);
1918 
1919 	memset(&scm, 0, sizeof(scm));
1920 
1921 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1922 	unix_set_secdata(&scm, skb);
1923 
1924 	if (!(flags & MSG_PEEK)) {
1925 		if (UNIXCB(skb).fp)
1926 			unix_detach_fds(&scm, skb);
1927 
1928 		sk_peek_offset_bwd(sk, skb->len);
1929 	} else {
1930 		/* It is questionable: on PEEK we could:
1931 		   - not return fds - good, but too simple 8)
1932 		   - return fds, and do not return them on read (old strategy,
1933 		     apparently wrong)
1934 		   - clone fds (I chose it for now, it is the most universal
1935 		     solution)
1936 
1937 		   POSIX 1003.1g does not actually define this clearly
1938 		   at all. POSIX 1003.1g doesn't define a lot of things
1939 		   clearly, however!
1940 
1941 		*/
1942 
1943 		sk_peek_offset_fwd(sk, size);
1944 
1945 		if (UNIXCB(skb).fp)
1946 			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
1947 	}
1948 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1949 
1950 	scm_recv(sock, msg, &scm, flags);
1951 
1952 out_free:
1953 	skb_free_datagram(sk, skb);
1954 out_unlock:
1955 	mutex_unlock(&u->readlock);
1956 out:
1957 	return err;
1958 }
1959 
1960 /*
1961  *	Sleep until more data has arrived. But check for races.
1962  */
1963 static long unix_stream_data_wait(struct sock *sk, long timeo,
1964 				  struct sk_buff *last, unsigned int last_len)
1965 {
1966 	struct sk_buff *tail;
1967 	DEFINE_WAIT(wait);
1968 
1969 	unix_state_lock(sk);
1970 
1971 	for (;;) {
1972 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1973 
1974 		tail = skb_peek_tail(&sk->sk_receive_queue);
1975 		if (tail != last ||
1976 		    (tail && tail->len != last_len) ||
1977 		    sk->sk_err ||
1978 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1979 		    signal_pending(current) ||
1980 		    !timeo)
1981 			break;
1982 
1983 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1984 		unix_state_unlock(sk);
1985 		timeo = freezable_schedule_timeout(timeo);
1986 		unix_state_lock(sk);
1987 
1988 		if (sock_flag(sk, SOCK_DEAD))
1989 			break;
1990 
1991 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1992 	}
1993 
1994 	finish_wait(sk_sleep(sk), &wait);
1995 	unix_state_unlock(sk);
1996 	return timeo;
1997 }
1998 
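/* Bytes of @skb that a stream reader has not yet consumed; stream reads may
 * stop mid-skb, recording progress in UNIXCB(skb).consumed.
 */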
1999 static unsigned int unix_skb_len(const struct sk_buff *skb)
2000 {
2001 	return skb->len - UNIXCB(skb).consumed;
2002 }
2003 
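/* Shared state for the two stream receive paths: unix_stream_recvmsg() plugs
 * in an actor that copies into a msghdr, unix_stream_splice_read() one that
 * splices into a pipe. The actor returns the number of bytes consumed, or a
 * negative error.
 */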
2004 struct unix_stream_read_state {
2005 	int (*recv_actor)(struct sk_buff *, int, int,
2006 			  struct unix_stream_read_state *);
2007 	struct socket *socket;
2008 	struct msghdr *msg;
2009 	struct pipe_inode_info *pipe;
2010 	size_t size;
2011 	int flags;
2012 	unsigned int splice_flags;
2013 };
2014 
2015 static int unix_stream_read_generic(struct unix_stream_read_state *state)
2016 {
2017 	struct scm_cookie scm;
2018 	struct socket *sock = state->socket;
2019 	struct sock *sk = sock->sk;
2020 	struct unix_sock *u = unix_sk(sk);
2021 	int copied = 0;
2022 	int flags = state->flags;
2023 	int noblock = flags & MSG_DONTWAIT;
2024 	bool check_creds = false;
2025 	int target;
2026 	int err = 0;
2027 	long timeo;
2028 	int skip;
2029 	size_t size = state->size;
2030 	unsigned int last_len;
2031 
2032 	err = -EINVAL;
2033 	if (sk->sk_state != TCP_ESTABLISHED)
2034 		goto out;
2035 
2036 	err = -EOPNOTSUPP;
2037 	if (flags & MSG_OOB)
2038 		goto out;
2039 
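	/* MSG_WAITALL raises the minimum number of bytes to copy before
	 * returning to the full request size; otherwise SO_RCVLOWAT applies.
	 * MSG_DONTWAIT selects a zero receive timeout.
	 */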
2040 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2041 	timeo = sock_rcvtimeo(sk, noblock);
2042 
2043 	memset(&scm, 0, sizeof(scm));
2044 
2045 	/* Lock the socket to prevent the queue from being reordered
2046 	 * while we sleep in memcpy_to_msg().
2047 	 */
2048 	err = mutex_lock_interruptible(&u->readlock);
2049 	if (unlikely(err)) {
2050 		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
2051 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible().
2052 		 */
2053 		err = noblock ? -EAGAIN : -ERESTARTSYS;
2054 		goto out;
2055 	}
2056 
2057 	do {
2058 		int chunk;
2059 		struct sk_buff *skb, *last;
2060 
2061 		unix_state_lock(sk);
2062 		if (sock_flag(sk, SOCK_DEAD)) {
2063 			err = -ECONNRESET;
2064 			goto unlock;
2065 		}
2066 		last = skb = skb_peek(&sk->sk_receive_queue);
2067 		last_len = last ? last->len : 0;
2068 again:
2069 		if (skb == NULL) {
2070 			unix_sk(sk)->recursion_level = 0;
2071 			if (copied >= target)
2072 				goto unlock;
2073 
2074 			/*
2075 			 *	POSIX 1003.1g mandates this order.
2076 			 */
2077 
2078 			err = sock_error(sk);
2079 			if (err)
2080 				goto unlock;
2081 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2082 				goto unlock;
2083 
2084 			unix_state_unlock(sk);
2085 			err = -EAGAIN;
2086 			if (!timeo)
2087 				break;
2088 			mutex_unlock(&u->readlock);
2089 
2090 			timeo = unix_stream_data_wait(sk, timeo, last,
2091 						      last_len);
2092 
2093 			if (signal_pending(current) ||
2094 			    mutex_lock_interruptible(&u->readlock)) {
2095 				err = sock_intr_errno(timeo);
2096 				goto out;
2097 			}
2098 
2099 			continue;
2100 unlock:
2101 			unix_state_unlock(sk);
2102 			break;
2103 		}
2104 
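		/* Honour SO_PEEK_OFF: skip over data that an earlier peek
		 * already returned, walking across fully-consumed skbs.
		 */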
2105 		skip = sk_peek_offset(sk, flags);
2106 		while (skip >= unix_skb_len(skb)) {
2107 			skip -= unix_skb_len(skb);
2108 			last = skb;
2109 			last_len = skb->len;
2110 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2111 			if (!skb)
2112 				goto again;
2113 		}
2114 
2115 		unix_state_unlock(sk);
2116 
2117 		if (check_creds) {
2118 			/* Never glue messages from different writers */
2119 			if ((UNIXCB(skb).pid != scm.pid) ||
2120 			    !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
2121 			    !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
2122 				break;
2123 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2124 			/* Copy credentials */
2125 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2126 			check_creds = true;
2127 		}
2128 
2129 		/* Copy address just once */
2130 		if (state->msg && state->msg->msg_name) {
2131 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2132 					 state->msg->msg_name);
2133 			unix_copy_addr(state->msg, skb->sk);
2134 			sunaddr = NULL;
2135 		}
2136 
2137 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2138 		chunk = state->recv_actor(skb, skip, chunk, state);
2139 		if (chunk < 0) {
2140 			if (copied == 0)
2141 				copied = -EFAULT;
2142 			break;
2143 		}
2144 		copied += chunk;
2145 		size -= chunk;
2146 
2147 		/* Mark read part of skb as used */
2148 		if (!(flags & MSG_PEEK)) {
2149 			UNIXCB(skb).consumed += chunk;
2150 
2151 			sk_peek_offset_bwd(sk, chunk);
2152 
2153 			if (UNIXCB(skb).fp)
2154 				unix_detach_fds(&scm, skb);
2155 
2156 			if (unix_skb_len(skb))
2157 				break;
2158 
2159 			skb_unlink(skb, &sk->sk_receive_queue);
2160 			consume_skb(skb);
2161 
2162 			if (scm.fp)
2163 				break;
2164 		} else {
2165 			/* Questionable; see the note in unix_dgram_recvmsg().
2166 			 */
2167 			if (UNIXCB(skb).fp)
2168 				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2169 
2170 			sk_peek_offset_fwd(sk, chunk);
2171 
2172 			break;
2173 		}
2174 	} while (size);
2175 
2176 	mutex_unlock(&u->readlock);
2177 	if (state->msg)
2178 		scm_recv(sock, state->msg, &scm, flags);
2179 	else
2180 		scm_destroy(&scm);
2181 out:
2182 	return copied ? : err;
2183 }
2184 
2185 static int unix_stream_read_actor(struct sk_buff *skb,
2186 				  int skip, int chunk,
2187 				  struct unix_stream_read_state *state)
2188 {
2189 	int ret;
2190 
2191 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2192 				    state->msg, chunk);
2193 	return ret ?: chunk;
2194 }
2195 
2196 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2197 			       size_t size, int flags)
2198 {
2199 	struct unix_stream_read_state state = {
2200 		.recv_actor = unix_stream_read_actor,
2201 		.socket = sock,
2202 		.msg = msg,
2203 		.size = size,
2204 		.flags = flags
2205 	};
2206 
2207 	return unix_stream_read_generic(&state);
2208 }
2209 
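/* splice_to_pipe() may sleep waiting for room in the pipe; drop the
 * per-socket readlock around it so other readers are not blocked for the
 * duration.
 */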
2210 static ssize_t skb_unix_socket_splice(struct sock *sk,
2211 				      struct pipe_inode_info *pipe,
2212 				      struct splice_pipe_desc *spd)
2213 {
2214 	int ret;
2215 	struct unix_sock *u = unix_sk(sk);
2216 
2217 	mutex_unlock(&u->readlock);
2218 	ret = splice_to_pipe(pipe, spd);
2219 	mutex_lock(&u->readlock);
2220 
2221 	return ret;
2222 }
2223 
2224 static int unix_stream_splice_actor(struct sk_buff *skb,
2225 				    int skip, int chunk,
2226 				    struct unix_stream_read_state *state)
2227 {
2228 	return skb_splice_bits(skb, state->socket->sk,
2229 			       UNIXCB(skb).consumed + skip,
2230 			       state->pipe, chunk, state->splice_flags,
2231 			       skb_unix_socket_splice);
2232 }
2233 
2234 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2235 				       struct pipe_inode_info *pipe,
2236 				       size_t size, unsigned int flags)
2237 {
2238 	struct unix_stream_read_state state = {
2239 		.recv_actor = unix_stream_splice_actor,
2240 		.socket = sock,
2241 		.pipe = pipe,
2242 		.size = size,
2243 		.splice_flags = flags,
2244 	};
2245 
2246 	if (unlikely(*ppos))
2247 		return -ESPIPE;
2248 
2249 	if (sock->file->f_flags & O_NONBLOCK ||
2250 	    flags & SPLICE_F_NONBLOCK)
2251 		state.flags = MSG_DONTWAIT;
2252 
2253 	return unix_stream_read_generic(&state);
2254 }
2255 
2256 static int unix_shutdown(struct socket *sock, int mode)
2257 {
2258 	struct sock *sk = sock->sk;
2259 	struct sock *other;
2260 
2261 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2262 		return -EINVAL;
2263 	/* This maps:
2264 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2265 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2266 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2267 	 */
2268 	++mode;
2269 
2270 	unix_state_lock(sk);
2271 	sk->sk_shutdown |= mode;
2272 	other = unix_peer(sk);
2273 	if (other)
2274 		sock_hold(other);
2275 	unix_state_unlock(sk);
2276 	sk->sk_state_change(sk);
2277 
2278 	if (other &&
2279 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2281 		int peer_mode = 0;
2282 
2283 		if (mode & RCV_SHUTDOWN)
2284 			peer_mode |= SEND_SHUTDOWN;
2285 		if (mode & SEND_SHUTDOWN)
2286 			peer_mode |= RCV_SHUTDOWN;
2287 		unix_state_lock(other);
2288 		other->sk_shutdown |= peer_mode;
2289 		unix_state_unlock(other);
2290 		other->sk_state_change(other);
2291 		if (peer_mode == SHUTDOWN_MASK)
2292 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2293 		else if (peer_mode & RCV_SHUTDOWN)
2294 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2295 	}
2296 	if (other)
2297 		sock_put(other);
2298 
2299 	return 0;
2300 }
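
/* Illustrative (hypothetical) effect on a connected stream pair (a, b):
 * after shutdown(a, SHUT_WR), b's sk_shutdown gains RCV_SHUTDOWN, so
 * poll(b) reports POLLIN | POLLRDHUP, and read(b) returns 0 (EOF) once
 * the receive queue drains.
 */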
2301 
2302 long unix_inq_len(struct sock *sk)
2303 {
2304 	struct sk_buff *skb;
2305 	long amount = 0;
2306 
2307 	if (sk->sk_state == TCP_LISTEN)
2308 		return -EINVAL;
2309 
2310 	spin_lock(&sk->sk_receive_queue.lock);
2311 	if (sk->sk_type == SOCK_STREAM ||
2312 	    sk->sk_type == SOCK_SEQPACKET) {
2313 		skb_queue_walk(&sk->sk_receive_queue, skb)
2314 			amount += unix_skb_len(skb);
2315 	} else {
2316 		skb = skb_peek(&sk->sk_receive_queue);
2317 		if (skb)
2318 			amount = skb->len;
2319 	}
2320 	spin_unlock(&sk->sk_receive_queue.lock);
2321 
2322 	return amount;
2323 }
2324 EXPORT_SYMBOL_GPL(unix_inq_len);
2325 
2326 long unix_outq_len(struct sock *sk)
2327 {
2328 	return sk_wmem_alloc_get(sk);
2329 }
2330 EXPORT_SYMBOL_GPL(unix_outq_len);
2331 
2332 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2333 {
2334 	struct sock *sk = sock->sk;
2335 	long amount = 0;
2336 	int err;
2337 
2338 	switch (cmd) {
2339 	case SIOCOUTQ:
2340 		amount = unix_outq_len(sk);
2341 		err = put_user(amount, (int __user *)arg);
2342 		break;
2343 	case SIOCINQ:
2344 		amount = unix_inq_len(sk);
2345 		if (amount < 0)
2346 			err = amount;
2347 		else
2348 			err = put_user(amount, (int __user *)arg);
2349 		break;
2350 	default:
2351 		err = -ENOIOCTLCMD;
2352 		break;
2353 	}
2354 	return err;
2355 }
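
/* A minimal (hypothetical) userspace sketch of the two ioctls handled
 * above; "fd" is assumed to be an AF_UNIX socket:
 *
 *	int pending;
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		;	// bytes queued for reading (unconsumed part, streams)
 *	if (ioctl(fd, SIOCOUTQ, &pending) == 0)
 *		;	// bytes written but not yet consumed by the receiver
 */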
2356 
2357 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2358 {
2359 	struct sock *sk = sock->sk;
2360 	unsigned int mask;
2361 
2362 	sock_poll_wait(file, sk_sleep(sk), wait);
2363 	mask = 0;
2364 
2365 	/* exceptional events? */
2366 	if (sk->sk_err)
2367 		mask |= POLLERR;
2368 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2369 		mask |= POLLHUP;
2370 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2371 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2372 
2373 	/* readable? */
2374 	if (!skb_queue_empty(&sk->sk_receive_queue))
2375 		mask |= POLLIN | POLLRDNORM;
2376 
2377 	/* Connection-based sockets need to check for termination and startup */
2378 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2379 	    sk->sk_state == TCP_CLOSE)
2380 		mask |= POLLHUP;
2381 
2382 	/*
2383 	 * We also report writable when the other side has shut down the
2384 	 * connection; this prevents sockets from getting stuck.
2385 	 */
2386 	if (unix_writable(sk))
2387 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2388 
2389 	return mask;
2390 }
2391 
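/* Datagram/seqpacket poll also consults the peer: when connected to a socket
 * that does not name us back as its peer (the n:1 datagram case), writability
 * additionally requires the peer's receive queue not to be full.
 */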
2392 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2393 				    poll_table *wait)
2394 {
2395 	struct sock *sk = sock->sk, *other;
2396 	unsigned int mask, writable;
2397 
2398 	sock_poll_wait(file, sk_sleep(sk), wait);
2399 	mask = 0;
2400 
2401 	/* exceptional events? */
2402 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2403 		mask |= POLLERR |
2404 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2405 
2406 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2407 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2408 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2409 		mask |= POLLHUP;
2410 
2411 	/* readable? */
2412 	if (!skb_queue_empty(&sk->sk_receive_queue))
2413 		mask |= POLLIN | POLLRDNORM;
2414 
2415 	/* Connection-based sockets need to check for termination and startup */
2416 	if (sk->sk_type == SOCK_SEQPACKET) {
2417 		if (sk->sk_state == TCP_CLOSE)
2418 			mask |= POLLHUP;
2419 		/* connection hasn't started yet? */
2420 		if (sk->sk_state == TCP_SYN_SENT)
2421 			return mask;
2422 	}
2423 
2424 	/* No write status requested, avoid expensive OUT tests. */
2425 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2426 		return mask;
2427 
2428 	writable = unix_writable(sk);
2429 	other = unix_peer_get(sk);
2430 	if (other) {
2431 		if (unix_peer(other) != sk) {
2432 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2433 			if (unix_recvq_full(other))
2434 				writable = 0;
2435 		}
2436 		sock_put(other);
2437 	}
2438 
2439 	if (writable)
2440 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2441 	else
2442 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2443 
2444 	return mask;
2445 }
2446 
2447 #ifdef CONFIG_PROC_FS
2448 
2449 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2450 
2451 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2452 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2453 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2454 
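/* The seq_file position packs a hash bucket into the high bits and a 1-based
 * offset within that bucket into the low BUCKET_SPACE bits (offset 0 is the
 * SEQ_START_TOKEN header line). For example, assuming a 64-bit kernel with
 * UNIX_HASH_BITS == 8, BUCKET_SPACE is 54.
 */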
2455 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2456 {
2457 	unsigned long offset = get_offset(*pos);
2458 	unsigned long bucket = get_bucket(*pos);
2459 	struct sock *sk;
2460 	unsigned long count = 0;
2461 
2462 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2463 		if (sock_net(sk) != seq_file_net(seq))
2464 			continue;
2465 		if (++count == offset)
2466 			break;
2467 	}
2468 
2469 	return sk;
2470 }
2471 
2472 static struct sock *unix_next_socket(struct seq_file *seq,
2473 				     struct sock *sk,
2474 				     loff_t *pos)
2475 {
2476 	unsigned long bucket;
2477 
2478 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2479 		sk = sk_next(sk);
2480 		if (!sk)
2481 			goto next_bucket;
2482 		if (sock_net(sk) == seq_file_net(seq))
2483 			return sk;
2484 	}
2485 
2486 	do {
2487 		sk = unix_from_bucket(seq, pos);
2488 		if (sk)
2489 			return sk;
2490 
2491 next_bucket:
2492 		bucket = get_bucket(*pos) + 1;
2493 		*pos = set_bucket_offset(bucket, 1);
2494 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2495 
2496 	return NULL;
2497 }
2498 
2499 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2500 	__acquires(unix_table_lock)
2501 {
2502 	spin_lock(&unix_table_lock);
2503 
2504 	if (!*pos)
2505 		return SEQ_START_TOKEN;
2506 
2507 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2508 		return NULL;
2509 
2510 	return unix_next_socket(seq, NULL, pos);
2511 }
2512 
2513 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2514 {
2515 	++*pos;
2516 	return unix_next_socket(seq, v, pos);
2517 }
2518 
2519 static void unix_seq_stop(struct seq_file *seq, void *v)
2520 	__releases(unix_table_lock)
2521 {
2522 	spin_unlock(&unix_table_lock);
2523 }
2524 
2525 static int unix_seq_show(struct seq_file *seq, void *v)
2526 {
2528 	if (v == SEQ_START_TOKEN)
2529 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2530 			 "Inode Path\n");
2531 	else {
2532 		struct sock *s = v;
2533 		struct unix_sock *u = unix_sk(s);
2534 		unix_state_lock(s);
2535 
2536 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2537 			s,
2538 			atomic_read(&s->sk_refcnt),
2539 			0,
2540 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2541 			s->sk_type,
2542 			s->sk_socket ?
2543 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2544 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2545 			sock_i_ino(s));
2546 
2547 		if (u->addr) {
2548 			int i, len;
2549 			seq_putc(seq, ' ');
2550 
2551 			i = 0;
2552 			len = u->addr->len - sizeof(short);
2553 			if (!UNIX_ABSTRACT(s))
2554 				len--;
2555 			else {
2556 				seq_putc(seq, '@');
2557 				i++;
2558 			}
2559 			for ( ; i < len; i++)
2560 				seq_putc(seq, u->addr->name->sun_path[i]);
2561 		}
2562 		unix_state_unlock(s);
2563 		seq_putc(seq, '\n');
2564 	}
2565 
2566 	return 0;
2567 }
2568 
2569 static const struct seq_operations unix_seq_ops = {
2570 	.start  = unix_seq_start,
2571 	.next   = unix_seq_next,
2572 	.stop   = unix_seq_stop,
2573 	.show   = unix_seq_show,
2574 };
2575 
2576 static int unix_seq_open(struct inode *inode, struct file *file)
2577 {
2578 	return seq_open_net(inode, file, &unix_seq_ops,
2579 			    sizeof(struct seq_net_private));
2580 }
2581 
2582 static const struct file_operations unix_seq_fops = {
2583 	.owner		= THIS_MODULE,
2584 	.open		= unix_seq_open,
2585 	.read		= seq_read,
2586 	.llseek		= seq_lseek,
2587 	.release	= seq_release_net,
2588 };
2589 
2590 #endif
2591 
2592 static const struct net_proto_family unix_family_ops = {
2593 	.family = PF_UNIX,
2594 	.create = unix_create,
2595 	.owner	= THIS_MODULE,
2596 };
2597 
2599 static int __net_init unix_net_init(struct net *net)
2600 {
2601 	int error = -ENOMEM;
2602 
2603 	net->unx.sysctl_max_dgram_qlen = 10;
2604 	if (unix_sysctl_register(net))
2605 		goto out;
2606 
2607 #ifdef CONFIG_PROC_FS
2608 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2609 		unix_sysctl_unregister(net);
2610 		goto out;
2611 	}
2612 #endif
2613 	error = 0;
2614 out:
2615 	return error;
2616 }
2617 
2618 static void __net_exit unix_net_exit(struct net *net)
2619 {
2620 	unix_sysctl_unregister(net);
2621 	remove_proc_entry("unix", net->proc_net);
2622 }
2623 
2624 static struct pernet_operations unix_net_ops = {
2625 	.init = unix_net_init,
2626 	.exit = unix_net_exit,
2627 };
2628 
2629 static int __init af_unix_init(void)
2630 {
2631 	int rc = -1;
2632 
2633 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2634 
2635 	rc = proto_register(&unix_proto, 1);
2636 	if (rc != 0) {
2637 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2638 		goto out;
2639 	}
2640 
2641 	sock_register(&unix_family_ops);
2642 	register_pernet_subsys(&unix_net_ops);
2643 out:
2644 	return rc;
2645 }
2646 
2647 static void __exit af_unix_exit(void)
2648 {
2649 	sock_unregister(PF_UNIX);
2650 	proto_unregister(&unix_proto);
2651 	unregister_pernet_subsys(&unix_net_ops);
2652 }
2653 
2654 /* Earlier than device_initcall() so that other drivers invoking
2655    request_module() don't end up in a loop when modprobe tries
2656    to use a UNIX socket. But later than subsys_initcall() because
2657    we depend on infrastructure initialised there. */
2658 fs_initcall(af_unix_init);
2659 module_exit(af_unix_exit);
2660 
2661 MODULE_LICENSE("GPL");
2662 MODULE_ALIAS_NETPROTO(PF_UNIX);
2663