xref: /linux/net/unix/af_unix.c (revision 0cf55934ecace74bb7d26c0e9679fb41675a8903)
1  /*
2   * NET4:	Implementation of BSD Unix domain sockets.
3   *
4   * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5   *
6   *		This program is free software; you can redistribute it and/or
7   *		modify it under the terms of the GNU General Public License
8   *		as published by the Free Software Foundation; either version
9   *		2 of the License, or (at your option) any later version.
10   *
11   * Fixes:
12   *		Linus Torvalds	:	Assorted bug cures.
13   *		Niibe Yutaka	:	async I/O support.
14   *		Carsten Paeth	:	PF_UNIX check, address fixes.
15   *		Alan Cox	:	Limit size of allocated blocks.
16   *		Alan Cox	:	Fixed the stupid socketpair bug.
17   *		Alan Cox	:	BSD compatibility fine tuning.
18   *		Alan Cox	:	Fixed a bug in connect when interrupted.
19   *		Alan Cox	:	Sorted out a proper draft version of
20   *					file descriptor passing hacked up from
21   *					Mike Shaver's work.
22   *		Marty Leisner	:	Fixes to fd passing
23   *		Nick Nevin	:	recvmsg bugfix.
24   *		Alan Cox	:	Started proper garbage collector
25   *		Heiko EiBfeldt	:	Missing verify_area check
26   *		Alan Cox	:	Started POSIXisms
27   *		Andreas Schwab	:	Replace inode by dentry for proper
28   *					reference counting
29   *		Kirk Petersen	:	Made this a module
30   *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31   *					Lots of bug fixes.
32   *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33   *					by the above two patches.
34   *	     Andrea Arcangeli	:	If possible we block in connect(2)
35   *					if the max backlog of the listen socket
36   *					has been reached. This won't break
37   *					old apps and it will avoid a huge number
38   *					of hashed socks (for unix_gc()
39   *					performance reasons).
40   *					Security fix that limits the max
41   *					number of socks to 2*max_files and
42   *					the number of skb queueable in the
43   *					dgram receiver.
44   *		Artur Skawina   :	Hash function optimizations
45   *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46   *	      Malcolm Beattie   :	Set peercred for socketpair
47   *	     Michal Ostrowski   :       Module initialization cleanup.
48   *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49   *	     				the core infrastructure is doing that
50   *	     				for all net proto families now (2.5.69+)
51   *
52   *
53   * Known differences from reference BSD that was tested:
54   *
55   *	[TO FIX]
56   *	ECONNREFUSED is not returned from one end of a connected() socket to the
57   *		other the moment one end closes.
58   *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
59   *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
60   *	[NOT TO FIX]
61   *	accept() returns a path name even if the connecting socket has closed
62   *		in the meantime (BSD loses the path and gives up).
63   *	accept() returns 0 length path for an unbound connector. BSD returns 16
64   *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65   *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66   *	BSD af_unix apparently has connect forgetting to block properly.
67   *		(need to check this with the POSIX spec in detail)
68   *
69   * Differences from 2.0.0-11-... (ANK)
70   *	Bug fixes and improvements.
71   *		- client shutdown killed server socket.
72   *		- removed all useless cli/sti pairs.
73   *
74   *	Semantic changes/extensions.
75   *		- generic control message passing.
76   *		- SCM_CREDENTIALS control message.
77   *		- "Abstract" (not FS based) socket bindings.
78   *		  Abstract names are sequences of bytes (not zero terminated)
79   *		  started by 0, so that this name space does not intersect
80   *		  with BSD names.
81   */
82  
83  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
84  
85  #include <linux/module.h>
86  #include <linux/kernel.h>
87  #include <linux/signal.h>
88  #include <linux/sched.h>
89  #include <linux/errno.h>
90  #include <linux/string.h>
91  #include <linux/stat.h>
92  #include <linux/dcache.h>
93  #include <linux/namei.h>
94  #include <linux/socket.h>
95  #include <linux/un.h>
96  #include <linux/fcntl.h>
97  #include <linux/termios.h>
98  #include <linux/sockios.h>
99  #include <linux/net.h>
100  #include <linux/in.h>
101  #include <linux/fs.h>
102  #include <linux/slab.h>
103  #include <asm/uaccess.h>
104  #include <linux/skbuff.h>
105  #include <linux/netdevice.h>
106  #include <net/net_namespace.h>
107  #include <net/sock.h>
108  #include <net/tcp_states.h>
109  #include <net/af_unix.h>
110  #include <linux/proc_fs.h>
111  #include <linux/seq_file.h>
112  #include <net/scm.h>
113  #include <linux/init.h>
114  #include <linux/poll.h>
115  #include <linux/rtnetlink.h>
116  #include <linux/mount.h>
117  #include <net/checksum.h>
118  #include <linux/security.h>
119  #include <linux/freezer.h>
120  
121  struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
122  EXPORT_SYMBOL_GPL(unix_socket_table);
123  DEFINE_SPINLOCK(unix_table_lock);
124  EXPORT_SYMBOL_GPL(unix_table_lock);
125  static atomic_long_t unix_nr_socks;
126  
127  
128  static struct hlist_head *unix_sockets_unbound(void *addr)
129  {
130  	unsigned long hash = (unsigned long)addr;
131  
132  	hash ^= hash >> 16;
133  	hash ^= hash >> 8;
134  	hash %= UNIX_HASH_SIZE;
135  	return &unix_socket_table[UNIX_HASH_SIZE + hash];
136  }
137  
/*
 * True when the hash stored in the socket's address is below UNIX_HASH_SIZE.
 * unix_bind() stores exactly UNIX_HASH_SIZE for filesystem names, so this
 * is false for them.  Dereferences ->addr, so the socket must be bound.
 */
138  #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
139  
/*
 * Security-ID plumbing between an scm_cookie and an skb.
 *
 * Note the direction: unix_get_secdata() stores scm->secid INTO the skb,
 * unix_set_secdata() loads it back FROM the skb into the cookie.
 * Without CONFIG_SECURITY_NETWORK both are empty stubs.
 */
#if defined(CONFIG_SECURITY_NETWORK)
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	/* skb <- scm */
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	/* scm <- skb */
	scm->secid = *UNIXSID(skb);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
}
#endif /* CONFIG_SECURITY_NETWORK */
157  
158  /*
159   *  SMP locking strategy:
160   *    hash table is protected with spinlock unix_table_lock
161   *    each socket state is protected by separate spin lock.
162   */
163  
164  static inline unsigned int unix_hash_fold(__wsum n)
165  {
166  	unsigned int hash = (__force unsigned int)csum_fold(n);
167  
168  	hash ^= hash>>8;
169  	return hash&(UNIX_HASH_SIZE-1);
170  }
171  
/*
 * Peer pointer of @sk.  This is an lvalue: code in this file both reads
 * it and assigns through it (e.g. "unix_peer(sk) = NULL").
 */
172  #define unix_peer(sk) (unix_sk(sk)->peer)
173  
/* Non-zero when @osk's peer pointer refers to @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
178  
179  static inline int unix_may_send(struct sock *sk, struct sock *osk)
180  {
181  	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
182  }
183  
184  static inline int unix_recvq_full(struct sock const *sk)
185  {
186  	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
187  }
188  
189  struct sock *unix_peer_get(struct sock *s)
190  {
191  	struct sock *peer;
192  
193  	unix_state_lock(s);
194  	peer = unix_peer(s);
195  	if (peer)
196  		sock_hold(peer);
197  	unix_state_unlock(s);
198  	return peer;
199  }
200  EXPORT_SYMBOL_GPL(unix_peer_get);
201  
202  static inline void unix_release_addr(struct unix_address *addr)
203  {
204  	if (atomic_dec_and_test(&addr->refcnt))
205  		kfree(addr);
206  }
207  
208  /*
209   *	Check unix socket name:
210   *		- should be not zero length.
211   *	        - if started by not zero, should be NULL terminated (FS object)
212   *		- if started by zero, it is abstract name.
213   */
214  
215  static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
216  {
217  	if (len <= sizeof(short) || len > sizeof(*sunaddr))
218  		return -EINVAL;
219  	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
220  		return -EINVAL;
221  	if (sunaddr->sun_path[0]) {
222  		/*
223  		 * This may look like an off by one error but it is a bit more
224  		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
225  		 * sun_path[108] doesn't as such exist.  However in kernel space
226  		 * we are guaranteed that it is a valid memory location in our
227  		 * kernel address buffer.
228  		 */
229  		((char *)sunaddr)[len] = 0;
230  		len = strlen(sunaddr->sun_path)+1+sizeof(short);
231  		return len;
232  	}
233  
234  	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
235  	return len;
236  }
237  
/*
 * Unhash @sk from whatever chain it is on.  No locking is done here;
 * unix_remove_socket() is the variant that takes unix_table_lock.
 */
static void __unix_remove_socket(struct sock *sk)
{
	(void)sk_del_node_init(sk);
}
242  
/*
 * Hash @sk onto @list.  Warns if @sk is still hashed somewhere.  No
 * locking is done here; unix_insert_socket() takes unix_table_lock.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));

	sk_add_node(sk, list);
}
248  
249  static inline void unix_remove_socket(struct sock *sk)
250  {
251  	spin_lock(&unix_table_lock);
252  	__unix_remove_socket(sk);
253  	spin_unlock(&unix_table_lock);
254  }
255  
256  static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
257  {
258  	spin_lock(&unix_table_lock);
259  	__unix_insert_socket(list, sk);
260  	spin_unlock(&unix_table_lock);
261  }
262  
263  static struct sock *__unix_find_socket_byname(struct net *net,
264  					      struct sockaddr_un *sunname,
265  					      int len, int type, unsigned int hash)
266  {
267  	struct sock *s;
268  
269  	sk_for_each(s, &unix_socket_table[hash ^ type]) {
270  		struct unix_sock *u = unix_sk(s);
271  
272  		if (!net_eq(sock_net(s), net))
273  			continue;
274  
275  		if (u->addr->len == len &&
276  		    !memcmp(u->addr->name, sunname, len))
277  			goto found;
278  	}
279  	s = NULL;
280  found:
281  	return s;
282  }
283  
284  static inline struct sock *unix_find_socket_byname(struct net *net,
285  						   struct sockaddr_un *sunname,
286  						   int len, int type,
287  						   unsigned int hash)
288  {
289  	struct sock *s;
290  
291  	spin_lock(&unix_table_lock);
292  	s = __unix_find_socket_byname(net, sunname, len, type, hash);
293  	if (s)
294  		sock_hold(s);
295  	spin_unlock(&unix_table_lock);
296  	return s;
297  }
298  
299  static struct sock *unix_find_socket_byinode(struct inode *i)
300  {
301  	struct sock *s;
302  
303  	spin_lock(&unix_table_lock);
304  	sk_for_each(s,
305  		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
306  		struct dentry *dentry = unix_sk(s)->path.dentry;
307  
308  		if (dentry && dentry->d_inode == i) {
309  			sock_hold(s);
310  			goto found;
311  		}
312  	}
313  	s = NULL;
314  found:
315  	spin_unlock(&unix_table_lock);
316  	return s;
317  }
318  
319  static inline int unix_writable(struct sock *sk)
320  {
321  	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
322  }
323  
324  static void unix_write_space(struct sock *sk)
325  {
326  	struct socket_wq *wq;
327  
328  	rcu_read_lock();
329  	if (unix_writable(sk)) {
330  		wq = rcu_dereference(sk->sk_wq);
331  		if (wq_has_sleeper(wq))
332  			wake_up_interruptible_sync_poll(&wq->wait,
333  				POLLOUT | POLLWRNORM | POLLWRBAND);
334  		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
335  	}
336  	rcu_read_unlock();
337  }
338  
339  /* When dgram socket disconnects (or changes its peer), we clear its receive
340   * queue of packets arrived from previous peer. First, it allows to do
341   * flow control based only on wmem_alloc; second, sk connected to peer
342   * may receive messages only from that peer. */
343  static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
344  {
345  	if (!skb_queue_empty(&sk->sk_receive_queue)) {
346  		skb_queue_purge(&sk->sk_receive_queue);
347  		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
348  
349  		/* If one link of bidirectional dgram pipe is disconnected,
350  		 * we signal error. Messages are lost. Do not make this,
351  		 * when peer was not connected to us.
352  		 */
353  		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
354  			other->sk_err = ECONNRESET;
355  			other->sk_error_report(other);
356  		}
357  	}
358  }
359  
360  static void unix_sock_destructor(struct sock *sk)
361  {
362  	struct unix_sock *u = unix_sk(sk);
363  
364  	skb_queue_purge(&sk->sk_receive_queue);
365  
366  	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
367  	WARN_ON(!sk_unhashed(sk));
368  	WARN_ON(sk->sk_socket);
369  	if (!sock_flag(sk, SOCK_DEAD)) {
370  		pr_info("Attempt to release alive unix socket: %p\n", sk);
371  		return;
372  	}
373  
374  	if (u->addr)
375  		unix_release_addr(u->addr);
376  
377  	atomic_long_dec(&unix_nr_socks);
378  	local_bh_disable();
379  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
380  	local_bh_enable();
381  #ifdef UNIX_REFCNT_DEBUG
382  	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
383  		atomic_long_read(&unix_nr_socks));
384  #endif
385  }
386  
387  static void unix_release_sock(struct sock *sk, int embrion)
388  {
389  	struct unix_sock *u = unix_sk(sk);
390  	struct path path;
391  	struct sock *skpair;
392  	struct sk_buff *skb;
393  	int state;
394  
395  	unix_remove_socket(sk);
396  
397  	/* Clear state */
398  	unix_state_lock(sk);
399  	sock_orphan(sk);
400  	sk->sk_shutdown = SHUTDOWN_MASK;
401  	path	     = u->path;
402  	u->path.dentry = NULL;
403  	u->path.mnt = NULL;
404  	state = sk->sk_state;
405  	sk->sk_state = TCP_CLOSE;
406  	unix_state_unlock(sk);
407  
408  	wake_up_interruptible_all(&u->peer_wait);
409  
410  	skpair = unix_peer(sk);
411  
412  	if (skpair != NULL) {
413  		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
414  			unix_state_lock(skpair);
415  			/* No more writes */
416  			skpair->sk_shutdown = SHUTDOWN_MASK;
417  			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
418  				skpair->sk_err = ECONNRESET;
419  			unix_state_unlock(skpair);
420  			skpair->sk_state_change(skpair);
421  			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
422  		}
423  		sock_put(skpair); /* It may now die */
424  		unix_peer(sk) = NULL;
425  	}
426  
427  	/* Try to flush out this socket. Throw out buffers at least */
428  
429  	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
430  		if (state == TCP_LISTEN)
431  			unix_release_sock(skb->sk, 1);
432  		/* passed fds are erased in the kfree_skb hook	      */
433  		kfree_skb(skb);
434  	}
435  
436  	if (path.dentry)
437  		path_put(&path);
438  
439  	sock_put(sk);
440  
441  	/* ---- Socket is dead now and most probably destroyed ---- */
442  
443  	/*
444  	 * Fixme: BSD difference: In BSD all sockets connected to us get
445  	 *	  ECONNRESET and we die on the spot. In Linux we behave
446  	 *	  like files and pipes do and wait for the last
447  	 *	  dereference.
448  	 *
449  	 * Can't we simply set sock->err?
450  	 *
451  	 *	  What the above comment does talk about? --ANK(980817)
452  	 */
453  
454  	if (unix_tot_inflight)
455  		unix_gc();		/* Garbage collect fds */
456  }
457  
458  static void init_peercred(struct sock *sk)
459  {
460  	put_pid(sk->sk_peer_pid);
461  	if (sk->sk_peer_cred)
462  		put_cred(sk->sk_peer_cred);
463  	sk->sk_peer_pid  = get_pid(task_tgid(current));
464  	sk->sk_peer_cred = get_current_cred();
465  }
466  
467  static void copy_peercred(struct sock *sk, struct sock *peersk)
468  {
469  	put_pid(sk->sk_peer_pid);
470  	if (sk->sk_peer_cred)
471  		put_cred(sk->sk_peer_cred);
472  	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
473  	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
474  }
475  
476  static int unix_listen(struct socket *sock, int backlog)
477  {
478  	int err;
479  	struct sock *sk = sock->sk;
480  	struct unix_sock *u = unix_sk(sk);
481  	struct pid *old_pid = NULL;
482  
483  	err = -EOPNOTSUPP;
484  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
485  		goto out;	/* Only stream/seqpacket sockets accept */
486  	err = -EINVAL;
487  	if (!u->addr)
488  		goto out;	/* No listens on an unbound socket */
489  	unix_state_lock(sk);
490  	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
491  		goto out_unlock;
492  	if (backlog > sk->sk_max_ack_backlog)
493  		wake_up_interruptible_all(&u->peer_wait);
494  	sk->sk_max_ack_backlog	= backlog;
495  	sk->sk_state		= TCP_LISTEN;
496  	/* set credentials so connect can copy them */
497  	init_peercred(sk);
498  	err = 0;
499  
500  out_unlock:
501  	unix_state_unlock(sk);
502  	put_pid(old_pid);
503  out:
504  	return err;
505  }
506  
507  static int unix_release(struct socket *);
508  static int unix_bind(struct socket *, struct sockaddr *, int);
509  static int unix_stream_connect(struct socket *, struct sockaddr *,
510  			       int addr_len, int flags);
511  static int unix_socketpair(struct socket *, struct socket *);
512  static int unix_accept(struct socket *, struct socket *, int);
513  static int unix_getname(struct socket *, struct sockaddr *, int *, int);
514  static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
515  static unsigned int unix_dgram_poll(struct file *, struct socket *,
516  				    poll_table *);
517  static int unix_ioctl(struct socket *, unsigned int, unsigned long);
518  static int unix_shutdown(struct socket *, int);
519  static int unix_stream_sendmsg(struct kiocb *, struct socket *,
520  			       struct msghdr *, size_t);
521  static int unix_stream_recvmsg(struct kiocb *, struct socket *,
522  			       struct msghdr *, size_t, int);
523  static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
524  			      struct msghdr *, size_t);
525  static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
526  			      struct msghdr *, size_t, int);
527  static int unix_dgram_connect(struct socket *, struct sockaddr *,
528  			      int, int);
529  static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
530  				  struct msghdr *, size_t);
531  static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
532  				  struct msghdr *, size_t, int);
533  
534  static int unix_set_peek_off(struct sock *sk, int val)
535  {
536  	struct unix_sock *u = unix_sk(sk);
537  
538  	if (mutex_lock_interruptible(&u->readlock))
539  		return -EINTR;
540  
541  	sk->sk_peek_off = val;
542  	mutex_unlock(&u->readlock);
543  
544  	return 0;
545  }
546  
547  
548  static const struct proto_ops unix_stream_ops = {
549  	.family =	PF_UNIX,
550  	.owner =	THIS_MODULE,
551  	.release =	unix_release,
552  	.bind =		unix_bind,
553  	.connect =	unix_stream_connect,
554  	.socketpair =	unix_socketpair,
555  	.accept =	unix_accept,
556  	.getname =	unix_getname,
557  	.poll =		unix_poll,
558  	.ioctl =	unix_ioctl,
559  	.listen =	unix_listen,
560  	.shutdown =	unix_shutdown,
561  	.setsockopt =	sock_no_setsockopt,
562  	.getsockopt =	sock_no_getsockopt,
563  	.sendmsg =	unix_stream_sendmsg,
564  	.recvmsg =	unix_stream_recvmsg,
565  	.mmap =		sock_no_mmap,
566  	.sendpage =	sock_no_sendpage,
567  	.set_peek_off =	unix_set_peek_off,
568  };
569  
570  static const struct proto_ops unix_dgram_ops = {
571  	.family =	PF_UNIX,
572  	.owner =	THIS_MODULE,
573  	.release =	unix_release,
574  	.bind =		unix_bind,
575  	.connect =	unix_dgram_connect,
576  	.socketpair =	unix_socketpair,
577  	.accept =	sock_no_accept,
578  	.getname =	unix_getname,
579  	.poll =		unix_dgram_poll,
580  	.ioctl =	unix_ioctl,
581  	.listen =	sock_no_listen,
582  	.shutdown =	unix_shutdown,
583  	.setsockopt =	sock_no_setsockopt,
584  	.getsockopt =	sock_no_getsockopt,
585  	.sendmsg =	unix_dgram_sendmsg,
586  	.recvmsg =	unix_dgram_recvmsg,
587  	.mmap =		sock_no_mmap,
588  	.sendpage =	sock_no_sendpage,
589  	.set_peek_off =	unix_set_peek_off,
590  };
591  
592  static const struct proto_ops unix_seqpacket_ops = {
593  	.family =	PF_UNIX,
594  	.owner =	THIS_MODULE,
595  	.release =	unix_release,
596  	.bind =		unix_bind,
597  	.connect =	unix_stream_connect,
598  	.socketpair =	unix_socketpair,
599  	.accept =	unix_accept,
600  	.getname =	unix_getname,
601  	.poll =		unix_dgram_poll,
602  	.ioctl =	unix_ioctl,
603  	.listen =	unix_listen,
604  	.shutdown =	unix_shutdown,
605  	.setsockopt =	sock_no_setsockopt,
606  	.getsockopt =	sock_no_getsockopt,
607  	.sendmsg =	unix_seqpacket_sendmsg,
608  	.recvmsg =	unix_seqpacket_recvmsg,
609  	.mmap =		sock_no_mmap,
610  	.sendpage =	sock_no_sendpage,
611  	.set_peek_off =	unix_set_peek_off,
612  };
613  
614  static struct proto unix_proto = {
615  	.name			= "UNIX",
616  	.owner			= THIS_MODULE,
617  	.obj_size		= sizeof(struct unix_sock),
618  };
619  
620  /*
621   * AF_UNIX sockets do not interact with hardware, hence they
622   * dont trigger interrupts - so it's safe for them to have
623   * bh-unsafe locking for their sk_receive_queue.lock. Split off
624   * this special lock-class by reinitializing the spinlock key:
625   */
626  static struct lock_class_key af_unix_sk_receive_queue_lock_key;
627  
628  static struct sock *unix_create1(struct net *net, struct socket *sock)
629  {
630  	struct sock *sk = NULL;
631  	struct unix_sock *u;
632  
633  	atomic_long_inc(&unix_nr_socks);
634  	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
635  		goto out;
636  
637  	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
638  	if (!sk)
639  		goto out;
640  
641  	sock_init_data(sock, sk);
642  	lockdep_set_class(&sk->sk_receive_queue.lock,
643  				&af_unix_sk_receive_queue_lock_key);
644  
645  	sk->sk_write_space	= unix_write_space;
646  	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
647  	sk->sk_destruct		= unix_sock_destructor;
648  	u	  = unix_sk(sk);
649  	u->path.dentry = NULL;
650  	u->path.mnt = NULL;
651  	spin_lock_init(&u->lock);
652  	atomic_long_set(&u->inflight, 0);
653  	INIT_LIST_HEAD(&u->link);
654  	mutex_init(&u->readlock); /* single task reading lock */
655  	init_waitqueue_head(&u->peer_wait);
656  	unix_insert_socket(unix_sockets_unbound(sk), sk);
657  out:
658  	if (sk == NULL)
659  		atomic_long_dec(&unix_nr_socks);
660  	else {
661  		local_bh_disable();
662  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
663  		local_bh_enable();
664  	}
665  	return sk;
666  }
667  
668  static int unix_create(struct net *net, struct socket *sock, int protocol,
669  		       int kern)
670  {
671  	if (protocol && protocol != PF_UNIX)
672  		return -EPROTONOSUPPORT;
673  
674  	sock->state = SS_UNCONNECTED;
675  
676  	switch (sock->type) {
677  	case SOCK_STREAM:
678  		sock->ops = &unix_stream_ops;
679  		break;
680  		/*
681  		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
682  		 *	nothing uses it.
683  		 */
684  	case SOCK_RAW:
685  		sock->type = SOCK_DGRAM;
686  	case SOCK_DGRAM:
687  		sock->ops = &unix_dgram_ops;
688  		break;
689  	case SOCK_SEQPACKET:
690  		sock->ops = &unix_seqpacket_ops;
691  		break;
692  	default:
693  		return -ESOCKTNOSUPPORT;
694  	}
695  
696  	return unix_create1(net, sock) ? 0 : -ENOMEM;
697  }
698  
699  static int unix_release(struct socket *sock)
700  {
701  	struct sock *sk = sock->sk;
702  
703  	if (!sk)
704  		return 0;
705  
706  	unix_release_sock(sk, 0);
707  	sock->sk = NULL;
708  
709  	return 0;
710  }
711  
712  static int unix_autobind(struct socket *sock)
713  {
714  	struct sock *sk = sock->sk;
715  	struct net *net = sock_net(sk);
716  	struct unix_sock *u = unix_sk(sk);
717  	static u32 ordernum = 1;
718  	struct unix_address *addr;
719  	int err;
720  	unsigned int retries = 0;
721  
722  	err = mutex_lock_interruptible(&u->readlock);
723  	if (err)
724  		return err;
725  
726  	err = 0;
727  	if (u->addr)
728  		goto out;
729  
730  	err = -ENOMEM;
731  	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
732  	if (!addr)
733  		goto out;
734  
735  	addr->name->sun_family = AF_UNIX;
736  	atomic_set(&addr->refcnt, 1);
737  
738  retry:
739  	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
740  	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
741  
742  	spin_lock(&unix_table_lock);
743  	ordernum = (ordernum+1)&0xFFFFF;
744  
745  	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
746  				      addr->hash)) {
747  		spin_unlock(&unix_table_lock);
748  		/*
749  		 * __unix_find_socket_byname() may take long time if many names
750  		 * are already in use.
751  		 */
752  		cond_resched();
753  		/* Give up if all names seems to be in use. */
754  		if (retries++ == 0xFFFFF) {
755  			err = -ENOSPC;
756  			kfree(addr);
757  			goto out;
758  		}
759  		goto retry;
760  	}
761  	addr->hash ^= sk->sk_type;
762  
763  	__unix_remove_socket(sk);
764  	u->addr = addr;
765  	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
766  	spin_unlock(&unix_table_lock);
767  	err = 0;
768  
769  out:	mutex_unlock(&u->readlock);
770  	return err;
771  }
772  
/*
 * Resolve a unix address to the sock bound to it.
 *
 * @net:     namespace to search (used for abstract names)
 * @sunname: address to look up
 * @len:     address length as returned by unix_mkname()
 * @type:    socket type the target must have
 * @hash:    name hash (used for abstract names only)
 * @error:   receives the errno on failure; left untouched on success
 *
 * Filesystem names are resolved through the VFS: the path must be
 * writable by the caller and be a socket inode with a sock of the right
 * type bound to it.  Abstract names are looked up in the hash table.
 *
 * Returns the sock with a reference held, or NULL with *error set.
 */
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	struct inode *inode;
	int err = 0;

	if (!sunname->sun_path[0]) {
		/* Abstract name. */
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (!u)
			goto fail;

		if (unix_sk(u)->path.dentry)
			touch_atime(&unix_sk(u)->path);
		return u;
	}

	/* Filesystem name. */
	err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	inode = path.dentry->d_inode;
	err = inode_permission(inode, MAY_WRITE);
	if (err)
		goto put_fail;

	err = -ECONNREFUSED;
	if (!S_ISSOCK(inode->i_mode))
		goto put_fail;

	u = unix_find_socket_byinode(inode);
	if (!u)
		goto put_fail;

	if (u->sk_type != type) {
		/* Right inode, wrong kind of socket. */
		path_put(&path);
		sock_put(u);
		err = -EPROTOTYPE;
		goto fail;
	}

	touch_atime(&path);
	path_put(&path);
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
827  
828  static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
829  {
830  	struct dentry *dentry;
831  	struct path path;
832  	int err = 0;
833  	/*
834  	 * Get the parent directory, calculate the hash for last
835  	 * component.
836  	 */
837  	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
838  	err = PTR_ERR(dentry);
839  	if (IS_ERR(dentry))
840  		return err;
841  
842  	/*
843  	 * All right, let's create it.
844  	 */
845  	err = security_path_mknod(&path, dentry, mode, 0);
846  	if (!err) {
847  		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
848  		if (!err) {
849  			res->mnt = mntget(path.mnt);
850  			res->dentry = dget(dentry);
851  		}
852  	}
853  	done_path_create(&path, dentry);
854  	return err;
855  }
856  
857  static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
858  {
859  	struct sock *sk = sock->sk;
860  	struct net *net = sock_net(sk);
861  	struct unix_sock *u = unix_sk(sk);
862  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
863  	char *sun_path = sunaddr->sun_path;
864  	int err;
865  	unsigned int hash;
866  	struct unix_address *addr;
867  	struct hlist_head *list;
868  
869  	err = -EINVAL;
870  	if (sunaddr->sun_family != AF_UNIX)
871  		goto out;
872  
873  	if (addr_len == sizeof(short)) {
874  		err = unix_autobind(sock);
875  		goto out;
876  	}
877  
878  	err = unix_mkname(sunaddr, addr_len, &hash);
879  	if (err < 0)
880  		goto out;
881  	addr_len = err;
882  
883  	err = mutex_lock_interruptible(&u->readlock);
884  	if (err)
885  		goto out;
886  
887  	err = -EINVAL;
888  	if (u->addr)
889  		goto out_up;
890  
891  	err = -ENOMEM;
892  	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
893  	if (!addr)
894  		goto out_up;
895  
896  	memcpy(addr->name, sunaddr, addr_len);
897  	addr->len = addr_len;
898  	addr->hash = hash ^ sk->sk_type;
899  	atomic_set(&addr->refcnt, 1);
900  
901  	if (sun_path[0]) {
902  		struct path path;
903  		umode_t mode = S_IFSOCK |
904  		       (SOCK_INODE(sock)->i_mode & ~current_umask());
905  		err = unix_mknod(sun_path, mode, &path);
906  		if (err) {
907  			if (err == -EEXIST)
908  				err = -EADDRINUSE;
909  			unix_release_addr(addr);
910  			goto out_up;
911  		}
912  		addr->hash = UNIX_HASH_SIZE;
913  		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
914  		spin_lock(&unix_table_lock);
915  		u->path = path;
916  		list = &unix_socket_table[hash];
917  	} else {
918  		spin_lock(&unix_table_lock);
919  		err = -EADDRINUSE;
920  		if (__unix_find_socket_byname(net, sunaddr, addr_len,
921  					      sk->sk_type, hash)) {
922  			unix_release_addr(addr);
923  			goto out_unlock;
924  		}
925  
926  		list = &unix_socket_table[addr->hash];
927  	}
928  
929  	err = 0;
930  	__unix_remove_socket(sk);
931  	u->addr = addr;
932  	__unix_insert_socket(list, sk);
933  
934  out_unlock:
935  	spin_unlock(&unix_table_lock);
936  out_up:
937  	mutex_unlock(&u->readlock);
938  out:
939  	return err;
940  }
941  
/*
 * Take the state locks of two socks without risking an ABBA deadlock:
 * the sock at the lower address is always locked first.  When @sk2 is
 * NULL or the same sock as @sk1, only @sk1 is locked.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first;
	struct sock *second;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
956  
/*
 * Undo unix_state_double_lock(): @sk1 is always unlocked, @sk2 only
 * when it is a distinct, non-NULL sock.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
966  
967  static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
968  			      int alen, int flags)
969  {
970  	struct sock *sk = sock->sk;
971  	struct net *net = sock_net(sk);
972  	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
973  	struct sock *other;
974  	unsigned int hash;
975  	int err;
976  
977  	if (addr->sa_family != AF_UNSPEC) {
978  		err = unix_mkname(sunaddr, alen, &hash);
979  		if (err < 0)
980  			goto out;
981  		alen = err;
982  
983  		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
984  		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
985  			goto out;
986  
987  restart:
988  		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
989  		if (!other)
990  			goto out;
991  
992  		unix_state_double_lock(sk, other);
993  
994  		/* Apparently VFS overslept socket death. Retry. */
995  		if (sock_flag(other, SOCK_DEAD)) {
996  			unix_state_double_unlock(sk, other);
997  			sock_put(other);
998  			goto restart;
999  		}
1000  
1001  		err = -EPERM;
1002  		if (!unix_may_send(sk, other))
1003  			goto out_unlock;
1004  
1005  		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1006  		if (err)
1007  			goto out_unlock;
1008  
1009  	} else {
1010  		/*
1011  		 *	1003.1g breaking connected state with AF_UNSPEC
1012  		 */
1013  		other = NULL;
1014  		unix_state_double_lock(sk, other);
1015  	}
1016  
1017  	/*
1018  	 * If it was connected, reconnect.
1019  	 */
1020  	if (unix_peer(sk)) {
1021  		struct sock *old_peer = unix_peer(sk);
1022  		unix_peer(sk) = other;
1023  		unix_state_double_unlock(sk, other);
1024  
1025  		if (other != old_peer)
1026  			unix_dgram_disconnected(sk, old_peer);
1027  		sock_put(old_peer);
1028  	} else {
1029  		unix_peer(sk) = other;
1030  		unix_state_double_unlock(sk, other);
1031  	}
1032  	return 0;
1033  
1034  out_unlock:
1035  	unix_state_double_unlock(sk, other);
1036  	sock_put(other);
1037  out:
1038  	return err;
1039  }
1040  
1041  static long unix_wait_for_peer(struct sock *other, long timeo)
1042  {
1043  	struct unix_sock *u = unix_sk(other);
1044  	int sched;
1045  	DEFINE_WAIT(wait);
1046  
1047  	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1048  
1049  	sched = !sock_flag(other, SOCK_DEAD) &&
1050  		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1051  		unix_recvq_full(other);
1052  
1053  	unix_state_unlock(other);
1054  
1055  	if (sched)
1056  		timeo = schedule_timeout(timeo);
1057  
1058  	finish_wait(&u->peer_wait, &wait);
1059  	return timeo;
1060  }
1061  
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * The sock for the far end of the connection (newsk) and the skb used to
 * hand it to the listener are allocated before any lock is taken.  On
 * success the skb is queued on the listener's receive queue, from where
 * unix_accept() picks up the new sock through skb->sk.
 *
 * Returns 0 on success or a negative errno.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct unix_sock *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	long timeo;
	int st;
	int err;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	/* A credential-passing socket without an address gets autobound. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sock);
		if (err != 0)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/*
	 * Grab all resources first.  Doing it once the state is locked
	 * would force us to recheck everything afterwards anyway.
	 */
	err = -ENOMEM;

	/* The sock that will represent the completed connection. */
	newsk = unix_create1(net, NULL);
	if (!newsk)
		goto out;

	/* The skb that gets queued on the listening sock. */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (!skb)
		goto out;

restart:
	/* Look up the listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch the peer's state. */
	unix_state_lock(other);

	/* The VFS apparently missed the socket's death: look it up again. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN ||
	    (other->sk_shutdown & RCV_SHUTDOWN))
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* unix_wait_for_peer() drops other's state lock. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/*
	 * Latch our own state.
	 *
	 * This is the tricky part: our state lock has to be taken while the
	 * peer's lock is still held, which could deadlock.  Connecting to
	 * ourselves and simultaneous connect attempts are ruled out by the
	 * state check: other is TCP_LISTEN, and if sk were TCP_LISTEN too we
	 * bail out before trying to take the lock.
	 *
	 * The state is checked again once the socket is locked.
	 */
	st = sk->sk_state;

	if (st == TCP_ESTABLISHED) {
		/* Already connected. */
		err = -EISCONN;
		goto out_unlock;
	}
	if (st != TCP_CLOSE) {
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* Nothing can fail from here on: fill in the new sock quickly. */

	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* The new sock inherits the listener's address and path. */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state = SS_CONNECTED;
	sk->sk_state = TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_unlock(sk);

	/* Hand the new connection to the listening sock. */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1236  
1237  static int unix_socketpair(struct socket *socka, struct socket *sockb)
1238  {
1239  	struct sock *ska = socka->sk, *skb = sockb->sk;
1240  
1241  	/* Join our sockets back to back */
1242  	sock_hold(ska);
1243  	sock_hold(skb);
1244  	unix_peer(ska) = skb;
1245  	unix_peer(skb) = ska;
1246  	init_peercred(ska);
1247  	init_peercred(skb);
1248  
1249  	if (ska->sk_type != SOCK_DGRAM) {
1250  		ska->sk_state = TCP_ESTABLISHED;
1251  		skb->sk_state = TCP_ESTABLISHED;
1252  		socka->state  = SS_CONNECTED;
1253  		sockb->state  = SS_CONNECTED;
1254  	}
1255  	return 0;
1256  }
1257  
1258  static void unix_sock_inherit_flags(const struct socket *old,
1259  				    struct socket *new)
1260  {
1261  	if (test_bit(SOCK_PASSCRED, &old->flags))
1262  		set_bit(SOCK_PASSCRED, &new->flags);
1263  	if (test_bit(SOCK_PASSSEC, &old->flags))
1264  		set_bit(SOCK_PASSSEC, &new->flags);
1265  }
1266  
1267  static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1268  {
1269  	struct sock *sk = sock->sk;
1270  	struct sock *tsk;
1271  	struct sk_buff *skb;
1272  	int err;
1273  
1274  	err = -EOPNOTSUPP;
1275  	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1276  		goto out;
1277  
1278  	err = -EINVAL;
1279  	if (sk->sk_state != TCP_LISTEN)
1280  		goto out;
1281  
1282  	/* If socket state is TCP_LISTEN it cannot change (for now...),
1283  	 * so that no locks are necessary.
1284  	 */
1285  
1286  	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1287  	if (!skb) {
1288  		/* This means receive shutdown. */
1289  		if (err == 0)
1290  			err = -EINVAL;
1291  		goto out;
1292  	}
1293  
1294  	tsk = skb->sk;
1295  	skb_free_datagram(sk, skb);
1296  	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1297  
1298  	/* attach accepted sock to socket */
1299  	unix_state_lock(tsk);
1300  	newsock->state = SS_CONNECTED;
1301  	unix_sock_inherit_flags(sock, newsock);
1302  	sock_graft(tsk, newsock);
1303  	unix_state_unlock(tsk);
1304  	return 0;
1305  
1306  out:
1307  	return err;
1308  }
1309  
1310  
1311  static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1312  {
1313  	struct sock *sk = sock->sk;
1314  	struct unix_sock *u;
1315  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1316  	int err = 0;
1317  
1318  	if (peer) {
1319  		sk = unix_peer_get(sk);
1320  
1321  		err = -ENOTCONN;
1322  		if (!sk)
1323  			goto out;
1324  		err = 0;
1325  	} else {
1326  		sock_hold(sk);
1327  	}
1328  
1329  	u = unix_sk(sk);
1330  	unix_state_lock(sk);
1331  	if (!u->addr) {
1332  		sunaddr->sun_family = AF_UNIX;
1333  		sunaddr->sun_path[0] = 0;
1334  		*uaddr_len = sizeof(short);
1335  	} else {
1336  		struct unix_address *addr = u->addr;
1337  
1338  		*uaddr_len = addr->len;
1339  		memcpy(sunaddr, addr->name, *uaddr_len);
1340  	}
1341  	unix_state_unlock(sk);
1342  	sock_put(sk);
1343  out:
1344  	return err;
1345  }
1346  
1347  static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1348  {
1349  	int i;
1350  
1351  	scm->fp = UNIXCB(skb).fp;
1352  	UNIXCB(skb).fp = NULL;
1353  
1354  	for (i = scm->fp->count-1; i >= 0; i--)
1355  		unix_notinflight(scm->fp->fp[i]);
1356  }
1357  
1358  static void unix_destruct_scm(struct sk_buff *skb)
1359  {
1360  	struct scm_cookie scm;
1361  	memset(&scm, 0, sizeof(scm));
1362  	scm.pid  = UNIXCB(skb).pid;
1363  	if (UNIXCB(skb).fp)
1364  		unix_detach_fds(&scm, skb);
1365  
1366  	/* Alas, it calls VFS */
1367  	/* So fscking what? fput() had been SMP-safe since the last Summer */
1368  	scm_destroy(&scm);
1369  	sock_wfree(skb);
1370  }
1371  
1372  #define MAX_RECURSION_LEVEL 4
1373  
1374  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1375  {
1376  	int i;
1377  	unsigned char max_level = 0;
1378  	int unix_sock_count = 0;
1379  
1380  	for (i = scm->fp->count - 1; i >= 0; i--) {
1381  		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1382  
1383  		if (sk) {
1384  			unix_sock_count++;
1385  			max_level = max(max_level,
1386  					unix_sk(sk)->recursion_level);
1387  		}
1388  	}
1389  	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1390  		return -ETOOMANYREFS;
1391  
1392  	/*
1393  	 * Need to duplicate file references for the sake of garbage
1394  	 * collection.  Otherwise a socket in the fps might become a
1395  	 * candidate for GC while the skb is not yet queued.
1396  	 */
1397  	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1398  	if (!UNIXCB(skb).fp)
1399  		return -ENOMEM;
1400  
1401  	if (unix_sock_count) {
1402  		for (i = scm->fp->count - 1; i >= 0; i--)
1403  			unix_inflight(scm->fp->fp[i]);
1404  	}
1405  	return max_level;
1406  }
1407  
1408  static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1409  {
1410  	int err = 0;
1411  
1412  	UNIXCB(skb).pid  = get_pid(scm->pid);
1413  	UNIXCB(skb).uid = scm->creds.uid;
1414  	UNIXCB(skb).gid = scm->creds.gid;
1415  	UNIXCB(skb).fp = NULL;
1416  	if (scm->fp && send_fds)
1417  		err = unix_attach_fds(scm, skb);
1418  
1419  	skb->destructor = unix_destruct_scm;
1420  	return err;
1421  }
1422  
1423  /*
1424   * Some apps rely on write() giving SCM_CREDENTIALS
1425   * We include credentials if source or destination socket
1426   * asserted SOCK_PASSCRED.
1427   */
1428  static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1429  			    const struct sock *other)
1430  {
1431  	if (UNIXCB(skb).pid)
1432  		return;
1433  	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1434  	    !other->sk_socket ||
1435  	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1436  		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1437  		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1438  	}
1439  }
1440  
1441  /*
1442   *	Send AF_UNIX data.
1443   */
1444  
1445  static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1446  			      struct msghdr *msg, size_t len)
1447  {
1448  	struct sock *sk = sock->sk;
1449  	struct net *net = sock_net(sk);
1450  	struct unix_sock *u = unix_sk(sk);
1451  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1452  	struct sock *other = NULL;
1453  	int namelen = 0; /* fake GCC */
1454  	int err;
1455  	unsigned int hash;
1456  	struct sk_buff *skb;
1457  	long timeo;
1458  	struct scm_cookie scm;
1459  	int max_level;
1460  	int data_len = 0;
1461  
1462  	wait_for_unix_gc();
1463  	err = scm_send(sock, msg, &scm, false);
1464  	if (err < 0)
1465  		return err;
1466  
1467  	err = -EOPNOTSUPP;
1468  	if (msg->msg_flags&MSG_OOB)
1469  		goto out;
1470  
1471  	if (msg->msg_namelen) {
1472  		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1473  		if (err < 0)
1474  			goto out;
1475  		namelen = err;
1476  	} else {
1477  		sunaddr = NULL;
1478  		err = -ENOTCONN;
1479  		other = unix_peer_get(sk);
1480  		if (!other)
1481  			goto out;
1482  	}
1483  
1484  	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1485  	    && (err = unix_autobind(sock)) != 0)
1486  		goto out;
1487  
1488  	err = -EMSGSIZE;
1489  	if (len > sk->sk_sndbuf - 32)
1490  		goto out;
1491  
1492  	if (len > SKB_MAX_ALLOC) {
1493  		data_len = min_t(size_t,
1494  				 len - SKB_MAX_ALLOC,
1495  				 MAX_SKB_FRAGS * PAGE_SIZE);
1496  		data_len = PAGE_ALIGN(data_len);
1497  
1498  		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1499  	}
1500  
1501  	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1502  				   msg->msg_flags & MSG_DONTWAIT, &err,
1503  				   PAGE_ALLOC_COSTLY_ORDER);
1504  	if (skb == NULL)
1505  		goto out;
1506  
1507  	err = unix_scm_to_skb(&scm, skb, true);
1508  	if (err < 0)
1509  		goto out_free;
1510  	max_level = err + 1;
1511  	unix_get_secdata(&scm, skb);
1512  
1513  	skb_put(skb, len - data_len);
1514  	skb->data_len = data_len;
1515  	skb->len = len;
1516  	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1517  	if (err)
1518  		goto out_free;
1519  
1520  	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1521  
1522  restart:
1523  	if (!other) {
1524  		err = -ECONNRESET;
1525  		if (sunaddr == NULL)
1526  			goto out_free;
1527  
1528  		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1529  					hash, &err);
1530  		if (other == NULL)
1531  			goto out_free;
1532  	}
1533  
1534  	if (sk_filter(other, skb) < 0) {
1535  		/* Toss the packet but do not return any error to the sender */
1536  		err = len;
1537  		goto out_free;
1538  	}
1539  
1540  	unix_state_lock(other);
1541  	err = -EPERM;
1542  	if (!unix_may_send(sk, other))
1543  		goto out_unlock;
1544  
1545  	if (sock_flag(other, SOCK_DEAD)) {
1546  		/*
1547  		 *	Check with 1003.1g - what should
1548  		 *	datagram error
1549  		 */
1550  		unix_state_unlock(other);
1551  		sock_put(other);
1552  
1553  		err = 0;
1554  		unix_state_lock(sk);
1555  		if (unix_peer(sk) == other) {
1556  			unix_peer(sk) = NULL;
1557  			unix_state_unlock(sk);
1558  
1559  			unix_dgram_disconnected(sk, other);
1560  			sock_put(other);
1561  			err = -ECONNREFUSED;
1562  		} else {
1563  			unix_state_unlock(sk);
1564  		}
1565  
1566  		other = NULL;
1567  		if (err)
1568  			goto out_free;
1569  		goto restart;
1570  	}
1571  
1572  	err = -EPIPE;
1573  	if (other->sk_shutdown & RCV_SHUTDOWN)
1574  		goto out_unlock;
1575  
1576  	if (sk->sk_type != SOCK_SEQPACKET) {
1577  		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1578  		if (err)
1579  			goto out_unlock;
1580  	}
1581  
1582  	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1583  		if (!timeo) {
1584  			err = -EAGAIN;
1585  			goto out_unlock;
1586  		}
1587  
1588  		timeo = unix_wait_for_peer(other, timeo);
1589  
1590  		err = sock_intr_errno(timeo);
1591  		if (signal_pending(current))
1592  			goto out_free;
1593  
1594  		goto restart;
1595  	}
1596  
1597  	if (sock_flag(other, SOCK_RCVTSTAMP))
1598  		__net_timestamp(skb);
1599  	maybe_add_creds(skb, sock, other);
1600  	skb_queue_tail(&other->sk_receive_queue, skb);
1601  	if (max_level > unix_sk(other)->recursion_level)
1602  		unix_sk(other)->recursion_level = max_level;
1603  	unix_state_unlock(other);
1604  	other->sk_data_ready(other);
1605  	sock_put(other);
1606  	scm_destroy(&scm);
1607  	return len;
1608  
1609  out_unlock:
1610  	unix_state_unlock(other);
1611  out_free:
1612  	kfree_skb(skb);
1613  out:
1614  	if (other)
1615  		sock_put(other);
1616  	scm_destroy(&scm);
1617  	return err;
1618  }
1619  
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
1623  #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1624  
1625  static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1626  			       struct msghdr *msg, size_t len)
1627  {
1628  	struct sock *sk = sock->sk;
1629  	struct sock *other = NULL;
1630  	int err, size;
1631  	struct sk_buff *skb;
1632  	int sent = 0;
1633  	struct scm_cookie scm;
1634  	bool fds_sent = false;
1635  	int max_level;
1636  	int data_len;
1637  
1638  	wait_for_unix_gc();
1639  	err = scm_send(sock, msg, &scm, false);
1640  	if (err < 0)
1641  		return err;
1642  
1643  	err = -EOPNOTSUPP;
1644  	if (msg->msg_flags&MSG_OOB)
1645  		goto out_err;
1646  
1647  	if (msg->msg_namelen) {
1648  		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1649  		goto out_err;
1650  	} else {
1651  		err = -ENOTCONN;
1652  		other = unix_peer(sk);
1653  		if (!other)
1654  			goto out_err;
1655  	}
1656  
1657  	if (sk->sk_shutdown & SEND_SHUTDOWN)
1658  		goto pipe_err;
1659  
1660  	while (sent < len) {
1661  		size = len - sent;
1662  
1663  		/* Keep two messages in the pipe so it schedules better */
1664  		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1665  
1666  		/* allow fallback to order-0 allocations */
1667  		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1668  
1669  		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1670  
1671  		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1672  
1673  		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1674  					   msg->msg_flags & MSG_DONTWAIT, &err,
1675  					   get_order(UNIX_SKB_FRAGS_SZ));
1676  		if (!skb)
1677  			goto out_err;
1678  
1679  		/* Only send the fds in the first buffer */
1680  		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1681  		if (err < 0) {
1682  			kfree_skb(skb);
1683  			goto out_err;
1684  		}
1685  		max_level = err + 1;
1686  		fds_sent = true;
1687  
1688  		skb_put(skb, size - data_len);
1689  		skb->data_len = data_len;
1690  		skb->len = size;
1691  		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1692  		if (err) {
1693  			kfree_skb(skb);
1694  			goto out_err;
1695  		}
1696  
1697  		unix_state_lock(other);
1698  
1699  		if (sock_flag(other, SOCK_DEAD) ||
1700  		    (other->sk_shutdown & RCV_SHUTDOWN))
1701  			goto pipe_err_free;
1702  
1703  		maybe_add_creds(skb, sock, other);
1704  		skb_queue_tail(&other->sk_receive_queue, skb);
1705  		if (max_level > unix_sk(other)->recursion_level)
1706  			unix_sk(other)->recursion_level = max_level;
1707  		unix_state_unlock(other);
1708  		other->sk_data_ready(other);
1709  		sent += size;
1710  	}
1711  
1712  	scm_destroy(&scm);
1713  
1714  	return sent;
1715  
1716  pipe_err_free:
1717  	unix_state_unlock(other);
1718  	kfree_skb(skb);
1719  pipe_err:
1720  	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1721  		send_sig(SIGPIPE, current, 0);
1722  	err = -EPIPE;
1723  out_err:
1724  	scm_destroy(&scm);
1725  	return sent ? : err;
1726  }
1727  
1728  static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1729  				  struct msghdr *msg, size_t len)
1730  {
1731  	int err;
1732  	struct sock *sk = sock->sk;
1733  
1734  	err = sock_error(sk);
1735  	if (err)
1736  		return err;
1737  
1738  	if (sk->sk_state != TCP_ESTABLISHED)
1739  		return -ENOTCONN;
1740  
1741  	if (msg->msg_namelen)
1742  		msg->msg_namelen = 0;
1743  
1744  	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1745  }
1746  
1747  static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1748  			      struct msghdr *msg, size_t size,
1749  			      int flags)
1750  {
1751  	struct sock *sk = sock->sk;
1752  
1753  	if (sk->sk_state != TCP_ESTABLISHED)
1754  		return -ENOTCONN;
1755  
1756  	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1757  }
1758  
1759  static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1760  {
1761  	struct unix_sock *u = unix_sk(sk);
1762  
1763  	if (u->addr) {
1764  		msg->msg_namelen = u->addr->len;
1765  		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1766  	}
1767  }
1768  
1769  static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1770  			      struct msghdr *msg, size_t size,
1771  			      int flags)
1772  {
1773  	struct scm_cookie scm;
1774  	struct sock *sk = sock->sk;
1775  	struct unix_sock *u = unix_sk(sk);
1776  	int noblock = flags & MSG_DONTWAIT;
1777  	struct sk_buff *skb;
1778  	int err;
1779  	int peeked, skip;
1780  
1781  	err = -EOPNOTSUPP;
1782  	if (flags&MSG_OOB)
1783  		goto out;
1784  
1785  	err = mutex_lock_interruptible(&u->readlock);
1786  	if (unlikely(err)) {
1787  		/* recvmsg() in non blocking mode is supposed to return -EAGAIN
1788  		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1789  		 */
1790  		err = noblock ? -EAGAIN : -ERESTARTSYS;
1791  		goto out;
1792  	}
1793  
1794  	skip = sk_peek_offset(sk, flags);
1795  
1796  	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1797  	if (!skb) {
1798  		unix_state_lock(sk);
1799  		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1800  		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1801  		    (sk->sk_shutdown & RCV_SHUTDOWN))
1802  			err = 0;
1803  		unix_state_unlock(sk);
1804  		goto out_unlock;
1805  	}
1806  
1807  	wake_up_interruptible_sync_poll(&u->peer_wait,
1808  					POLLOUT | POLLWRNORM | POLLWRBAND);
1809  
1810  	if (msg->msg_name)
1811  		unix_copy_addr(msg, skb->sk);
1812  
1813  	if (size > skb->len - skip)
1814  		size = skb->len - skip;
1815  	else if (size < skb->len - skip)
1816  		msg->msg_flags |= MSG_TRUNC;
1817  
1818  	err = skb_copy_datagram_msg(skb, skip, msg, size);
1819  	if (err)
1820  		goto out_free;
1821  
1822  	if (sock_flag(sk, SOCK_RCVTSTAMP))
1823  		__sock_recv_timestamp(msg, sk, skb);
1824  
1825  	memset(&scm, 0, sizeof(scm));
1826  
1827  	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1828  	unix_set_secdata(&scm, skb);
1829  
1830  	if (!(flags & MSG_PEEK)) {
1831  		if (UNIXCB(skb).fp)
1832  			unix_detach_fds(&scm, skb);
1833  
1834  		sk_peek_offset_bwd(sk, skb->len);
1835  	} else {
1836  		/* It is questionable: on PEEK we could:
1837  		   - do not return fds - good, but too simple 8)
1838  		   - return fds, and do not return them on read (old strategy,
1839  		     apparently wrong)
1840  		   - clone fds (I chose it for now, it is the most universal
1841  		     solution)
1842  
1843  		   POSIX 1003.1g does not actually define this clearly
1844  		   at all. POSIX 1003.1g doesn't define a lot of things
1845  		   clearly however!
1846  
1847  		*/
1848  
1849  		sk_peek_offset_fwd(sk, size);
1850  
1851  		if (UNIXCB(skb).fp)
1852  			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
1853  	}
1854  	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1855  
1856  	scm_recv(sock, msg, &scm, flags);
1857  
1858  out_free:
1859  	skb_free_datagram(sk, skb);
1860  out_unlock:
1861  	mutex_unlock(&u->readlock);
1862  out:
1863  	return err;
1864  }
1865  
1866  /*
1867   *	Sleep until more data has arrived. But check for races..
1868   */
1869  static long unix_stream_data_wait(struct sock *sk, long timeo,
1870  				  struct sk_buff *last)
1871  {
1872  	DEFINE_WAIT(wait);
1873  
1874  	unix_state_lock(sk);
1875  
1876  	for (;;) {
1877  		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1878  
1879  		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1880  		    sk->sk_err ||
1881  		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1882  		    signal_pending(current) ||
1883  		    !timeo)
1884  			break;
1885  
1886  		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1887  		unix_state_unlock(sk);
1888  		timeo = freezable_schedule_timeout(timeo);
1889  		unix_state_lock(sk);
1890  		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1891  	}
1892  
1893  	finish_wait(sk_sleep(sk), &wait);
1894  	unix_state_unlock(sk);
1895  	return timeo;
1896  }
1897  
1898  static unsigned int unix_skb_len(const struct sk_buff *skb)
1899  {
1900  	return skb->len - UNIXCB(skb).consumed;
1901  }
1902  
1903  static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1904  			       struct msghdr *msg, size_t size,
1905  			       int flags)
1906  {
1907  	struct scm_cookie scm;
1908  	struct sock *sk = sock->sk;
1909  	struct unix_sock *u = unix_sk(sk);
1910  	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1911  	int copied = 0;
1912  	int noblock = flags & MSG_DONTWAIT;
1913  	int check_creds = 0;
1914  	int target;
1915  	int err = 0;
1916  	long timeo;
1917  	int skip;
1918  
1919  	err = -EINVAL;
1920  	if (sk->sk_state != TCP_ESTABLISHED)
1921  		goto out;
1922  
1923  	err = -EOPNOTSUPP;
1924  	if (flags&MSG_OOB)
1925  		goto out;
1926  
1927  	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1928  	timeo = sock_rcvtimeo(sk, noblock);
1929  
1930  	/* Lock the socket to prevent queue disordering
1931  	 * while sleeps in memcpy_tomsg
1932  	 */
1933  
1934  	memset(&scm, 0, sizeof(scm));
1935  
1936  	err = mutex_lock_interruptible(&u->readlock);
1937  	if (unlikely(err)) {
1938  		/* recvmsg() in non blocking mode is supposed to return -EAGAIN
1939  		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1940  		 */
1941  		err = noblock ? -EAGAIN : -ERESTARTSYS;
1942  		goto out;
1943  	}
1944  
1945  	do {
1946  		int chunk;
1947  		struct sk_buff *skb, *last;
1948  
1949  		unix_state_lock(sk);
1950  		last = skb = skb_peek(&sk->sk_receive_queue);
1951  again:
1952  		if (skb == NULL) {
1953  			unix_sk(sk)->recursion_level = 0;
1954  			if (copied >= target)
1955  				goto unlock;
1956  
1957  			/*
1958  			 *	POSIX 1003.1g mandates this order.
1959  			 */
1960  
1961  			err = sock_error(sk);
1962  			if (err)
1963  				goto unlock;
1964  			if (sk->sk_shutdown & RCV_SHUTDOWN)
1965  				goto unlock;
1966  
1967  			unix_state_unlock(sk);
1968  			err = -EAGAIN;
1969  			if (!timeo)
1970  				break;
1971  			mutex_unlock(&u->readlock);
1972  
1973  			timeo = unix_stream_data_wait(sk, timeo, last);
1974  
1975  			if (signal_pending(current)
1976  			    ||  mutex_lock_interruptible(&u->readlock)) {
1977  				err = sock_intr_errno(timeo);
1978  				goto out;
1979  			}
1980  
1981  			continue;
1982   unlock:
1983  			unix_state_unlock(sk);
1984  			break;
1985  		}
1986  
1987  		skip = sk_peek_offset(sk, flags);
1988  		while (skip >= unix_skb_len(skb)) {
1989  			skip -= unix_skb_len(skb);
1990  			last = skb;
1991  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1992  			if (!skb)
1993  				goto again;
1994  		}
1995  
1996  		unix_state_unlock(sk);
1997  
1998  		if (check_creds) {
1999  			/* Never glue messages from different writers */
2000  			if ((UNIXCB(skb).pid  != scm.pid) ||
2001  			    !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
2002  			    !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
2003  				break;
2004  		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2005  			/* Copy credentials */
2006  			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2007  			check_creds = 1;
2008  		}
2009  
2010  		/* Copy address just once */
2011  		if (sunaddr) {
2012  			unix_copy_addr(msg, skb->sk);
2013  			sunaddr = NULL;
2014  		}
2015  
2016  		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2017  		if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2018  					  msg, chunk)) {
2019  			if (copied == 0)
2020  				copied = -EFAULT;
2021  			break;
2022  		}
2023  		copied += chunk;
2024  		size -= chunk;
2025  
2026  		/* Mark read part of skb as used */
2027  		if (!(flags & MSG_PEEK)) {
2028  			UNIXCB(skb).consumed += chunk;
2029  
2030  			sk_peek_offset_bwd(sk, chunk);
2031  
2032  			if (UNIXCB(skb).fp)
2033  				unix_detach_fds(&scm, skb);
2034  
2035  			if (unix_skb_len(skb))
2036  				break;
2037  
2038  			skb_unlink(skb, &sk->sk_receive_queue);
2039  			consume_skb(skb);
2040  
2041  			if (scm.fp)
2042  				break;
2043  		} else {
2044  			/* It is questionable, see note in unix_dgram_recvmsg.
2045  			 */
2046  			if (UNIXCB(skb).fp)
2047  				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2048  
2049  			sk_peek_offset_fwd(sk, chunk);
2050  
2051  			break;
2052  		}
2053  	} while (size);
2054  
2055  	mutex_unlock(&u->readlock);
2056  	scm_recv(sock, msg, &scm, flags);
2057  out:
2058  	return copied ? : err;
2059  }
2060  
2061  static int unix_shutdown(struct socket *sock, int mode)
2062  {
2063  	struct sock *sk = sock->sk;
2064  	struct sock *other;
2065  
2066  	if (mode < SHUT_RD || mode > SHUT_RDWR)
2067  		return -EINVAL;
2068  	/* This maps:
2069  	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2070  	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2071  	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2072  	 */
2073  	++mode;
2074  
2075  	unix_state_lock(sk);
2076  	sk->sk_shutdown |= mode;
2077  	other = unix_peer(sk);
2078  	if (other)
2079  		sock_hold(other);
2080  	unix_state_unlock(sk);
2081  	sk->sk_state_change(sk);
2082  
2083  	if (other &&
2084  		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2085  
2086  		int peer_mode = 0;
2087  
2088  		if (mode&RCV_SHUTDOWN)
2089  			peer_mode |= SEND_SHUTDOWN;
2090  		if (mode&SEND_SHUTDOWN)
2091  			peer_mode |= RCV_SHUTDOWN;
2092  		unix_state_lock(other);
2093  		other->sk_shutdown |= peer_mode;
2094  		unix_state_unlock(other);
2095  		other->sk_state_change(other);
2096  		if (peer_mode == SHUTDOWN_MASK)
2097  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2098  		else if (peer_mode & RCV_SHUTDOWN)
2099  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2100  	}
2101  	if (other)
2102  		sock_put(other);
2103  
2104  	return 0;
2105  }
2106  
2107  long unix_inq_len(struct sock *sk)
2108  {
2109  	struct sk_buff *skb;
2110  	long amount = 0;
2111  
2112  	if (sk->sk_state == TCP_LISTEN)
2113  		return -EINVAL;
2114  
2115  	spin_lock(&sk->sk_receive_queue.lock);
2116  	if (sk->sk_type == SOCK_STREAM ||
2117  	    sk->sk_type == SOCK_SEQPACKET) {
2118  		skb_queue_walk(&sk->sk_receive_queue, skb)
2119  			amount += unix_skb_len(skb);
2120  	} else {
2121  		skb = skb_peek(&sk->sk_receive_queue);
2122  		if (skb)
2123  			amount = skb->len;
2124  	}
2125  	spin_unlock(&sk->sk_receive_queue.lock);
2126  
2127  	return amount;
2128  }
2129  EXPORT_SYMBOL_GPL(unix_inq_len);
2130  
/* Report the value of sk_wmem_alloc_get() for @sk. */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
2135  EXPORT_SYMBOL_GPL(unix_outq_len);
2136  
2137  static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2138  {
2139  	struct sock *sk = sock->sk;
2140  	long amount = 0;
2141  	int err;
2142  
2143  	switch (cmd) {
2144  	case SIOCOUTQ:
2145  		amount = unix_outq_len(sk);
2146  		err = put_user(amount, (int __user *)arg);
2147  		break;
2148  	case SIOCINQ:
2149  		amount = unix_inq_len(sk);
2150  		if (amount < 0)
2151  			err = amount;
2152  		else
2153  			err = put_user(amount, (int __user *)arg);
2154  		break;
2155  	default:
2156  		err = -ENOIOCTLCMD;
2157  		break;
2158  	}
2159  	return err;
2160  }
2161  
2162  static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2163  {
2164  	struct sock *sk = sock->sk;
2165  	unsigned int mask;
2166  
2167  	sock_poll_wait(file, sk_sleep(sk), wait);
2168  	mask = 0;
2169  
2170  	/* exceptional events? */
2171  	if (sk->sk_err)
2172  		mask |= POLLERR;
2173  	if (sk->sk_shutdown == SHUTDOWN_MASK)
2174  		mask |= POLLHUP;
2175  	if (sk->sk_shutdown & RCV_SHUTDOWN)
2176  		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2177  
2178  	/* readable? */
2179  	if (!skb_queue_empty(&sk->sk_receive_queue))
2180  		mask |= POLLIN | POLLRDNORM;
2181  
2182  	/* Connection-based need to check for termination and startup */
2183  	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2184  	    sk->sk_state == TCP_CLOSE)
2185  		mask |= POLLHUP;
2186  
2187  	/*
2188  	 * we set writable also when the other side has shut down the
2189  	 * connection. This prevents stuck sockets.
2190  	 */
2191  	if (unix_writable(sk))
2192  		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2193  
2194  	return mask;
2195  }
2196  
2197  static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2198  				    poll_table *wait)
2199  {
2200  	struct sock *sk = sock->sk, *other;
2201  	unsigned int mask, writable;
2202  
2203  	sock_poll_wait(file, sk_sleep(sk), wait);
2204  	mask = 0;
2205  
2206  	/* exceptional events? */
2207  	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2208  		mask |= POLLERR |
2209  			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2210  
2211  	if (sk->sk_shutdown & RCV_SHUTDOWN)
2212  		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2213  	if (sk->sk_shutdown == SHUTDOWN_MASK)
2214  		mask |= POLLHUP;
2215  
2216  	/* readable? */
2217  	if (!skb_queue_empty(&sk->sk_receive_queue))
2218  		mask |= POLLIN | POLLRDNORM;
2219  
2220  	/* Connection-based need to check for termination and startup */
2221  	if (sk->sk_type == SOCK_SEQPACKET) {
2222  		if (sk->sk_state == TCP_CLOSE)
2223  			mask |= POLLHUP;
2224  		/* connection hasn't started yet? */
2225  		if (sk->sk_state == TCP_SYN_SENT)
2226  			return mask;
2227  	}
2228  
2229  	/* No write status requested, avoid expensive OUT tests. */
2230  	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2231  		return mask;
2232  
2233  	writable = unix_writable(sk);
2234  	other = unix_peer_get(sk);
2235  	if (other) {
2236  		if (unix_peer(other) != sk) {
2237  			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2238  			if (unix_recvq_full(other))
2239  				writable = 0;
2240  		}
2241  		sock_put(other);
2242  	}
2243  
2244  	if (writable)
2245  		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2246  	else
2247  		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2248  
2249  	return mask;
2250  }
2251  
2252  #ifdef CONFIG_PROC_FS
2253  
/*
 * A seq_file position packs two fields into one value: the hash bucket
 * index in the high bits and the offset of the socket within that bucket
 * in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - UNIX_HASH_BITS - 2)

/* Bucket index stored in the high bits of position @x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* In-bucket offset stored in the low BUCKET_SPACE bits of position @x. */
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1L))
/* Combine bucket @b and offset @o into a single position value. */
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
2259  
2260  static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2261  {
2262  	unsigned long offset = get_offset(*pos);
2263  	unsigned long bucket = get_bucket(*pos);
2264  	struct sock *sk;
2265  	unsigned long count = 0;
2266  
2267  	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2268  		if (sock_net(sk) != seq_file_net(seq))
2269  			continue;
2270  		if (++count == offset)
2271  			break;
2272  	}
2273  
2274  	return sk;
2275  }
2276  
2277  static struct sock *unix_next_socket(struct seq_file *seq,
2278  				     struct sock *sk,
2279  				     loff_t *pos)
2280  {
2281  	unsigned long bucket;
2282  
2283  	while (sk > (struct sock *)SEQ_START_TOKEN) {
2284  		sk = sk_next(sk);
2285  		if (!sk)
2286  			goto next_bucket;
2287  		if (sock_net(sk) == seq_file_net(seq))
2288  			return sk;
2289  	}
2290  
2291  	do {
2292  		sk = unix_from_bucket(seq, pos);
2293  		if (sk)
2294  			return sk;
2295  
2296  next_bucket:
2297  		bucket = get_bucket(*pos) + 1;
2298  		*pos = set_bucket_offset(bucket, 1);
2299  	} while (bucket < ARRAY_SIZE(unix_socket_table));
2300  
2301  	return NULL;
2302  }
2303  
2304  static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2305  	__acquires(unix_table_lock)
2306  {
2307  	spin_lock(&unix_table_lock);
2308  
2309  	if (!*pos)
2310  		return SEQ_START_TOKEN;
2311  
2312  	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2313  		return NULL;
2314  
2315  	return unix_next_socket(seq, NULL, pos);
2316  }
2317  
2318  static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2319  {
2320  	++*pos;
2321  	return unix_next_socket(seq, v, pos);
2322  }
2323  
2324  static void unix_seq_stop(struct seq_file *seq, void *v)
2325  	__releases(unix_table_lock)
2326  {
2327  	spin_unlock(&unix_table_lock);
2328  }
2329  
2330  static int unix_seq_show(struct seq_file *seq, void *v)
2331  {
2332  
2333  	if (v == SEQ_START_TOKEN)
2334  		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2335  			 "Inode Path\n");
2336  	else {
2337  		struct sock *s = v;
2338  		struct unix_sock *u = unix_sk(s);
2339  		unix_state_lock(s);
2340  
2341  		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2342  			s,
2343  			atomic_read(&s->sk_refcnt),
2344  			0,
2345  			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2346  			s->sk_type,
2347  			s->sk_socket ?
2348  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2349  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2350  			sock_i_ino(s));
2351  
2352  		if (u->addr) {
2353  			int i, len;
2354  			seq_putc(seq, ' ');
2355  
2356  			i = 0;
2357  			len = u->addr->len - sizeof(short);
2358  			if (!UNIX_ABSTRACT(s))
2359  				len--;
2360  			else {
2361  				seq_putc(seq, '@');
2362  				i++;
2363  			}
2364  			for ( ; i < len; i++)
2365  				seq_putc(seq, u->addr->name->sun_path[i]);
2366  		}
2367  		unix_state_unlock(s);
2368  		seq_putc(seq, '\n');
2369  	}
2370  
2371  	return 0;
2372  }
2373  
2374  static const struct seq_operations unix_seq_ops = {
2375  	.start  = unix_seq_start,
2376  	.next   = unix_seq_next,
2377  	.stop   = unix_seq_stop,
2378  	.show   = unix_seq_show,
2379  };
2380  
2381  static int unix_seq_open(struct inode *inode, struct file *file)
2382  {
2383  	return seq_open_net(inode, file, &unix_seq_ops,
2384  			    sizeof(struct seq_net_private));
2385  }
2386  
2387  static const struct file_operations unix_seq_fops = {
2388  	.owner		= THIS_MODULE,
2389  	.open		= unix_seq_open,
2390  	.read		= seq_read,
2391  	.llseek		= seq_lseek,
2392  	.release	= seq_release_net,
2393  };
2394  
2395  #endif
2396  
2397  static const struct net_proto_family unix_family_ops = {
2398  	.family = PF_UNIX,
2399  	.create = unix_create,
2400  	.owner	= THIS_MODULE,
2401  };
2402  
2403  
2404  static int __net_init unix_net_init(struct net *net)
2405  {
2406  	int error = -ENOMEM;
2407  
2408  	net->unx.sysctl_max_dgram_qlen = 10;
2409  	if (unix_sysctl_register(net))
2410  		goto out;
2411  
2412  #ifdef CONFIG_PROC_FS
2413  	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2414  		unix_sysctl_unregister(net);
2415  		goto out;
2416  	}
2417  #endif
2418  	error = 0;
2419  out:
2420  	return error;
2421  }
2422  
2423  static void __net_exit unix_net_exit(struct net *net)
2424  {
2425  	unix_sysctl_unregister(net);
2426  	remove_proc_entry("unix", net->proc_net);
2427  }
2428  
2429  static struct pernet_operations unix_net_ops = {
2430  	.init = unix_net_init,
2431  	.exit = unix_net_exit,
2432  };
2433  
2434  static int __init af_unix_init(void)
2435  {
2436  	int rc = -1;
2437  
2438  	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2439  
2440  	rc = proto_register(&unix_proto, 1);
2441  	if (rc != 0) {
2442  		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2443  		goto out;
2444  	}
2445  
2446  	sock_register(&unix_family_ops);
2447  	register_pernet_subsys(&unix_net_ops);
2448  out:
2449  	return rc;
2450  }
2451  
2452  static void __exit af_unix_exit(void)
2453  {
2454  	sock_unregister(PF_UNIX);
2455  	proto_unregister(&unix_proto);
2456  	unregister_pernet_subsys(&unix_net_ops);
2457  }
2458  
2459  /* Earlier than device_initcall() so that other drivers invoking
2460     request_module() don't end up in a loop when modprobe tries
2461     to use a UNIX socket. But later than subsys_initcall() because
2462     we depend on stuff initialised there */
2463  fs_initcall(af_unix_init);
2464  module_exit(af_unix_exit);
2465  
2466  MODULE_LICENSE("GPL");
2467  MODULE_ALIAS_NETPROTO(PF_UNIX);
2468