xref: /linux/net/unix/af_unix.c (revision 27f62b9f294b7e2019c94c385abda43a0af6bb8b)
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko Eißfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge amount
 *					of socks being hashed (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *	     Alexey Kuznetsov	:	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair.
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 *
 * Known differences from the reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
 *		mark and a fake inode identifier (nor the BSD first-socket-fstat-twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this against the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed the server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
 */
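
/*
 * Illustrative sketch (not part of this file): how userspace would
 * construct an abstract address as described above.  The name
 * "example" and its length are hypothetical.
 *
 *	struct sockaddr_un a;
 *	memset(&a, 0, sizeof(a));
 *	a.sun_family = AF_UNIX;
 *	a.sun_path[0] = '\0';			// leading NUL marks it abstract
 *	memcpy(a.sun_path + 1, "example", 7);	// bytes, not a C string
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */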

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;


static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 *  SMP locking strategy:
 *    hash table is protected with spinlock unix_table_lock
 *    each socket state is protected by separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)n;

	hash ^= hash>>16;
	hash ^= hash>>8;
	return hash&(UNIX_HASH_SIZE-1);
}
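
/*
 * Worked example (illustrative): folding the 32-bit value 0xDEADBEEF,
 * assuming UNIX_HASH_SIZE is 256 here:
 *
 *	hash = 0xDEADBEEF;
 *	hash ^= hash >> 16;	// 0xDEADBEEF ^ 0x0000DEAD = 0xDEAD6042
 *	hash ^= hash >> 8;	// 0xDEAD6042 ^ 0x00DEAD60 = 0xDE73CD22
 *	hash &= 255;		// bucket 0x22
 */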

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it does not start with a zero byte, it should be
 *		  NUL-terminated (an FS object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && dentry->d_inode == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}

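/* A socket is considered writable while no more than a quarter of its
 * send buffer is in flight, i.e. wmem_alloc * 4 <= sk_sndbuf.
 */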
static inline int unix_writable(struct sock *sk)
{
	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows
 * flow control based only on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error. Messages are lost. Do not do this
		 * when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least. */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t, int);

static void unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	mutex_lock(&u->readlock);
	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);
}
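
/*
 * Illustrative userspace sketch (not part of this file): with a peek
 * offset enabled, successive MSG_PEEK reads walk forward through the
 * queued data instead of re-reading it.  Sizes are hypothetical.
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15, offset -> 16
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31
 */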


static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};

/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key:
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;
621  
622  static struct sock *unix_create1(struct net *net, struct socket *sock)
623  {
624  	struct sock *sk = NULL;
625  	struct unix_sock *u;
626  
627  	atomic_long_inc(&unix_nr_socks);
628  	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
629  		goto out;
630  
631  	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
632  	if (!sk)
633  		goto out;
634  
635  	sock_init_data(sock, sk);
636  	lockdep_set_class(&sk->sk_receive_queue.lock,
637  				&af_unix_sk_receive_queue_lock_key);
638  
639  	sk->sk_write_space	= unix_write_space;
640  	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
641  	sk->sk_destruct		= unix_sock_destructor;
642  	u	  = unix_sk(sk);
643  	u->path.dentry = NULL;
644  	u->path.mnt = NULL;
645  	spin_lock_init(&u->lock);
646  	atomic_long_set(&u->inflight, 0);
647  	INIT_LIST_HEAD(&u->link);
648  	mutex_init(&u->readlock); /* single task reading lock */
649  	init_waitqueue_head(&u->peer_wait);
650  	unix_insert_socket(unix_sockets_unbound(sk), sk);
651  out:
652  	if (sk == NULL)
653  		atomic_long_dec(&unix_nr_socks);
654  	else {
655  		local_bh_disable();
656  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
657  		local_bh_enable();
658  	}
659  	return sk;
660  }
661  
662  static int unix_create(struct net *net, struct socket *sock, int protocol,
663  		       int kern)
664  {
665  	if (protocol && protocol != PF_UNIX)
666  		return -EPROTONOSUPPORT;
667  
668  	sock->state = SS_UNCONNECTED;
669  
670  	switch (sock->type) {
671  	case SOCK_STREAM:
672  		sock->ops = &unix_stream_ops;
673  		break;
674  		/*
675  		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
676  		 *	nothing uses it.
677  		 */
678  	case SOCK_RAW:
679  		sock->type = SOCK_DGRAM;
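		/* fall through */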
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock) ? 0 : -ENOMEM;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

705  
706  static int unix_autobind(struct socket *sock)
707  {
708  	struct sock *sk = sock->sk;
709  	struct net *net = sock_net(sk);
710  	struct unix_sock *u = unix_sk(sk);
711  	static u32 ordernum = 1;
712  	struct unix_address *addr;
713  	int err;
714  	unsigned int retries = 0;
715  
716  	mutex_lock(&u->readlock);
717  
718  	err = 0;
719  	if (u->addr)
720  		goto out;
721  
722  	err = -ENOMEM;
723  	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
724  	if (!addr)
725  		goto out;
726  
727  	addr->name->sun_family = AF_UNIX;
728  	atomic_set(&addr->refcnt, 1);
729  
730  retry:
731  	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
732  	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
733  
734  	spin_lock(&unix_table_lock);
735  	ordernum = (ordernum+1)&0xFFFFF;
736  
737  	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
738  				      addr->hash)) {
739  		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
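
/*
 * Illustrative sketch (not part of this file): userspace triggers
 * autobind by binding with only the address family; getsockname()
 * then reports an abstract name of five hex digits:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	socklen_t len = sizeof(sa_family_t);	// == sizeof(short) above
 *	bind(fd, (struct sockaddr *)&a, len);
 *	len = sizeof(a);
 *	getsockname(fd, (struct sockaddr *)&a, &len);
 *	// a.sun_path[0] == '\0', followed by e.g. "00001" (hypothetical)
 */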

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = path.dentry->d_inode;
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}

819  
820  static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
821  {
822  	struct dentry *dentry;
823  	struct path path;
824  	int err = 0;
825  	/*
826  	 * Get the parent directory, calculate the hash for last
827  	 * component.
828  	 */
829  	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
830  	err = PTR_ERR(dentry);
831  	if (IS_ERR(dentry))
832  		return err;
833  
834  	/*
835  	 * All right, let's create it.
836  	 */
837  	err = security_path_mknod(&path, dentry, mode, 0);
838  	if (!err) {
839  		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
840  		if (!err) {
841  			res->mnt = mntget(path.mnt);
842  			res->dentry = dget(dentry);
843  		}
844  	}
845  	done_path_create(&path, dentry);
846  	return err;
847  }

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	mutex_lock(&u->readlock);

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}
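
/*
 * Illustrative sketch (not part of this file): a filesystem bind
 * creates a socket inode via unix_mknod(); binding the same path again
 * fails with EADDRINUSE until the file is unlinked.  The path is
 * hypothetical.
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	strncpy(a.sun_path, "/tmp/demo.sock", sizeof(a.sun_path) - 1);
 *	unlink("/tmp/demo.sock");	// clear any stale inode first
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 */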

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
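
/*
 * Illustrative sketch (not part of this file): per 1003.1g, userspace
 * disconnects a connected datagram socket by "connecting" to an
 * address whose family is AF_UNSPEC:
 *
 *	struct sockaddr sa;
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_family = AF_UNSPEC;
 *	connect(fd, &sa, sizeof(sa));	// drops the current peer
 */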

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we do this after the state is locked,
	   we will have to recheck everything again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. It is dangerous because deadlock is
	   possible. Connect-to-self and simultaneous connect
	   attempts are eliminated by checking the socket state:
	   other is TCP_LISTEN, and if sk is TCP_LISTEN we
	   check this before the attempt to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to the listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
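
/*
 * Illustrative sketch (not part of this file): socketpair() is the
 * usual way to get two already-connected AF_UNIX sockets, e.g. for
 * parent/child IPC:
 *
 *	int sv[2];
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	if (fork() == 0) {
 *		close(sv[0]);
 *		write(sv[1], "hi", 2);	// child talks to parent via sv[1]
 *	}
 */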

static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
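
/*
 * Illustrative sketch (not part of this file): the listen/accept
 * handshake seen from userspace; address setup and backlog are
 * hypothetical.
 *
 *	// server
 *	bind(srv, (struct sockaddr *)&a, sizeof(a));
 *	listen(srv, 5);
 *	int conn = accept(srv, NULL, NULL);
 *
 *	// client (another process)
 *	connect(cli, (struct sockaddr *)&a, sizeof(a));
 */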


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() has been SMP-safe since last summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

#define MAX_RECURSION_LEVEL 4

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection.  Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	if (unix_sock_count) {
		for (i = scm->fp->count - 1; i >= 0; i--)
			unix_inflight(scm->fp->fp[i]);
	}
	return max_level;
}
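
/*
 * Illustrative sketch (not part of this file): the userspace side of
 * the fd passing that unix_attach_fds()/unix_detach_fds() implement,
 * using the standard CMSG_* macros.  fd_to_send is hypothetical.
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = "x", .iov_len = 1 };
 *	struct msghdr mh = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&mh);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SCM_RIGHTS;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_send, sizeof(int));
 *	sendmsg(sock, &mh, 0);
 */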

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
	    !other->sk_socket ||
	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
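
/*
 * Illustrative sketch (not part of this file): a receiver opts in to
 * the credentials attached above with SO_PASSCRED and reads them back
 * as an SCM_CREDENTIALS control message carrying a struct ucred:
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	// after recvmsg(), walk CMSG_FIRSTHDR()/CMSG_NXTHDR() looking
 *	// for cmsg_level == SOL_SOCKET, cmsg_type == SCM_CREDENTIALS;
 *	// CMSG_DATA() then points at struct ucred { pid, uid, gid }.
 */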

/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = msg->msg_name;
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;
	int max_level;
	int data_len = 0;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC)
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(siocb->scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;
	unix_get_secdata(siocb->scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	unix_state_lock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should a
		 *	datagram error do here?
		 */
		unix_state_unlock(other);
		sock_put(other);

		err = 0;
		unix_state_lock(sk);
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	if (unix_peer(other) != sk && unix_recvq_full(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}


static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		/*
		 *	Optimisation for the fact that fewer than 0.01% of X
		 *	messages typically need breaking up.
		 */

		size = len-sent;

		/* Keep two messages in the pipe so it schedules better */
		if (size > ((sk->sk_sndbuf >> 1) - 64))
			size = (sk->sk_sndbuf >> 1) - 64;

		if (size > SKB_MAX_ALLOC)
			size = SKB_MAX_ALLOC;

		/*
		 *	Grab a buffer
		 */

		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
					  &err);

		if (skb == NULL)
			goto out_err;

		/*
		 *	If you pass two values to the sock_alloc_send_skb
		 *	it tries to grab the large buffer with GFP_NOFS
		 *	(which can fail easily), and if it fails grab the
		 *	fallback size buffer which is under a page and will
		 *	succeed. [Alan]
		 */
		size = min_t(int, size, skb_tailroom(skb));


		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other, size);
		sent += size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	return sent ? : err;
}

static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
				  struct msghdr *msg, size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(kiocb, sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	msg->msg_namelen = 0;
	if (u->addr) {
		msg->msg_namelen = u->addr->len;
		memcpy(msg->msg_name, u->addr->name, u->addr->len);
	}
}

static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	msg->msg_namelen = 0;

	err = mutex_lock_interruptible(&u->readlock);
	if (err) {
		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - not return fds - good, but too simple 8)
		   - return fds, and not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose this for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly, however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}

/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last)
{
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);
		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t size,
			       int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = msg->msg_name;
	int copied = 0;
	int check_creds = 0;
	int target;
	int err = 0;
	long timeo;
	int skip;

	err = -EINVAL;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);

	msg->msg_namelen = 0;

	/* Lock the socket to prevent queue disordering
	 * while we sleep in memcpy_toiovec
	 */
1924  
1925  	if (!siocb->scm) {
1926  		siocb->scm = &tmp_scm;
1927  		memset(&tmp_scm, 0, sizeof(tmp_scm));
1928  	}
1929  
1930  	err = mutex_lock_interruptible(&u->readlock);
1931  	if (err) {
1932  		err = sock_intr_errno(timeo);
1933  		goto out;
1934  	}
1935  
1936  	do {
1937  		int chunk;
1938  		struct sk_buff *skb, *last;
1939  
1940  		unix_state_lock(sk);
1941  		last = skb = skb_peek(&sk->sk_receive_queue);
1942  again:
1943  		if (skb == NULL) {
1944  			unix_sk(sk)->recursion_level = 0;
1945  			if (copied >= target)
1946  				goto unlock;
1947  
1948  			/*
1949  			 *	POSIX 1003.1g mandates this order.
1950  			 */
1951  
1952  			err = sock_error(sk);
1953  			if (err)
1954  				goto unlock;
1955  			if (sk->sk_shutdown & RCV_SHUTDOWN)
1956  				goto unlock;
1957  
1958  			unix_state_unlock(sk);
1959  			err = -EAGAIN;
1960  			if (!timeo)
1961  				break;
1962  			mutex_unlock(&u->readlock);
1963  
1964  			timeo = unix_stream_data_wait(sk, timeo, last);
1965  
1966  			if (signal_pending(current) ||
1967  			    mutex_lock_interruptible(&u->readlock)) {
1968  				err = sock_intr_errno(timeo);
1969  				goto out;
1970  			}
1971  
1972  			continue;
1973   unlock:
1974  			unix_state_unlock(sk);
1975  			break;
1976  		}
1977  
1978  		skip = sk_peek_offset(sk, flags);
1979  		while (skip >= skb->len) {
1980  			skip -= skb->len;
1981  			last = skb;
1982  			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1983  			if (!skb)
1984  				goto again;
1985  		}
1986  
1987  		unix_state_unlock(sk);
1988  
1989  		if (check_creds) {
1990  			/* Never glue messages from different writers */
1991  			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1992  			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
1993  			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
1994  				break;
1995  		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1996  			/* Copy credentials */
1997  			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1998  			check_creds = 1;
1999  		}
2000  
2001  		/* Copy address just once */
2002  		if (sunaddr) {
2003  			unix_copy_addr(msg, skb->sk);
2004  			sunaddr = NULL;
2005  		}
2006  
2007  		chunk = min_t(unsigned int, skb->len - skip, size);
2008  		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
2009  			if (copied == 0)
2010  				copied = -EFAULT;
2011  			break;
2012  		}
2013  		copied += chunk;
2014  		size -= chunk;
2015  
2016  		/* Mark read part of skb as used */
2017  		if (!(flags & MSG_PEEK)) {
2018  			skb_pull(skb, chunk);
2019  
2020  			sk_peek_offset_bwd(sk, chunk);
2021  
2022  			if (UNIXCB(skb).fp)
2023  				unix_detach_fds(siocb->scm, skb);
2024  
2025  			if (skb->len)
2026  				break;
2027  
2028  			skb_unlink(skb, &sk->sk_receive_queue);
2029  			consume_skb(skb);
2030  
2031  			if (siocb->scm->fp)
2032  				break;
2033  		} else {
2034  			/* It is questionable; see the note in unix_dgram_recvmsg().
2035  			 */
2036  			if (UNIXCB(skb).fp)
2037  				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2038  
2039  			sk_peek_offset_fwd(sk, chunk);
2040  
2041  			break;
2042  		}
2043  	} while (size);
2044  
2045  	mutex_unlock(&u->readlock);
2046  	scm_recv(sock, msg, siocb->scm, flags);
2047  out:
2048  	return copied ? : err;
2049  }
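
/* A hedged userspace sketch of the check_creds boundary above
 * (illustrative only, not built; read_with_creds() is a hypothetical
 * helper): with SO_PASSCRED set, each recvmsg() carries SCM_CREDENTIALS,
 * and a single read never glues together data from different writers.
 */
#if 0
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static ssize_t read_with_creds(int sock, char *buf, size_t len,
			       struct ucred *uc)
{
	char ctl[CMSG_SPACE(sizeof(struct ucred))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctl, .msg_controllen = sizeof(ctl),
	};
	struct cmsghdr *cmsg;
	int one = 1;
	ssize_t n;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)) < 0)
		return -1;
	n = recvmsg(sock, &msg, 0);
	if (n < 0)
		return -1;
	/* Pick out the sender's pid/uid/gid attached by the kernel. */
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_CREDENTIALS)
			memcpy(uc, CMSG_DATA(cmsg), sizeof(*uc));
	return n;
}
#endif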
2050  
2051  static int unix_shutdown(struct socket *sock, int mode)
2052  {
2053  	struct sock *sk = sock->sk;
2054  	struct sock *other;
2055  
2056  	if (mode < SHUT_RD || mode > SHUT_RDWR)
2057  		return -EINVAL;
2058  	/* This maps:
2059  	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2060  	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2061  	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2062  	 */
2063  	++mode;
2064  
2065  	unix_state_lock(sk);
2066  	sk->sk_shutdown |= mode;
2067  	other = unix_peer(sk);
2068  	if (other)
2069  		sock_hold(other);
2070  	unix_state_unlock(sk);
2071  	sk->sk_state_change(sk);
2072  
2073  	if (other &&
2074  		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2075  
2076  		int peer_mode = 0;
2077  
2078  		if (mode&RCV_SHUTDOWN)
2079  			peer_mode |= SEND_SHUTDOWN;
2080  		if (mode&SEND_SHUTDOWN)
2081  			peer_mode |= RCV_SHUTDOWN;
2082  		unix_state_lock(other);
2083  		other->sk_shutdown |= peer_mode;
2084  		unix_state_unlock(other);
2085  		other->sk_state_change(other);
2086  		if (peer_mode == SHUTDOWN_MASK)
2087  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2088  		else if (peer_mode & RCV_SHUTDOWN)
2089  			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2090  	}
2091  	if (other)
2092  		sock_put(other);
2093  
2094  	return 0;
2095  }
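
/* The ++mode mapping above works because the SHUT_* constants are 0, 1
 * and 2 while the shutdown bits are 1, 2 and 3. A hedged standalone
 * check (not built; the kernel-internal *_SHUTDOWN values are restated
 * locally with an EX_ prefix):
 */
#if 0
#include <assert.h>
#include <sys/socket.h>

#define EX_RCV_SHUTDOWN		1
#define EX_SEND_SHUTDOWN	2
#define EX_SHUTDOWN_MASK	3

int main(void)
{
	/* Incrementing the SHUT_* constant yields the bitmask directly. */
	assert(SHUT_RD   + 1 == EX_RCV_SHUTDOWN);
	assert(SHUT_WR   + 1 == EX_SEND_SHUTDOWN);
	assert(SHUT_RDWR + 1 == EX_SHUTDOWN_MASK);
	return 0;
}
#endif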
2096  
2097  long unix_inq_len(struct sock *sk)
2098  {
2099  	struct sk_buff *skb;
2100  	long amount = 0;
2101  
2102  	if (sk->sk_state == TCP_LISTEN)
2103  		return -EINVAL;
2104  
2105  	spin_lock(&sk->sk_receive_queue.lock);
2106  	if (sk->sk_type == SOCK_STREAM ||
2107  	    sk->sk_type == SOCK_SEQPACKET) {
2108  		skb_queue_walk(&sk->sk_receive_queue, skb)
2109  			amount += skb->len;
2110  	} else {
2111  		skb = skb_peek(&sk->sk_receive_queue);
2112  		if (skb)
2113  			amount = skb->len;
2114  	}
2115  	spin_unlock(&sk->sk_receive_queue.lock);
2116  
2117  	return amount;
2118  }
2119  EXPORT_SYMBOL_GPL(unix_inq_len);
2120  
2121  long unix_outq_len(struct sock *sk)
2122  {
2123  	return sk_wmem_alloc_get(sk);
2124  }
2125  EXPORT_SYMBOL_GPL(unix_outq_len);
2126  
2127  static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2128  {
2129  	struct sock *sk = sock->sk;
2130  	long amount = 0;
2131  	int err;
2132  
2133  	switch (cmd) {
2134  	case SIOCOUTQ:
2135  		amount = unix_outq_len(sk);
2136  		err = put_user(amount, (int __user *)arg);
2137  		break;
2138  	case SIOCINQ:
2139  		amount = unix_inq_len(sk);
2140  		if (amount < 0)
2141  			err = amount;
2142  		else
2143  			err = put_user(amount, (int __user *)arg);
2144  		break;
2145  	default:
2146  		err = -ENOIOCTLCMD;
2147  		break;
2148  	}
2149  	return err;
2150  }
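
/* A hedged userspace sketch of the ioctl interface above (illustrative
 * only, not built; print_queue_depths() is a hypothetical helper):
 * SIOCOUTQ reports unix_outq_len(), i.e. bytes not yet consumed by the
 * peer, while SIOCINQ reports unix_inq_len() - all queued bytes for
 * stream/seqpacket sockets, or the first datagram's length otherwise.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/sockios.h>
#include <stdio.h>

static void print_queue_depths(int sock)
{
	int inq = 0, outq = 0;

	if (ioctl(sock, SIOCINQ, &inq) == 0 &&
	    ioctl(sock, SIOCOUTQ, &outq) == 0)
		printf("inq=%d outq=%d\n", inq, outq);
}
#endif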
2151  
2152  static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2153  {
2154  	struct sock *sk = sock->sk;
2155  	unsigned int mask;
2156  
2157  	sock_poll_wait(file, sk_sleep(sk), wait);
2158  	mask = 0;
2159  
2160  	/* exceptional events? */
2161  	if (sk->sk_err)
2162  		mask |= POLLERR;
2163  	if (sk->sk_shutdown == SHUTDOWN_MASK)
2164  		mask |= POLLHUP;
2165  	if (sk->sk_shutdown & RCV_SHUTDOWN)
2166  		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2167  
2168  	/* readable? */
2169  	if (!skb_queue_empty(&sk->sk_receive_queue))
2170  		mask |= POLLIN | POLLRDNORM;
2171  
2172  	/* Connection-based sockets need to check for termination and startup */
2173  	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2174  	    sk->sk_state == TCP_CLOSE)
2175  		mask |= POLLHUP;
2176  
2177  	/*
2178  	 * We also report the socket as writable when the other side has
2179  	 * shut down the connection. This prevents stuck sockets.
2180  	 */
2181  	if (unix_writable(sk))
2182  		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2183  
2184  	return mask;
2185  }
2186  
2187  static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2188  				    poll_table *wait)
2189  {
2190  	struct sock *sk = sock->sk, *other;
2191  	unsigned int mask, writable;
2192  
2193  	sock_poll_wait(file, sk_sleep(sk), wait);
2194  	mask = 0;
2195  
2196  	/* exceptional events? */
2197  	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2198  		mask |= POLLERR |
2199  			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2200  
2201  	if (sk->sk_shutdown & RCV_SHUTDOWN)
2202  		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2203  	if (sk->sk_shutdown == SHUTDOWN_MASK)
2204  		mask |= POLLHUP;
2205  
2206  	/* readable? */
2207  	if (!skb_queue_empty(&sk->sk_receive_queue))
2208  		mask |= POLLIN | POLLRDNORM;
2209  
2210  	/* Connection-based sockets need to check for termination and startup */
2211  	if (sk->sk_type == SOCK_SEQPACKET) {
2212  		if (sk->sk_state == TCP_CLOSE)
2213  			mask |= POLLHUP;
2214  		/* connection hasn't started yet? */
2215  		if (sk->sk_state == TCP_SYN_SENT)
2216  			return mask;
2217  	}
2218  
2219  	/* No write status requested, avoid expensive OUT tests. */
2220  	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2221  		return mask;
2222  
2223  	writable = unix_writable(sk);
2224  	other = unix_peer_get(sk);
2225  	if (other) {
2226  		if (unix_peer(other) != sk) {
2227  			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2228  			if (unix_recvq_full(other))
2229  				writable = 0;
2230  		}
2231  		sock_put(other);
2232  	}
2233  
2234  	if (writable)
2235  		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2236  	else
2237  		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2238  
2239  	return mask;
2240  }
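
/* A hedged userspace sketch of the writability rule above (illustrative
 * only, not built; wait_writable() is a hypothetical helper): because
 * unix_dgram_poll() withholds POLLOUT while the peer's receive queue is
 * full, a sender can block in poll() instead of spinning on -EAGAIN.
 */
#if 0
#include <poll.h>

static int wait_writable(int sock, int timeout_ms)
{
	struct pollfd pfd = { .fd = sock, .events = POLLOUT };

	return poll(&pfd, 1, timeout_ms);	/* > 0 once sendable again */
}
#endif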
2241  
2242  #ifdef CONFIG_PROC_FS
2243  
2244  #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2245  
2246  #define get_bucket(x) ((x) >> BUCKET_SPACE)
2247  #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2248  #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
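
/* A hedged worked example of the encoding above (not built): assuming
 * BITS_PER_LONG == 64 and UNIX_HASH_BITS == 8, BUCKET_SPACE is
 * 64 - 9 - 1 = 54, so the bucket index occupies the high bits and the
 * 1-based in-bucket offset the low 54 bits of the seq_file position.
 */
#if 0
static void pos_encoding_example(void)
{
	loff_t pos = set_bucket_offset(3, 7);	/* bucket 3, 7th socket */

	BUG_ON(get_bucket(pos) != 3);
	BUG_ON(get_offset(pos) != 7);
}
#endif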
2249  
2250  static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2251  {
2252  	unsigned long offset = get_offset(*pos);
2253  	unsigned long bucket = get_bucket(*pos);
2254  	struct sock *sk;
2255  	unsigned long count = 0;
2256  
2257  	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2258  		if (sock_net(sk) != seq_file_net(seq))
2259  			continue;
2260  		if (++count == offset)
2261  			break;
2262  	}
2263  
2264  	return sk;
2265  }
2266  
2267  static struct sock *unix_next_socket(struct seq_file *seq,
2268  				     struct sock *sk,
2269  				     loff_t *pos)
2270  {
2271  	unsigned long bucket;
2272  
2273  	while (sk > (struct sock *)SEQ_START_TOKEN) {
2274  		sk = sk_next(sk);
2275  		if (!sk)
2276  			goto next_bucket;
2277  		if (sock_net(sk) == seq_file_net(seq))
2278  			return sk;
2279  	}
2280  
2281  	do {
2282  		sk = unix_from_bucket(seq, pos);
2283  		if (sk)
2284  			return sk;
2285  
2286  next_bucket:
2287  		bucket = get_bucket(*pos) + 1;
2288  		*pos = set_bucket_offset(bucket, 1);
2289  	} while (bucket < ARRAY_SIZE(unix_socket_table));
2290  
2291  	return NULL;
2292  }
2293  
2294  static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2295  	__acquires(unix_table_lock)
2296  {
2297  	spin_lock(&unix_table_lock);
2298  
2299  	if (!*pos)
2300  		return SEQ_START_TOKEN;
2301  
2302  	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2303  		return NULL;
2304  
2305  	return unix_next_socket(seq, NULL, pos);
2306  }
2307  
2308  static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2309  {
2310  	++*pos;
2311  	return unix_next_socket(seq, v, pos);
2312  }
2313  
2314  static void unix_seq_stop(struct seq_file *seq, void *v)
2315  	__releases(unix_table_lock)
2316  {
2317  	spin_unlock(&unix_table_lock);
2318  }
2319  
2320  static int unix_seq_show(struct seq_file *seq, void *v)
2321  {
2322  
2323  	if (v == SEQ_START_TOKEN)
2324  		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2325  			 "Inode Path\n");
2326  	else {
2327  		struct sock *s = v;
2328  		struct unix_sock *u = unix_sk(s);
2329  		unix_state_lock(s);
2330  
2331  		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2332  			s,
2333  			atomic_read(&s->sk_refcnt),
2334  			0,
2335  			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2336  			s->sk_type,
2337  			s->sk_socket ?
2338  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2339  			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2340  			sock_i_ino(s));
2341  
2342  		if (u->addr) {
2343  			int i, len;
2344  			seq_putc(seq, ' ');
2345  
2346  			i = 0;
2347  			len = u->addr->len - sizeof(short);
2348  			if (!UNIX_ABSTRACT(s))
2349  				len--;
2350  			else {
2351  				seq_putc(seq, '@');
2352  				i++;
2353  			}
2354  			for ( ; i < len; i++)
2355  				seq_putc(seq, u->addr->name->sun_path[i]);
2356  		}
2357  		unix_state_unlock(s);
2358  		seq_putc(seq, '\n');
2359  	}
2360  
2361  	return 0;
2362  }
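
/* A hedged sample of the /proc/net/unix format produced above (values
 * invented for illustration): pointer, refcount, protocol (always 0),
 * flags (__SO_ACCEPTCON for listeners), type, socket state, inode and,
 * if bound, the path - prefixed with '@' for the abstract namespace.
 *
 * Num       RefCount Protocol Flags    Type St Inode Path
 * ffff880036a73c00: 00000002 00000000 00010000 0001 01 17316 /run/example.sock
 */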
2363  
2364  static const struct seq_operations unix_seq_ops = {
2365  	.start  = unix_seq_start,
2366  	.next   = unix_seq_next,
2367  	.stop   = unix_seq_stop,
2368  	.show   = unix_seq_show,
2369  };
2370  
2371  static int unix_seq_open(struct inode *inode, struct file *file)
2372  {
2373  	return seq_open_net(inode, file, &unix_seq_ops,
2374  			    sizeof(struct seq_net_private));
2375  }
2376  
2377  static const struct file_operations unix_seq_fops = {
2378  	.owner		= THIS_MODULE,
2379  	.open		= unix_seq_open,
2380  	.read		= seq_read,
2381  	.llseek		= seq_lseek,
2382  	.release	= seq_release_net,
2383  };
2384  
2385  #endif
2386  
2387  static const struct net_proto_family unix_family_ops = {
2388  	.family = PF_UNIX,
2389  	.create = unix_create,
2390  	.owner	= THIS_MODULE,
2391  };
2392  
2393  
2394  static int __net_init unix_net_init(struct net *net)
2395  {
2396  	int error = -ENOMEM;
2397  
2398  	net->unx.sysctl_max_dgram_qlen = 10;
2399  	if (unix_sysctl_register(net))
2400  		goto out;
2401  
2402  #ifdef CONFIG_PROC_FS
2403  	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2404  		unix_sysctl_unregister(net);
2405  		goto out;
2406  	}
2407  #endif
2408  	error = 0;
2409  out:
2410  	return error;
2411  }
2412  
2413  static void __net_exit unix_net_exit(struct net *net)
2414  {
2415  	unix_sysctl_unregister(net);
2416  	remove_proc_entry("unix", net->proc_net);
2417  }
2418  
2419  static struct pernet_operations unix_net_ops = {
2420  	.init = unix_net_init,
2421  	.exit = unix_net_exit,
2422  };
2423  
2424  static int __init af_unix_init(void)
2425  {
2426  	int rc = -1;
2427  
2428  	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2429  
2430  	rc = proto_register(&unix_proto, 1);
2431  	if (rc != 0) {
2432  		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2433  		       __func__);
2434  		goto out;
2435  	}
2436  
2437  	sock_register(&unix_family_ops);
2438  	register_pernet_subsys(&unix_net_ops);
2439  out:
2440  	return rc;
2441  }
2442  
2443  static void __exit af_unix_exit(void)
2444  {
2445  	sock_unregister(PF_UNIX);
2446  	proto_unregister(&unix_proto);
2447  	unregister_pernet_subsys(&unix_net_ops);
2448  }
2449  
2450  /* Earlier than device_initcall() so that other drivers invoking
2451     request_module() don't end up in a loop when modprobe tries
2452     to use a UNIX socket. But later than subsys_initcall() because
2453     we depend on stuff initialised there. */
2454  fs_initcall(af_unix_init);
2455  module_exit(af_unix_exit);
2456  
2457  MODULE_LICENSE("GPL");
2458  MODULE_ALIAS_NETPROTO(PF_UNIX);
2459