1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * NET4: Implementation of BSD Unix domain sockets.
4 *
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 *
7 * Fixes:
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
17 * Mike Shaver's work.
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko Eißfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
24 * reference counting
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
27 * Lots of bug fixes.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * has been reached. This won't break
33 * old apps and it will avoid a huge amount
34 * of socks hashed (this is for unix_gc()
35 * performance reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
39 * dgram receiver.
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
47 *
48 * Known differences from reference BSD that was tested:
49 *
50 * [TO FIX]
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
55 * [NOT TO FIX]
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
63 *
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
68 *
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * started by 0, so that this name space does not intersect
75 * with BSD names.
76 */
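/* A minimal userspace sketch (illustration only, not part of this file) of
 * the two address flavours described above; the descriptor fd and the names
 * "/tmp/example.sock" and "example" are made up for the example, and a
 * socket can of course only be bound to one of them:
 *
 *	struct sockaddr_un a;
 *
 *	memset(&a, 0, sizeof(a));
 *	a.sun_family = AF_UNIX;
 *
 *	// Filesystem (BSD) name: NUL-terminated path, visible in the VFS.
 *	strcpy(a.sun_path, "/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	// Abstract name: sun_path[0] == '\0'; the following bytes are the
 *	// name, and the passed address length decides where it ends.
 *	a.sun_path[0] = '\0';
 *	memcpy(a.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */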
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/bpf-cgroup.h>
81 #include <linux/btf_ids.h>
82 #include <linux/dcache.h>
83 #include <linux/errno.h>
84 #include <linux/fcntl.h>
85 #include <linux/file.h>
86 #include <linux/filter.h>
87 #include <linux/fs.h>
88 #include <linux/init.h>
89 #include <linux/kernel.h>
90 #include <linux/mount.h>
91 #include <linux/namei.h>
92 #include <linux/poll.h>
93 #include <linux/proc_fs.h>
94 #include <linux/sched/signal.h>
95 #include <linux/security.h>
96 #include <linux/seq_file.h>
97 #include <linux/skbuff.h>
98 #include <linux/slab.h>
99 #include <linux/socket.h>
100 #include <linux/splice.h>
101 #include <linux/string.h>
102 #include <linux/uaccess.h>
103 #include <net/af_unix.h>
104 #include <net/net_namespace.h>
105 #include <net/scm.h>
106 #include <net/tcp_states.h>
107 #include <uapi/linux/sockios.h>
108 #include <uapi/linux/termios.h>
109
110 #include "af_unix.h"
111
112 static atomic_long_t unix_nr_socks;
113 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
114 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
115
116 /* SMP locking strategy:
117 * The hash table is protected with a spinlock.
118 * Each socket state is protected by a separate spinlock.
119 */
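/* Illustration only: a sketch of how the two levels of locking above are
 * used by the code below.
 *
 *	spin_lock(&net->unx.table.locks[sk->sk_hash]);	// hash bucket
 *	...insert/remove/lookup sk in that bucket...
 *	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
 *
 *	unix_state_lock(sk);				// per-socket state
 *	...read/update sk->sk_state, unix_peer(sk), ...
 *	unix_state_unlock(sk);
 */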
120 #ifdef CONFIG_PROVE_LOCKING
121 #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r)))
122
123 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
124 const struct lockdep_map *b)
125 {
126 return cmp_ptr(a, b);
127 }
128
129 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
130 const struct lockdep_map *_b)
131 {
132 const struct unix_sock *a, *b;
133
134 a = container_of(_a, struct unix_sock, lock.dep_map);
135 b = container_of(_b, struct unix_sock, lock.dep_map);
136
137 if (a->sk.sk_state == TCP_LISTEN) {
138 /* unix_stream_connect(): Before the 2nd unix_state_lock(),
139 *
140 * 1. a is TCP_LISTEN.
141 * 2. b is not a.
142 * 3. concurrent connect(b -> a) must fail.
143 *
144 * Except for 2. & 3., b's state can be any possible
145 * value due to concurrent connect() or listen().
146 *
147 * 2. is detected in debug_spin_lock_before(), and 3. cannot
148 * be expressed as lock_cmp_fn.
149 */
150 switch (b->sk.sk_state) {
151 case TCP_CLOSE:
152 case TCP_ESTABLISHED:
153 case TCP_LISTEN:
154 return -1;
155 default:
156 /* Invalid case. */
157 return 0;
158 }
159 }
160
161 /* Should never happen. Just to be symmetric. */
162 if (b->sk.sk_state == TCP_LISTEN) {
163 switch (a->sk.sk_state) {
164 case TCP_CLOSE:
165 case TCP_ESTABLISHED:
166 return 1;
167 default:
168 return 0;
169 }
170 }
171
172 /* unix_state_double_lock(): ascending address order. */
173 return cmp_ptr(a, b);
174 }
175
176 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
177 const struct lockdep_map *_b)
178 {
179 const struct sock *a, *b;
180
181 a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
182 b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
183
184 /* unix_collect_skb(): listener -> embryo order. */
185 if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
186 return -1;
187
188 /* Should never happen. Just to be symmetric. */
189 if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
190 return 1;
191
192 return 0;
193 }
194 #endif
195
196 static unsigned int unix_unbound_hash(struct sock *sk)
197 {
198 unsigned long hash = (unsigned long)sk;
199
200 hash ^= hash >> 16;
201 hash ^= hash >> 8;
202 hash ^= sk->sk_type;
203
204 return hash & UNIX_HASH_MOD;
205 }
206
207 static unsigned int unix_bsd_hash(struct inode *i)
208 {
209 return i->i_ino & UNIX_HASH_MOD;
210 }
211
212 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
213 int addr_len, int type)
214 {
215 __wsum csum = csum_partial(sunaddr, addr_len, 0);
216 unsigned int hash;
217
218 hash = (__force unsigned int)csum_fold(csum);
219 hash ^= hash >> 8;
220 hash ^= type;
221
222 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
223 }
224
225 static void unix_table_double_lock(struct net *net,
226 unsigned int hash1, unsigned int hash2)
227 {
228 if (hash1 == hash2) {
229 spin_lock(&net->unx.table.locks[hash1]);
230 return;
231 }
232
233 if (hash1 > hash2)
234 swap(hash1, hash2);
235
236 spin_lock(&net->unx.table.locks[hash1]);
237 spin_lock(&net->unx.table.locks[hash2]);
238 }
239
240 static void unix_table_double_unlock(struct net *net,
241 unsigned int hash1, unsigned int hash2)
242 {
243 if (hash1 == hash2) {
244 spin_unlock(&net->unx.table.locks[hash1]);
245 return;
246 }
247
248 spin_unlock(&net->unx.table.locks[hash1]);
249 spin_unlock(&net->unx.table.locks[hash2]);
250 }
251
252 #ifdef CONFIG_SECURITY_NETWORK
253 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
254 {
255 UNIXCB(skb).secid = scm->secid;
256 }
257
258 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
259 {
260 scm->secid = UNIXCB(skb).secid;
261 }
262
263 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
264 {
265 return (scm->secid == UNIXCB(skb).secid);
266 }
267 #else
268 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
269 { }
270
271 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
272 { }
273
274 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
275 {
276 return true;
277 }
278 #endif /* CONFIG_SECURITY_NETWORK */
279
280 static inline int unix_may_send(struct sock *sk, struct sock *osk)
281 {
282 return !unix_peer(osk) || unix_peer(osk) == sk;
283 }
284
285 static inline int unix_recvq_full_lockless(const struct sock *sk)
286 {
287 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
288 }
289
290 struct sock *unix_peer_get(struct sock *s)
291 {
292 struct sock *peer;
293
294 unix_state_lock(s);
295 peer = unix_peer(s);
296 if (peer)
297 sock_hold(peer);
298 unix_state_unlock(s);
299 return peer;
300 }
301 EXPORT_SYMBOL_GPL(unix_peer_get);
302
303 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
304 int addr_len)
305 {
306 struct unix_address *addr;
307
308 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
309 if (!addr)
310 return NULL;
311
312 refcount_set(&addr->refcnt, 1);
313 addr->len = addr_len;
314 memcpy(addr->name, sunaddr, addr_len);
315
316 return addr;
317 }
318
319 static inline void unix_release_addr(struct unix_address *addr)
320 {
321 if (refcount_dec_and_test(&addr->refcnt))
322 kfree(addr);
323 }
324
325 /*
326 * Check unix socket name:
327 * - should not be zero length.
328 * - if it does not start with a zero byte, it should be NUL-terminated (FS object)
329 * - if it starts with a zero byte, it is an abstract name.
330 */
331
332 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
333 {
334 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
335 addr_len > sizeof(*sunaddr))
336 return -EINVAL;
337
338 if (sunaddr->sun_family != AF_UNIX)
339 return -EINVAL;
340
341 return 0;
342 }
343
344 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
345 {
346 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
347 short offset = offsetof(struct sockaddr_storage, __data);
348
349 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
350
351 /* This may look like an off by one error but it is a bit more
352 * subtle. 108 is the longest valid AF_UNIX path for a binding.
353 * sun_path[108] doesn't as such exist. However in kernel space
354 * we are guaranteed that it is a valid memory location in our
355 * kernel address buffer because syscall functions always pass
356 * a pointer of struct sockaddr_storage which has a bigger buffer
357 * than 108. Also, we must terminate sun_path for strlen() in
358 * getname_kernel().
359 */
360 addr->__data[addr_len - offset] = 0;
361
362 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
363 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
364 * know the actual buffer.
365 */
366 return strlen(addr->__data) + offset + 1;
367 }
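/* Worked example of the comment above (illustration only): for a bind()
 * whose sun_path uses all 108 bytes with no terminating NUL, addr_len is
 * offset + 108, so the NUL is written to __data[108]. That byte lies past
 * sun_path[] but still inside the sockaddr_storage the syscall layer copied
 * the address into, and the value returned here is strlen() + offset + 1,
 * i.e. addr_len + 1 in that case.
 */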
368
369 static void __unix_remove_socket(struct sock *sk)
370 {
371 sk_del_node_init(sk);
372 }
373
374 static void __unix_insert_socket(struct net *net, struct sock *sk)
375 {
376 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
377 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
378 }
379
380 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
381 struct unix_address *addr, unsigned int hash)
382 {
383 __unix_remove_socket(sk);
384 smp_store_release(&unix_sk(sk)->addr, addr);
385
386 sk->sk_hash = hash;
387 __unix_insert_socket(net, sk);
388 }
389
390 static void unix_remove_socket(struct net *net, struct sock *sk)
391 {
392 spin_lock(&net->unx.table.locks[sk->sk_hash]);
393 __unix_remove_socket(sk);
394 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
395 }
396
397 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
398 {
399 spin_lock(&net->unx.table.locks[sk->sk_hash]);
400 __unix_insert_socket(net, sk);
401 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
402 }
403
404 static void unix_insert_bsd_socket(struct sock *sk)
405 {
406 spin_lock(&bsd_socket_locks[sk->sk_hash]);
407 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
408 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
409 }
410
411 static void unix_remove_bsd_socket(struct sock *sk)
412 {
413 if (!hlist_unhashed(&sk->sk_bind_node)) {
414 spin_lock(&bsd_socket_locks[sk->sk_hash]);
415 __sk_del_bind_node(sk);
416 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
417
418 sk_node_init(&sk->sk_bind_node);
419 }
420 }
421
422 static struct sock *__unix_find_socket_byname(struct net *net,
423 struct sockaddr_un *sunname,
424 int len, unsigned int hash)
425 {
426 struct sock *s;
427
428 sk_for_each(s, &net->unx.table.buckets[hash]) {
429 struct unix_sock *u = unix_sk(s);
430
431 if (u->addr->len == len &&
432 !memcmp(u->addr->name, sunname, len))
433 return s;
434 }
435 return NULL;
436 }
437
438 static inline struct sock *unix_find_socket_byname(struct net *net,
439 struct sockaddr_un *sunname,
440 int len, unsigned int hash)
441 {
442 struct sock *s;
443
444 spin_lock(&net->unx.table.locks[hash]);
445 s = __unix_find_socket_byname(net, sunname, len, hash);
446 if (s)
447 sock_hold(s);
448 spin_unlock(&net->unx.table.locks[hash]);
449 return s;
450 }
451
452 static struct sock *unix_find_socket_byinode(struct inode *i)
453 {
454 unsigned int hash = unix_bsd_hash(i);
455 struct sock *s;
456
457 spin_lock(&bsd_socket_locks[hash]);
458 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
459 struct dentry *dentry = unix_sk(s)->path.dentry;
460
461 if (dentry && d_backing_inode(dentry) == i) {
462 sock_hold(s);
463 spin_unlock(&bsd_socket_locks[hash]);
464 return s;
465 }
466 }
467 spin_unlock(&bsd_socket_locks[hash]);
468 return NULL;
469 }
470
471 /* Support code for asymmetrically connected dgram sockets
472 *
473 * If a datagram socket is connected to a socket not itself connected
474 * to the first socket (eg, /dev/log), clients may only enqueue more
475 * messages if the present receive queue of the server socket is not
476 * "too large". This means there's a second writeability condition
477 * poll and sendmsg need to test. The dgram recv code will do a wake
478 * up on the peer_wait wait queue of a socket upon reception of a
479 * datagram which needs to be propagated to sleeping would-be writers
480 * since these might not have sent anything so far. This can't be
481 * accomplished via poll_wait because the lifetime of the server
482 * socket might be less than that of its clients if these break their
483 * association with it or if the server socket is closed while clients
484 * are still connected to it and there's no way to inform "a polling
485 * implementation" that it should let go of a certain wait queue
486 *
487 * In order to propagate a wake up, a wait_queue_entry_t of the client
488 * socket is enqueued on the peer_wait queue of the server socket
489 * whose wake function does a wake_up on the ordinary client socket
490 * wait queue. This connection is established whenever a write (or
491 * poll for write) hits the flow control condition and is broken when the
492 * association to the server socket is dissolved or after a wake up
493 * was relayed.
494 */
495
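/* Userspace view of the above (a hedged sketch; server_addr/server_len and
 * buf/len are assumed, and /dev/log is just the canonical example of such a
 * server):
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	connect(fd, (struct sockaddr *)&server_addr, server_len);
 *
 *	// Blocks while the *server's* receive queue is too long; the
 *	// peer_wait relay below is what wakes this poll() once the server
 *	// reads a datagram.
 *	poll(&pfd, 1, -1);
 *	send(fd, buf, len, 0);
 */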
496 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
497 void *key)
498 {
499 struct unix_sock *u;
500 wait_queue_head_t *u_sleep;
501
502 u = container_of(q, struct unix_sock, peer_wake);
503
504 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
505 q);
506 u->peer_wake.private = NULL;
507
508 /* relaying can only happen while the wq still exists */
509 u_sleep = sk_sleep(&u->sk);
510 if (u_sleep)
511 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
512
513 return 0;
514 }
515
516 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
517 {
518 struct unix_sock *u, *u_other;
519 int rc;
520
521 u = unix_sk(sk);
522 u_other = unix_sk(other);
523 rc = 0;
524 spin_lock(&u_other->peer_wait.lock);
525
526 if (!u->peer_wake.private) {
527 u->peer_wake.private = other;
528 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
529
530 rc = 1;
531 }
532
533 spin_unlock(&u_other->peer_wait.lock);
534 return rc;
535 }
536
537 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
538 struct sock *other)
539 {
540 struct unix_sock *u, *u_other;
541
542 u = unix_sk(sk);
543 u_other = unix_sk(other);
544 spin_lock(&u_other->peer_wait.lock);
545
546 if (u->peer_wake.private == other) {
547 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
548 u->peer_wake.private = NULL;
549 }
550
551 spin_unlock(&u_other->peer_wait.lock);
552 }
553
554 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
555 struct sock *other)
556 {
557 unix_dgram_peer_wake_disconnect(sk, other);
558 wake_up_interruptible_poll(sk_sleep(sk),
559 EPOLLOUT |
560 EPOLLWRNORM |
561 EPOLLWRBAND);
562 }
563
564 /* preconditions:
565 * - unix_peer(sk) == other
566 * - association is stable
567 */
568 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
569 {
570 int connected;
571
572 connected = unix_dgram_peer_wake_connect(sk, other);
573
574 /* If other is SOCK_DEAD, we want to make sure we signal
575 * POLLOUT, such that a subsequent write() can get a
576 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
577 * to other and it's full, we will hang waiting for POLLOUT.
578 */
579 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
580 return 1;
581
582 if (connected)
583 unix_dgram_peer_wake_disconnect(sk, other);
584
585 return 0;
586 }
587
588 static int unix_writable(const struct sock *sk, unsigned char state)
589 {
590 return state != TCP_LISTEN &&
591 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
592 }
593
594 static void unix_write_space(struct sock *sk)
595 {
596 struct socket_wq *wq;
597
598 rcu_read_lock();
599 if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
600 wq = rcu_dereference(sk->sk_wq);
601 if (skwq_has_sleeper(wq))
602 wake_up_interruptible_sync_poll(&wq->wait,
603 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
604 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
605 }
606 rcu_read_unlock();
607 }
608
609 /* When dgram socket disconnects (or changes its peer), we clear its receive
610 * queue of packets that arrived from the previous peer. First, it allows us to do
611 * flow control based only on wmem_alloc; second, sk connected to peer
612 * may receive messages only from that peer. */
613 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
614 {
615 if (!skb_queue_empty(&sk->sk_receive_queue)) {
616 skb_queue_purge_reason(&sk->sk_receive_queue,
617 SKB_DROP_REASON_UNIX_DISCONNECT);
618
619 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
620
621 /* If one link of bidirectional dgram pipe is disconnected,
622 * we signal an error. Messages are lost. Do not do this
623 * when the peer was not connected to us.
624 */
625 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
626 WRITE_ONCE(other->sk_err, ECONNRESET);
627 sk_error_report(other);
628 }
629 }
630 }
631
632 static void unix_sock_destructor(struct sock *sk)
633 {
634 struct unix_sock *u = unix_sk(sk);
635
636 skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
637
638 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
639 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
640 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
641 if (!sock_flag(sk, SOCK_DEAD)) {
642 pr_info("Attempt to release alive unix socket: %p\n", sk);
643 return;
644 }
645
646 if (u->addr)
647 unix_release_addr(u->addr);
648
649 atomic_long_dec(&unix_nr_socks);
650 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
651 #ifdef UNIX_REFCNT_DEBUG
652 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
653 atomic_long_read(&unix_nr_socks));
654 #endif
655 }
656
657 static void unix_release_sock(struct sock *sk, int embrion)
658 {
659 struct unix_sock *u = unix_sk(sk);
660 struct sock *skpair;
661 struct sk_buff *skb;
662 struct path path;
663 int state;
664
665 unix_remove_socket(sock_net(sk), sk);
666 unix_remove_bsd_socket(sk);
667
668 /* Clear state */
669 unix_state_lock(sk);
670 sock_orphan(sk);
671 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
672 path = u->path;
673 u->path.dentry = NULL;
674 u->path.mnt = NULL;
675 state = sk->sk_state;
676 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
677
678 skpair = unix_peer(sk);
679 unix_peer(sk) = NULL;
680
681 unix_state_unlock(sk);
682
683 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
684 u->oob_skb = NULL;
685 #endif
686
687 wake_up_interruptible_all(&u->peer_wait);
688
689 if (skpair != NULL) {
690 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
691 unix_state_lock(skpair);
692 /* No more writes */
693 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
694 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
695 WRITE_ONCE(skpair->sk_err, ECONNRESET);
696 unix_state_unlock(skpair);
697 skpair->sk_state_change(skpair);
698 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
699 }
700
701 unix_dgram_peer_wake_disconnect(sk, skpair);
702 sock_put(skpair); /* It may now die */
703 }
704
705 /* Try to flush out this socket. Throw out buffers at least */
706
707 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
708 if (state == TCP_LISTEN)
709 unix_release_sock(skb->sk, 1);
710
711 /* passed fds are erased in the kfree_skb hook */
712 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
713 }
714
715 if (path.dentry)
716 path_put(&path);
717
718 sock_put(sk);
719
720 /* ---- Socket is dead now and most probably destroyed ---- */
721
722 /*
723 * Fixme: BSD difference: In BSD all sockets connected to us get
724 * ECONNRESET and we die on the spot. In Linux we behave
725 * like files and pipes do and wait for the last
726 * dereference.
727 *
728 * Can't we simply set sock->err?
729 *
730 * What does the above comment talk about? --ANK(980817)
731 */
732
733 if (READ_ONCE(unix_tot_inflight))
734 unix_gc(); /* Garbage collect fds */
735 }
736
737 static void init_peercred(struct sock *sk)
738 {
739 sk->sk_peer_pid = get_pid(task_tgid(current));
740 sk->sk_peer_cred = get_current_cred();
741 }
742
743 static void update_peercred(struct sock *sk)
744 {
745 const struct cred *old_cred;
746 struct pid *old_pid;
747
748 spin_lock(&sk->sk_peer_lock);
749 old_pid = sk->sk_peer_pid;
750 old_cred = sk->sk_peer_cred;
751 init_peercred(sk);
752 spin_unlock(&sk->sk_peer_lock);
753
754 put_pid(old_pid);
755 put_cred(old_cred);
756 }
757
758 static void copy_peercred(struct sock *sk, struct sock *peersk)
759 {
760 lockdep_assert_held(&unix_sk(peersk)->lock);
761
762 spin_lock(&sk->sk_peer_lock);
763 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
764 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
765 spin_unlock(&sk->sk_peer_lock);
766 }
767
768 static int unix_listen(struct socket *sock, int backlog)
769 {
770 int err;
771 struct sock *sk = sock->sk;
772 struct unix_sock *u = unix_sk(sk);
773
774 err = -EOPNOTSUPP;
775 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
776 goto out; /* Only stream/seqpacket sockets accept */
777 err = -EINVAL;
778 if (!READ_ONCE(u->addr))
779 goto out; /* No listens on an unbound socket */
780 unix_state_lock(sk);
781 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
782 goto out_unlock;
783 if (backlog > sk->sk_max_ack_backlog)
784 wake_up_interruptible_all(&u->peer_wait);
785 sk->sk_max_ack_backlog = backlog;
786 WRITE_ONCE(sk->sk_state, TCP_LISTEN);
787
788 /* set credentials so connect can copy them */
789 update_peercred(sk);
790 err = 0;
791
792 out_unlock:
793 unix_state_unlock(sk);
794 out:
795 return err;
796 }
797
798 static int unix_release(struct socket *);
799 static int unix_bind(struct socket *, struct sockaddr *, int);
800 static int unix_stream_connect(struct socket *, struct sockaddr *,
801 int addr_len, int flags);
802 static int unix_socketpair(struct socket *, struct socket *);
803 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
804 static int unix_getname(struct socket *, struct sockaddr *, int);
805 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
806 static __poll_t unix_dgram_poll(struct file *, struct socket *,
807 poll_table *);
808 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
809 #ifdef CONFIG_COMPAT
810 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
811 #endif
812 static int unix_shutdown(struct socket *, int);
813 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
814 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
815 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
816 struct pipe_inode_info *, size_t size,
817 unsigned int flags);
818 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
819 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
820 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
821 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
822 static int unix_dgram_connect(struct socket *, struct sockaddr *,
823 int, int);
824 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
825 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
826 int);
827
828 #ifdef CONFIG_PROC_FS
829 static int unix_count_nr_fds(struct sock *sk)
830 {
831 struct sk_buff *skb;
832 struct unix_sock *u;
833 int nr_fds = 0;
834
835 spin_lock(&sk->sk_receive_queue.lock);
836 skb = skb_peek(&sk->sk_receive_queue);
837 while (skb) {
838 u = unix_sk(skb->sk);
839 nr_fds += atomic_read(&u->scm_stat.nr_fds);
840 skb = skb_peek_next(skb, &sk->sk_receive_queue);
841 }
842 spin_unlock(&sk->sk_receive_queue.lock);
843
844 return nr_fds;
845 }
846
847 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
848 {
849 struct sock *sk = sock->sk;
850 unsigned char s_state;
851 struct unix_sock *u;
852 int nr_fds = 0;
853
854 if (sk) {
855 s_state = READ_ONCE(sk->sk_state);
856 u = unix_sk(sk);
857
858 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
859 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
860 * SOCK_DGRAM is ordinary. So, no lock is needed.
861 */
862 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
863 nr_fds = atomic_read(&u->scm_stat.nr_fds);
864 else if (s_state == TCP_LISTEN)
865 nr_fds = unix_count_nr_fds(sk);
866
867 seq_printf(m, "scm_fds: %u\n", nr_fds);
868 }
869 }
870 #else
871 #define unix_show_fdinfo NULL
872 #endif
873
874 static const struct proto_ops unix_stream_ops = {
875 .family = PF_UNIX,
876 .owner = THIS_MODULE,
877 .release = unix_release,
878 .bind = unix_bind,
879 .connect = unix_stream_connect,
880 .socketpair = unix_socketpair,
881 .accept = unix_accept,
882 .getname = unix_getname,
883 .poll = unix_poll,
884 .ioctl = unix_ioctl,
885 #ifdef CONFIG_COMPAT
886 .compat_ioctl = unix_compat_ioctl,
887 #endif
888 .listen = unix_listen,
889 .shutdown = unix_shutdown,
890 .sendmsg = unix_stream_sendmsg,
891 .recvmsg = unix_stream_recvmsg,
892 .read_skb = unix_stream_read_skb,
893 .mmap = sock_no_mmap,
894 .splice_read = unix_stream_splice_read,
895 .set_peek_off = sk_set_peek_off,
896 .show_fdinfo = unix_show_fdinfo,
897 };
898
899 static const struct proto_ops unix_dgram_ops = {
900 .family = PF_UNIX,
901 .owner = THIS_MODULE,
902 .release = unix_release,
903 .bind = unix_bind,
904 .connect = unix_dgram_connect,
905 .socketpair = unix_socketpair,
906 .accept = sock_no_accept,
907 .getname = unix_getname,
908 .poll = unix_dgram_poll,
909 .ioctl = unix_ioctl,
910 #ifdef CONFIG_COMPAT
911 .compat_ioctl = unix_compat_ioctl,
912 #endif
913 .listen = sock_no_listen,
914 .shutdown = unix_shutdown,
915 .sendmsg = unix_dgram_sendmsg,
916 .read_skb = unix_read_skb,
917 .recvmsg = unix_dgram_recvmsg,
918 .mmap = sock_no_mmap,
919 .set_peek_off = sk_set_peek_off,
920 .show_fdinfo = unix_show_fdinfo,
921 };
922
923 static const struct proto_ops unix_seqpacket_ops = {
924 .family = PF_UNIX,
925 .owner = THIS_MODULE,
926 .release = unix_release,
927 .bind = unix_bind,
928 .connect = unix_stream_connect,
929 .socketpair = unix_socketpair,
930 .accept = unix_accept,
931 .getname = unix_getname,
932 .poll = unix_dgram_poll,
933 .ioctl = unix_ioctl,
934 #ifdef CONFIG_COMPAT
935 .compat_ioctl = unix_compat_ioctl,
936 #endif
937 .listen = unix_listen,
938 .shutdown = unix_shutdown,
939 .sendmsg = unix_seqpacket_sendmsg,
940 .recvmsg = unix_seqpacket_recvmsg,
941 .mmap = sock_no_mmap,
942 .set_peek_off = sk_set_peek_off,
943 .show_fdinfo = unix_show_fdinfo,
944 };
945
946 static void unix_close(struct sock *sk, long timeout)
947 {
948 /* Nothing to do here, unix socket does not need a ->close().
949 * This is merely for sockmap.
950 */
951 }
952
953 static void unix_unhash(struct sock *sk)
954 {
955 /* Nothing to do here, unix socket does not need a ->unhash().
956 * This is merely for sockmap.
957 */
958 }
959
960 static bool unix_bpf_bypass_getsockopt(int level, int optname)
961 {
962 if (level == SOL_SOCKET) {
963 switch (optname) {
964 case SO_PEERPIDFD:
965 return true;
966 default:
967 return false;
968 }
969 }
970
971 return false;
972 }
973
974 struct proto unix_dgram_proto = {
975 .name = "UNIX",
976 .owner = THIS_MODULE,
977 .obj_size = sizeof(struct unix_sock),
978 .close = unix_close,
979 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
980 #ifdef CONFIG_BPF_SYSCALL
981 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
982 #endif
983 };
984
985 struct proto unix_stream_proto = {
986 .name = "UNIX-STREAM",
987 .owner = THIS_MODULE,
988 .obj_size = sizeof(struct unix_sock),
989 .close = unix_close,
990 .unhash = unix_unhash,
991 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
992 #ifdef CONFIG_BPF_SYSCALL
993 .psock_update_sk_prot = unix_stream_bpf_update_proto,
994 #endif
995 };
996
997 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
998 {
999 struct unix_sock *u;
1000 struct sock *sk;
1001 int err;
1002
1003 atomic_long_inc(&unix_nr_socks);
1004 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1005 err = -ENFILE;
1006 goto err;
1007 }
1008
1009 if (type == SOCK_STREAM)
1010 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1011 else /*dgram and seqpacket */
1012 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1013
1014 if (!sk) {
1015 err = -ENOMEM;
1016 goto err;
1017 }
1018
1019 sock_init_data(sock, sk);
1020
1021 sk->sk_hash = unix_unbound_hash(sk);
1022 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
1023 sk->sk_write_space = unix_write_space;
1024 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1025 sk->sk_destruct = unix_sock_destructor;
1026 lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1027
1028 u = unix_sk(sk);
1029 u->listener = NULL;
1030 u->vertex = NULL;
1031 u->path.dentry = NULL;
1032 u->path.mnt = NULL;
1033 spin_lock_init(&u->lock);
1034 lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1035 mutex_init(&u->iolock); /* single task reading lock */
1036 mutex_init(&u->bindlock); /* single task binding lock */
1037 init_waitqueue_head(&u->peer_wait);
1038 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1039 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1040 unix_insert_unbound_socket(net, sk);
1041
1042 sock_prot_inuse_add(net, sk->sk_prot, 1);
1043
1044 return sk;
1045
1046 err:
1047 atomic_long_dec(&unix_nr_socks);
1048 return ERR_PTR(err);
1049 }
1050
1051 static int unix_create(struct net *net, struct socket *sock, int protocol,
1052 int kern)
1053 {
1054 struct sock *sk;
1055
1056 if (protocol && protocol != PF_UNIX)
1057 return -EPROTONOSUPPORT;
1058
1059 sock->state = SS_UNCONNECTED;
1060
1061 switch (sock->type) {
1062 case SOCK_STREAM:
1063 sock->ops = &unix_stream_ops;
1064 break;
1065 /*
1066 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1067 * nothing uses it.
1068 */
1069 case SOCK_RAW:
1070 sock->type = SOCK_DGRAM;
1071 fallthrough;
1072 case SOCK_DGRAM:
1073 sock->ops = &unix_dgram_ops;
1074 break;
1075 case SOCK_SEQPACKET:
1076 sock->ops = &unix_seqpacket_ops;
1077 break;
1078 default:
1079 return -ESOCKTNOSUPPORT;
1080 }
1081
1082 sk = unix_create1(net, sock, kern, sock->type);
1083 if (IS_ERR(sk))
1084 return PTR_ERR(sk);
1085
1086 return 0;
1087 }
1088
1089 static int unix_release(struct socket *sock)
1090 {
1091 struct sock *sk = sock->sk;
1092
1093 if (!sk)
1094 return 0;
1095
1096 sk->sk_prot->close(sk, 0);
1097 unix_release_sock(sk, 0);
1098 sock->sk = NULL;
1099
1100 return 0;
1101 }
1102
1103 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1104 int type)
1105 {
1106 struct inode *inode;
1107 struct path path;
1108 struct sock *sk;
1109 int err;
1110
1111 unix_mkname_bsd(sunaddr, addr_len);
1112 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1113 if (err)
1114 goto fail;
1115
1116 err = path_permission(&path, MAY_WRITE);
1117 if (err)
1118 goto path_put;
1119
1120 err = -ECONNREFUSED;
1121 inode = d_backing_inode(path.dentry);
1122 if (!S_ISSOCK(inode->i_mode))
1123 goto path_put;
1124
1125 sk = unix_find_socket_byinode(inode);
1126 if (!sk)
1127 goto path_put;
1128
1129 err = -EPROTOTYPE;
1130 if (sk->sk_type == type)
1131 touch_atime(&path);
1132 else
1133 goto sock_put;
1134
1135 path_put(&path);
1136
1137 return sk;
1138
1139 sock_put:
1140 sock_put(sk);
1141 path_put:
1142 path_put(&path);
1143 fail:
1144 return ERR_PTR(err);
1145 }
1146
1147 static struct sock *unix_find_abstract(struct net *net,
1148 struct sockaddr_un *sunaddr,
1149 int addr_len, int type)
1150 {
1151 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1152 struct dentry *dentry;
1153 struct sock *sk;
1154
1155 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1156 if (!sk)
1157 return ERR_PTR(-ECONNREFUSED);
1158
1159 dentry = unix_sk(sk)->path.dentry;
1160 if (dentry)
1161 touch_atime(&unix_sk(sk)->path);
1162
1163 return sk;
1164 }
1165
1166 static struct sock *unix_find_other(struct net *net,
1167 struct sockaddr_un *sunaddr,
1168 int addr_len, int type)
1169 {
1170 struct sock *sk;
1171
1172 if (sunaddr->sun_path[0])
1173 sk = unix_find_bsd(sunaddr, addr_len, type);
1174 else
1175 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1176
1177 return sk;
1178 }
1179
1180 static int unix_autobind(struct sock *sk)
1181 {
1182 struct unix_sock *u = unix_sk(sk);
1183 unsigned int new_hash, old_hash;
1184 struct net *net = sock_net(sk);
1185 struct unix_address *addr;
1186 u32 lastnum, ordernum;
1187 int err;
1188
1189 err = mutex_lock_interruptible(&u->bindlock);
1190 if (err)
1191 return err;
1192
1193 if (u->addr)
1194 goto out;
1195
1196 err = -ENOMEM;
1197 addr = kzalloc(sizeof(*addr) +
1198 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1199 if (!addr)
1200 goto out;
1201
1202 addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1203 addr->name->sun_family = AF_UNIX;
1204 refcount_set(&addr->refcnt, 1);
1205
1206 old_hash = sk->sk_hash;
1207 ordernum = get_random_u32();
1208 lastnum = ordernum & 0xFFFFF;
1209 retry:
1210 ordernum = (ordernum + 1) & 0xFFFFF;
1211 sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1212
1213 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1214 unix_table_double_lock(net, old_hash, new_hash);
1215
1216 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1217 unix_table_double_unlock(net, old_hash, new_hash);
1218
1219 /* __unix_find_socket_byname() may take a long time if many names
1220 * are already in use.
1221 */
1222 cond_resched();
1223
1224 if (ordernum == lastnum) {
1225 /* Give up if all names seem to be in use. */
1226 err = -ENOSPC;
1227 unix_release_addr(addr);
1228 goto out;
1229 }
1230
1231 goto retry;
1232 }
1233
1234 __unix_set_addr_hash(net, sk, addr, new_hash);
1235 unix_table_double_unlock(net, old_hash, new_hash);
1236 err = 0;
1237
1238 out: mutex_unlock(&u->bindlock);
1239 return err;
1240 }
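/* Hedged userspace sketch of what autobind produces: the socket ends up
 * bound to an abstract name of the form "\0XXXXX" (five hex digits). The
 * peer address below is assumed; SO_PASSCRED on an unbound socket is one
 * of the paths that triggers unix_autobind() on connect()/sendmsg():
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	int on = 1;
 *	struct sockaddr_un self;
 *	socklen_t len = sizeof(self);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	connect(fd, (struct sockaddr *)&peer, peer_len);
 *
 *	getsockname(fd, (struct sockaddr *)&self, &len);
 *	// self.sun_path[0] == '\0' followed by five hex digits, and len is
 *	// offsetof(struct sockaddr_un, sun_path) + 6, matching addr->len above.
 */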
1241
1242 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1243 int addr_len)
1244 {
1245 umode_t mode = S_IFSOCK |
1246 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1247 struct unix_sock *u = unix_sk(sk);
1248 unsigned int new_hash, old_hash;
1249 struct net *net = sock_net(sk);
1250 struct mnt_idmap *idmap;
1251 struct unix_address *addr;
1252 struct dentry *dentry;
1253 struct path parent;
1254 int err;
1255
1256 addr_len = unix_mkname_bsd(sunaddr, addr_len);
1257 addr = unix_create_addr(sunaddr, addr_len);
1258 if (!addr)
1259 return -ENOMEM;
1260
1261 /*
1262 * Get the parent directory, calculate the hash for last
1263 * component.
1264 */
1265 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1266 if (IS_ERR(dentry)) {
1267 err = PTR_ERR(dentry);
1268 goto out;
1269 }
1270
1271 /*
1272 * All right, let's create it.
1273 */
1274 idmap = mnt_idmap(parent.mnt);
1275 err = security_path_mknod(&parent, dentry, mode, 0);
1276 if (!err)
1277 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1278 if (err)
1279 goto out_path;
1280 err = mutex_lock_interruptible(&u->bindlock);
1281 if (err)
1282 goto out_unlink;
1283 if (u->addr)
1284 goto out_unlock;
1285
1286 old_hash = sk->sk_hash;
1287 new_hash = unix_bsd_hash(d_backing_inode(dentry));
1288 unix_table_double_lock(net, old_hash, new_hash);
1289 u->path.mnt = mntget(parent.mnt);
1290 u->path.dentry = dget(dentry);
1291 __unix_set_addr_hash(net, sk, addr, new_hash);
1292 unix_table_double_unlock(net, old_hash, new_hash);
1293 unix_insert_bsd_socket(sk);
1294 mutex_unlock(&u->bindlock);
1295 done_path_create(&parent, dentry);
1296 return 0;
1297
1298 out_unlock:
1299 mutex_unlock(&u->bindlock);
1300 err = -EINVAL;
1301 out_unlink:
1302 /* failed after successful mknod? unlink what we'd created... */
1303 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1304 out_path:
1305 done_path_create(&parent, dentry);
1306 out:
1307 unix_release_addr(addr);
1308 return err == -EEXIST ? -EADDRINUSE : err;
1309 }
1310
1311 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1312 int addr_len)
1313 {
1314 struct unix_sock *u = unix_sk(sk);
1315 unsigned int new_hash, old_hash;
1316 struct net *net = sock_net(sk);
1317 struct unix_address *addr;
1318 int err;
1319
1320 addr = unix_create_addr(sunaddr, addr_len);
1321 if (!addr)
1322 return -ENOMEM;
1323
1324 err = mutex_lock_interruptible(&u->bindlock);
1325 if (err)
1326 goto out;
1327
1328 if (u->addr) {
1329 err = -EINVAL;
1330 goto out_mutex;
1331 }
1332
1333 old_hash = sk->sk_hash;
1334 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1335 unix_table_double_lock(net, old_hash, new_hash);
1336
1337 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1338 goto out_spin;
1339
1340 __unix_set_addr_hash(net, sk, addr, new_hash);
1341 unix_table_double_unlock(net, old_hash, new_hash);
1342 mutex_unlock(&u->bindlock);
1343 return 0;
1344
1345 out_spin:
1346 unix_table_double_unlock(net, old_hash, new_hash);
1347 err = -EADDRINUSE;
1348 out_mutex:
1349 mutex_unlock(&u->bindlock);
1350 out:
1351 unix_release_addr(addr);
1352 return err;
1353 }
1354
1355 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1356 {
1357 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1358 struct sock *sk = sock->sk;
1359 int err;
1360
1361 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1362 sunaddr->sun_family == AF_UNIX)
1363 return unix_autobind(sk);
1364
1365 err = unix_validate_addr(sunaddr, addr_len);
1366 if (err)
1367 return err;
1368
1369 if (sunaddr->sun_path[0])
1370 err = unix_bind_bsd(sk, sunaddr, addr_len);
1371 else
1372 err = unix_bind_abstract(sk, sunaddr, addr_len);
1373
1374 return err;
1375 }
1376
1377 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1378 {
1379 if (unlikely(sk1 == sk2) || !sk2) {
1380 unix_state_lock(sk1);
1381 return;
1382 }
1383
1384 if (sk1 > sk2)
1385 swap(sk1, sk2);
1386
1387 unix_state_lock(sk1);
1388 unix_state_lock(sk2);
1389 }
1390
1391 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1392 {
1393 if (unlikely(sk1 == sk2) || !sk2) {
1394 unix_state_unlock(sk1);
1395 return;
1396 }
1397 unix_state_unlock(sk1);
1398 unix_state_unlock(sk2);
1399 }
1400
1401 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1402 int alen, int flags)
1403 {
1404 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1405 struct sock *sk = sock->sk;
1406 struct sock *other;
1407 int err;
1408
1409 err = -EINVAL;
1410 if (alen < offsetofend(struct sockaddr, sa_family))
1411 goto out;
1412
1413 if (addr->sa_family != AF_UNSPEC) {
1414 err = unix_validate_addr(sunaddr, alen);
1415 if (err)
1416 goto out;
1417
1418 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1419 if (err)
1420 goto out;
1421
1422 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1423 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1424 !READ_ONCE(unix_sk(sk)->addr)) {
1425 err = unix_autobind(sk);
1426 if (err)
1427 goto out;
1428 }
1429
1430 restart:
1431 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1432 if (IS_ERR(other)) {
1433 err = PTR_ERR(other);
1434 goto out;
1435 }
1436
1437 unix_state_double_lock(sk, other);
1438
1439 /* Apparently VFS overslept socket death. Retry. */
1440 if (sock_flag(other, SOCK_DEAD)) {
1441 unix_state_double_unlock(sk, other);
1442 sock_put(other);
1443 goto restart;
1444 }
1445
1446 err = -EPERM;
1447 if (!unix_may_send(sk, other))
1448 goto out_unlock;
1449
1450 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1451 if (err)
1452 goto out_unlock;
1453
1454 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1455 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1456 } else {
1457 /*
1458 * 1003.1g breaking connected state with AF_UNSPEC
1459 */
1460 other = NULL;
1461 unix_state_double_lock(sk, other);
1462 }
1463
1464 /*
1465 * If it was connected, reconnect.
1466 */
1467 if (unix_peer(sk)) {
1468 struct sock *old_peer = unix_peer(sk);
1469
1470 unix_peer(sk) = other;
1471 if (!other)
1472 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1473 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1474
1475 unix_state_double_unlock(sk, other);
1476
1477 if (other != old_peer) {
1478 unix_dgram_disconnected(sk, old_peer);
1479
1480 unix_state_lock(old_peer);
1481 if (!unix_peer(old_peer))
1482 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1483 unix_state_unlock(old_peer);
1484 }
1485
1486 sock_put(old_peer);
1487 } else {
1488 unix_peer(sk) = other;
1489 unix_state_double_unlock(sk, other);
1490 }
1491
1492 return 0;
1493
1494 out_unlock:
1495 unix_state_double_unlock(sk, other);
1496 sock_put(other);
1497 out:
1498 return err;
1499 }
1500
1501 static long unix_wait_for_peer(struct sock *other, long timeo)
1502 {
1503 struct unix_sock *u = unix_sk(other);
1504 int sched;
1505 DEFINE_WAIT(wait);
1506
1507 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1508
1509 sched = !sock_flag(other, SOCK_DEAD) &&
1510 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1511 unix_recvq_full_lockless(other);
1512
1513 unix_state_unlock(other);
1514
1515 if (sched)
1516 timeo = schedule_timeout(timeo);
1517
1518 finish_wait(&u->peer_wait, &wait);
1519 return timeo;
1520 }
1521
1522 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1523 int addr_len, int flags)
1524 {
1525 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1526 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1527 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1528 struct net *net = sock_net(sk);
1529 struct sk_buff *skb = NULL;
1530 unsigned char state;
1531 long timeo;
1532 int err;
1533
1534 err = unix_validate_addr(sunaddr, addr_len);
1535 if (err)
1536 goto out;
1537
1538 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1539 if (err)
1540 goto out;
1541
1542 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1543 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1544 !READ_ONCE(u->addr)) {
1545 err = unix_autobind(sk);
1546 if (err)
1547 goto out;
1548 }
1549
1550 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1551
1552 /* First of all allocate resources.
1553 * If we did this after the state is locked,
1554 * we would have to recheck everything again in any case.
1555 */
1556
1557 /* create new sock for complete connection */
1558 newsk = unix_create1(net, NULL, 0, sock->type);
1559 if (IS_ERR(newsk)) {
1560 err = PTR_ERR(newsk);
1561 goto out;
1562 }
1563
1564 /* Allocate skb for sending to listening sock */
1565 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1566 if (!skb) {
1567 err = -ENOMEM;
1568 goto out_free_sk;
1569 }
1570
1571 restart:
1572 /* Find listening sock. */
1573 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1574 if (IS_ERR(other)) {
1575 err = PTR_ERR(other);
1576 goto out_free_skb;
1577 }
1578
1579 unix_state_lock(other);
1580
1581 /* Apparently VFS overslept socket death. Retry. */
1582 if (sock_flag(other, SOCK_DEAD)) {
1583 unix_state_unlock(other);
1584 sock_put(other);
1585 goto restart;
1586 }
1587
1588 if (other->sk_state != TCP_LISTEN ||
1589 other->sk_shutdown & RCV_SHUTDOWN) {
1590 err = -ECONNREFUSED;
1591 goto out_unlock;
1592 }
1593
1594 if (unix_recvq_full_lockless(other)) {
1595 if (!timeo) {
1596 err = -EAGAIN;
1597 goto out_unlock;
1598 }
1599
1600 timeo = unix_wait_for_peer(other, timeo);
1601 sock_put(other);
1602
1603 err = sock_intr_errno(timeo);
1604 if (signal_pending(current))
1605 goto out_free_skb;
1606
1607 goto restart;
1608 }
1609
1610 /* self connect and simultaneous connect are eliminated
1611 * by rejecting TCP_LISTEN socket to avoid deadlock.
1612 */
1613 state = READ_ONCE(sk->sk_state);
1614 if (unlikely(state != TCP_CLOSE)) {
1615 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1616 goto out_unlock;
1617 }
1618
1619 unix_state_lock(sk);
1620
1621 if (unlikely(sk->sk_state != TCP_CLOSE)) {
1622 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1623 unix_state_unlock(sk);
1624 goto out_unlock;
1625 }
1626
1627 err = security_unix_stream_connect(sk, other, newsk);
1628 if (err) {
1629 unix_state_unlock(sk);
1630 goto out_unlock;
1631 }
1632
1633 /* The way is open! Quickly set all the necessary fields... */
1634
1635 sock_hold(sk);
1636 unix_peer(newsk) = sk;
1637 newsk->sk_state = TCP_ESTABLISHED;
1638 newsk->sk_type = sk->sk_type;
1639 init_peercred(newsk);
1640 newu = unix_sk(newsk);
1641 newu->listener = other;
1642 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1643 otheru = unix_sk(other);
1644
1645 /* copy address information from listening to new sock
1646 *
1647 * The contents of *(otheru->addr) and otheru->path
1648 * are seen fully set up here, since we have found
1649 * otheru in hash under its lock. Insertion into the
1650 * hash chain we'd found it in had been done in an
1651 * earlier critical area protected by the chain's lock,
1652 * the same one where we'd set *(otheru->addr) contents,
1653 * as well as otheru->path and otheru->addr itself.
1654 *
1655 * Using smp_store_release() here to set newu->addr
1656 * is enough to make those stores, as well as stores
1657 * to newu->path visible to anyone who gets newu->addr
1658 * by smp_load_acquire(). IOW, the same guarantees
1659 * as for unix_sock instances bound in unix_bind() or
1660 * in unix_autobind().
1661 */
1662 if (otheru->path.dentry) {
1663 path_get(&otheru->path);
1664 newu->path = otheru->path;
1665 }
1666 refcount_inc(&otheru->addr->refcnt);
1667 smp_store_release(&newu->addr, otheru->addr);
1668
1669 /* Set credentials */
1670 copy_peercred(sk, other);
1671
1672 sock->state = SS_CONNECTED;
1673 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1674 sock_hold(newsk);
1675
1676 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1677 unix_peer(sk) = newsk;
1678
1679 unix_state_unlock(sk);
1680
1681 /* take ten and send info to listening sock */
1682 spin_lock(&other->sk_receive_queue.lock);
1683 __skb_queue_tail(&other->sk_receive_queue, skb);
1684 spin_unlock(&other->sk_receive_queue.lock);
1685 unix_state_unlock(other);
1686 other->sk_data_ready(other);
1687 sock_put(other);
1688 return 0;
1689
1690 out_unlock:
1691 unix_state_unlock(other);
1692 sock_put(other);
1693 out_free_skb:
1694 consume_skb(skb);
1695 out_free_sk:
1696 unix_release_sock(newsk, 0);
1697 out:
1698 return err;
1699 }
1700
1701 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1702 {
1703 struct sock *ska = socka->sk, *skb = sockb->sk;
1704
1705 /* Join our sockets back to back */
1706 sock_hold(ska);
1707 sock_hold(skb);
1708 unix_peer(ska) = skb;
1709 unix_peer(skb) = ska;
1710 init_peercred(ska);
1711 init_peercred(skb);
1712
1713 ska->sk_state = TCP_ESTABLISHED;
1714 skb->sk_state = TCP_ESTABLISHED;
1715 socka->state = SS_CONNECTED;
1716 sockb->state = SS_CONNECTED;
1717 return 0;
1718 }
1719
1720 static void unix_sock_inherit_flags(const struct socket *old,
1721 struct socket *new)
1722 {
1723 if (test_bit(SOCK_PASSCRED, &old->flags))
1724 set_bit(SOCK_PASSCRED, &new->flags);
1725 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1726 set_bit(SOCK_PASSPIDFD, &new->flags);
1727 if (test_bit(SOCK_PASSSEC, &old->flags))
1728 set_bit(SOCK_PASSSEC, &new->flags);
1729 }
1730
1731 static int unix_accept(struct socket *sock, struct socket *newsock,
1732 struct proto_accept_arg *arg)
1733 {
1734 struct sock *sk = sock->sk;
1735 struct sk_buff *skb;
1736 struct sock *tsk;
1737
1738 arg->err = -EOPNOTSUPP;
1739 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1740 goto out;
1741
1742 arg->err = -EINVAL;
1743 if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1744 goto out;
1745
1746 /* If socket state is TCP_LISTEN it cannot change (for now...),
1747 * so that no locks are necessary.
1748 */
1749
1750 skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1751 &arg->err);
1752 if (!skb) {
1753 /* This means receive shutdown. */
1754 if (arg->err == 0)
1755 arg->err = -EINVAL;
1756 goto out;
1757 }
1758
1759 tsk = skb->sk;
1760 skb_free_datagram(sk, skb);
1761 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1762
1763 /* attach accepted sock to socket */
1764 unix_state_lock(tsk);
1765 unix_update_edges(unix_sk(tsk));
1766 newsock->state = SS_CONNECTED;
1767 unix_sock_inherit_flags(sock, newsock);
1768 sock_graft(tsk, newsock);
1769 unix_state_unlock(tsk);
1770 return 0;
1771
1772 out:
1773 return arg->err;
1774 }
1775
1776
1777 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1778 {
1779 struct sock *sk = sock->sk;
1780 struct unix_address *addr;
1781 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1782 int err = 0;
1783
1784 if (peer) {
1785 sk = unix_peer_get(sk);
1786
1787 err = -ENOTCONN;
1788 if (!sk)
1789 goto out;
1790 err = 0;
1791 } else {
1792 sock_hold(sk);
1793 }
1794
1795 addr = smp_load_acquire(&unix_sk(sk)->addr);
1796 if (!addr) {
1797 sunaddr->sun_family = AF_UNIX;
1798 sunaddr->sun_path[0] = 0;
1799 err = offsetof(struct sockaddr_un, sun_path);
1800 } else {
1801 err = addr->len;
1802 memcpy(sunaddr, addr->name, addr->len);
1803
1804 if (peer)
1805 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1806 CGROUP_UNIX_GETPEERNAME);
1807 else
1808 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1809 CGROUP_UNIX_GETSOCKNAME);
1810 }
1811 sock_put(sk);
1812 out:
1813 return err;
1814 }
1815
1816 /* The "user->unix_inflight" variable is protected by the garbage
1817 * collection lock, and we just read it locklessly here. If you go
1818 * over the limit, there might be a tiny race in actually noticing
1819 * it across threads. Tough.
1820 */
1821 static inline bool too_many_unix_fds(struct task_struct *p)
1822 {
1823 struct user_struct *user = current_user();
1824
1825 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1826 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1827 return false;
1828 }
1829
1830 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1831 {
1832 if (too_many_unix_fds(current))
1833 return -ETOOMANYREFS;
1834
1835 UNIXCB(skb).fp = scm->fp;
1836 scm->fp = NULL;
1837
1838 if (unix_prepare_fpl(UNIXCB(skb).fp))
1839 return -ENOMEM;
1840
1841 return 0;
1842 }
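/* Illustrative only, not part of this file's build: a minimal userspace
 * sendmsg() that attaches a descriptor with SCM_RIGHTS, which is what ends
 * up in UNIXCB(skb).fp via unix_attach_fds() above.  The names fd_to_pass
 * and sock_fd are assumed valid descriptors; error handling is omitted.
 *
 *	char data = 'x';
 *	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *	union { char buf[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = u.buf,
 *			      .msg_controllen = sizeof(u.buf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *	sendmsg(sock_fd, &msg, 0);
 */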
1843
1844 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1845 {
1846 scm->fp = UNIXCB(skb).fp;
1847 UNIXCB(skb).fp = NULL;
1848
1849 unix_destroy_fpl(scm->fp);
1850 }
1851
1852 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1853 {
1854 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1855 }
1856
1857 static void unix_destruct_scm(struct sk_buff *skb)
1858 {
1859 struct scm_cookie scm;
1860
1861 memset(&scm, 0, sizeof(scm));
1862 scm.pid = UNIXCB(skb).pid;
1863 if (UNIXCB(skb).fp)
1864 unix_detach_fds(&scm, skb);
1865
1866 /* Alas, it calls VFS */
1867 /* So fscking what? fput() had been SMP-safe since the last Summer */
1868 scm_destroy(&scm);
1869 sock_wfree(skb);
1870 }
1871
1872 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1873 {
1874 int err = 0;
1875
1876 UNIXCB(skb).pid = get_pid(scm->pid);
1877 UNIXCB(skb).uid = scm->creds.uid;
1878 UNIXCB(skb).gid = scm->creds.gid;
1879 UNIXCB(skb).fp = NULL;
1880 unix_get_secdata(scm, skb);
1881 if (scm->fp && send_fds)
1882 err = unix_attach_fds(scm, skb);
1883
1884 skb->destructor = unix_destruct_scm;
1885 return err;
1886 }
1887
1888 static bool unix_passcred_enabled(const struct socket *sock,
1889 const struct sock *other)
1890 {
1891 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1892 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1893 !other->sk_socket ||
1894 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1895 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1896 }
1897
1898 /*
1899 * Some apps rely on write() giving SCM_CREDENTIALS
1900 * We include credentials if source or destination socket
1901 * asserted SOCK_PASSCRED.
1902 */
1903 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1904 const struct sock *other)
1905 {
1906 if (UNIXCB(skb).pid)
1907 return;
1908 if (unix_passcred_enabled(sock, other)) {
1909 UNIXCB(skb).pid = get_pid(task_tgid(current));
1910 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1911 }
1912 }
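/* Illustrative only, not kernel code: how a receiver asks for the
 * credentials that maybe_add_creds() attaches.  Assumes a connected
 * AF_UNIX descriptor sock_fd; error handling omitted.
 *
 *	int on = 1;
 *
 *	setsockopt(sock_fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	// A later recvmsg() then carries an SCM_CREDENTIALS cmsg holding
 *	// struct ucred { pid_t pid; uid_t uid; gid_t gid; }.
 */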
1913
1914 static bool unix_skb_scm_eq(struct sk_buff *skb,
1915 struct scm_cookie *scm)
1916 {
1917 return UNIXCB(skb).pid == scm->pid &&
1918 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1919 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1920 unix_secdata_eq(scm, skb);
1921 }
1922
1923 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1924 {
1925 struct scm_fp_list *fp = UNIXCB(skb).fp;
1926 struct unix_sock *u = unix_sk(sk);
1927
1928 if (unlikely(fp && fp->count)) {
1929 atomic_add(fp->count, &u->scm_stat.nr_fds);
1930 unix_add_edges(fp, u);
1931 }
1932 }
1933
1934 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1935 {
1936 struct scm_fp_list *fp = UNIXCB(skb).fp;
1937 struct unix_sock *u = unix_sk(sk);
1938
1939 if (unlikely(fp && fp->count)) {
1940 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1941 unix_del_edges(fp);
1942 }
1943 }
1944
1945 /*
1946 * Send AF_UNIX data.
1947 */
1948
1949 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1950 size_t len)
1951 {
1952 struct sock *sk = sock->sk, *other = NULL;
1953 struct unix_sock *u = unix_sk(sk);
1954 struct scm_cookie scm;
1955 struct sk_buff *skb;
1956 int data_len = 0;
1957 int sk_locked;
1958 long timeo;
1959 int err;
1960
1961 err = scm_send(sock, msg, &scm, false);
1962 if (err < 0)
1963 return err;
1964
1965 wait_for_unix_gc(scm.fp);
1966
1967 if (msg->msg_flags & MSG_OOB) {
1968 err = -EOPNOTSUPP;
1969 goto out;
1970 }
1971
1972 if (msg->msg_namelen) {
1973 err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
1974 if (err)
1975 goto out;
1976
1977 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1978 msg->msg_name,
1979 &msg->msg_namelen,
1980 NULL);
1981 if (err)
1982 goto out;
1983 }
1984
1985 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1986 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1987 !READ_ONCE(u->addr)) {
1988 err = unix_autobind(sk);
1989 if (err)
1990 goto out;
1991 }
1992
1993 if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
1994 err = -EMSGSIZE;
1995 goto out;
1996 }
1997
1998 if (len > SKB_MAX_ALLOC) {
1999 data_len = min_t(size_t,
2000 len - SKB_MAX_ALLOC,
2001 MAX_SKB_FRAGS * PAGE_SIZE);
2002 data_len = PAGE_ALIGN(data_len);
2003
2004 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2005 }
2006
2007 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2008 msg->msg_flags & MSG_DONTWAIT, &err,
2009 PAGE_ALLOC_COSTLY_ORDER);
2010 if (!skb)
2011 goto out;
2012
2013 err = unix_scm_to_skb(&scm, skb, true);
2014 if (err < 0)
2015 goto out_free;
2016
2017 skb_put(skb, len - data_len);
2018 skb->data_len = data_len;
2019 skb->len = len;
2020 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2021 if (err)
2022 goto out_free;
2023
2024 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2025
2026 if (msg->msg_namelen) {
2027 lookup:
2028 other = unix_find_other(sock_net(sk), msg->msg_name,
2029 msg->msg_namelen, sk->sk_type);
2030 if (IS_ERR(other)) {
2031 err = PTR_ERR(other);
2032 goto out_free;
2033 }
2034 } else {
2035 other = unix_peer_get(sk);
2036 if (!other) {
2037 err = -ENOTCONN;
2038 goto out_free;
2039 }
2040 }
2041
2042 if (sk_filter(other, skb) < 0) {
2043 /* Toss the packet but do not return any error to the sender */
2044 err = len;
2045 goto out_sock_put;
2046 }
2047
2048 restart:
2049 sk_locked = 0;
2050 unix_state_lock(other);
2051 restart_locked:
2052
2053 if (!unix_may_send(sk, other)) {
2054 err = -EPERM;
2055 goto out_unlock;
2056 }
2057
2058 if (unlikely(sock_flag(other, SOCK_DEAD))) {
2059 /* Check with 1003.1g - what should a datagram error return? */
2060
2061 unix_state_unlock(other);
2062
2063 if (sk->sk_type == SOCK_SEQPACKET) {
2064 /* We are here only when racing with unix_release_sock(),
2065 * which is clearing @other. Never change the state to
2066 * TCP_CLOSE here, unlike the SOCK_DGRAM path does.
2067 */
2068 err = -EPIPE;
2069 goto out_sock_put;
2070 }
2071
2072 if (!sk_locked)
2073 unix_state_lock(sk);
2074
2075 if (unix_peer(sk) == other) {
2076 unix_peer(sk) = NULL;
2077 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2078
2079 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2080 unix_state_unlock(sk);
2081
2082 unix_dgram_disconnected(sk, other);
2083 sock_put(other);
2084 err = -ECONNREFUSED;
2085 goto out_sock_put;
2086 }
2087
2088 unix_state_unlock(sk);
2089
2090 if (!msg->msg_namelen) {
2091 err = -ECONNRESET;
2092 goto out_sock_put;
2093 }
2094
2095 sock_put(other);
2096 goto lookup;
2097 }
2098
2099 if (other->sk_shutdown & RCV_SHUTDOWN) {
2100 err = -EPIPE;
2101 goto out_unlock;
2102 }
2103
2104 if (sk->sk_type != SOCK_SEQPACKET) {
2105 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2106 if (err)
2107 goto out_unlock;
2108 }
2109
2110 /* other == sk && unix_peer(other) != sk if
2111 * - unix_peer(sk) == NULL, destination address bound to sk
2112 * - unix_peer(sk) == sk by time of get but disconnected before lock
2113 */
2114 if (other != sk &&
2115 unlikely(unix_peer(other) != sk &&
2116 unix_recvq_full_lockless(other))) {
2117 if (timeo) {
2118 timeo = unix_wait_for_peer(other, timeo);
2119
2120 err = sock_intr_errno(timeo);
2121 if (signal_pending(current))
2122 goto out_sock_put;
2123
2124 goto restart;
2125 }
2126
2127 if (!sk_locked) {
2128 unix_state_unlock(other);
2129 unix_state_double_lock(sk, other);
2130 }
2131
2132 if (unix_peer(sk) != other ||
2133 unix_dgram_peer_wake_me(sk, other)) {
2134 err = -EAGAIN;
2135 sk_locked = 1;
2136 goto out_unlock;
2137 }
2138
2139 if (!sk_locked) {
2140 sk_locked = 1;
2141 goto restart_locked;
2142 }
2143 }
2144
2145 if (unlikely(sk_locked))
2146 unix_state_unlock(sk);
2147
2148 if (sock_flag(other, SOCK_RCVTSTAMP))
2149 __net_timestamp(skb);
2150 maybe_add_creds(skb, sock, other);
2151 scm_stat_add(other, skb);
2152 skb_queue_tail(&other->sk_receive_queue, skb);
2153 unix_state_unlock(other);
2154 other->sk_data_ready(other);
2155 sock_put(other);
2156 scm_destroy(&scm);
2157 return len;
2158
2159 out_unlock:
2160 if (sk_locked)
2161 unix_state_unlock(sk);
2162 unix_state_unlock(other);
2163 out_sock_put:
2164 sock_put(other);
2165 out_free:
2166 consume_skb(skb);
2167 out:
2168 scm_destroy(&scm);
2169 return err;
2170 }
2171
2172 /* We use paged skbs for stream sockets, limiting occupancy to 32768
2173 * bytes with a minimum of a full page.
2174 */
2175 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
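/* Worked example (informational): with 4 KiB pages, get_order(32768) is 3,
 * so UNIX_SKB_FRAGS_SZ is 4096 << 3 == 32768 bytes (eight pages); with
 * 64 KiB pages, get_order(32768) is 0 and the limit rounds up to a single
 * 65536-byte page, honouring the "minimum of a full page" rule above.
 */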
2176
2177 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2178 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2179 struct scm_cookie *scm, bool fds_sent)
2180 {
2181 struct unix_sock *ousk = unix_sk(other);
2182 struct sk_buff *skb;
2183 int err;
2184
2185 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2186
2187 if (!skb)
2188 return err;
2189
2190 err = unix_scm_to_skb(scm, skb, !fds_sent);
2191 if (err < 0)
2192 goto out;
2193
2194 skb_put(skb, 1);
2195 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2196
2197 if (err)
2198 goto out;
2199
2200 unix_state_lock(other);
2201
2202 if (sock_flag(other, SOCK_DEAD) ||
2203 (other->sk_shutdown & RCV_SHUTDOWN)) {
2204 unix_state_unlock(other);
2205 err = -EPIPE;
2206 goto out;
2207 }
2208
2209 maybe_add_creds(skb, sock, other);
2210 scm_stat_add(other, skb);
2211
2212 spin_lock(&other->sk_receive_queue.lock);
2213 WRITE_ONCE(ousk->oob_skb, skb);
2214 __skb_queue_tail(&other->sk_receive_queue, skb);
2215 spin_unlock(&other->sk_receive_queue.lock);
2216
2217 sk_send_sigurg(other);
2218 unix_state_unlock(other);
2219 other->sk_data_ready(other);
2220
2221 return 0;
2222 out:
2223 consume_skb(skb);
2224 return err;
2225 }
2226 #endif
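/* Illustrative only, not kernel code: the userspace view of the
 * CONFIG_AF_UNIX_OOB path above.  The last byte of an MSG_OOB send is
 * the out-of-band byte queued by queue_oob(); assumes a connected
 * SOCK_STREAM pair sv[0]/sv[1] from socketpair().
 *
 *	char c;
 *
 *	send(sv[0], "ab", 2, MSG_OOB);	// 'b' becomes the OOB byte
 *	recv(sv[1], &c, 1, 0);		// reads 'a', stops at the mark
 *	recv(sv[1], &c, 1, MSG_OOB);	// reads 'b' via unix_stream_recv_urg()
 */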
2227
2228 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2229 size_t len)
2230 {
2231 struct sock *sk = sock->sk;
2232 struct sk_buff *skb = NULL;
2233 struct sock *other = NULL;
2234 struct scm_cookie scm;
2235 bool fds_sent = false;
2236 int err, sent = 0;
2237
2238 err = scm_send(sock, msg, &scm, false);
2239 if (err < 0)
2240 return err;
2241
2242 wait_for_unix_gc(scm.fp);
2243
2244 if (msg->msg_flags & MSG_OOB) {
2245 err = -EOPNOTSUPP;
2246 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2247 if (len)
2248 len--;
2249 else
2250 #endif
2251 goto out_err;
2252 }
2253
2254 if (msg->msg_namelen) {
2255 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2256 goto out_err;
2257 } else {
2258 other = unix_peer(sk);
2259 if (!other) {
2260 err = -ENOTCONN;
2261 goto out_err;
2262 }
2263 }
2264
2265 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2266 goto out_pipe;
2267
2268 while (sent < len) {
2269 int size = len - sent;
2270 int data_len;
2271
2272 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2273 skb = sock_alloc_send_pskb(sk, 0, 0,
2274 msg->msg_flags & MSG_DONTWAIT,
2275 &err, 0);
2276 } else {
2277 /* Keep two messages in the pipe so it schedules better */
2278 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2279
2280 /* allow fallback to order-0 allocations */
2281 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2282
2283 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2284
2285 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2286
2287 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2288 msg->msg_flags & MSG_DONTWAIT, &err,
2289 get_order(UNIX_SKB_FRAGS_SZ));
2290 }
2291 if (!skb)
2292 goto out_err;
2293
2294 /* Only send the fds in the first buffer */
2295 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2296 if (err < 0)
2297 goto out_free;
2298
2299 fds_sent = true;
2300
2301 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2302 skb->ip_summed = CHECKSUM_UNNECESSARY;
2303 err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2304 sk->sk_allocation);
2305 if (err < 0)
2306 goto out_free;
2307
2308 size = err;
2309 refcount_add(size, &sk->sk_wmem_alloc);
2310 } else {
2311 skb_put(skb, size - data_len);
2312 skb->data_len = data_len;
2313 skb->len = size;
2314 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2315 if (err)
2316 goto out_free;
2317 }
2318
2319 unix_state_lock(other);
2320
2321 if (sock_flag(other, SOCK_DEAD) ||
2322 (other->sk_shutdown & RCV_SHUTDOWN))
2323 goto out_pipe_unlock;
2324
2325 maybe_add_creds(skb, sock, other);
2326 scm_stat_add(other, skb);
2327 skb_queue_tail(&other->sk_receive_queue, skb);
2328 unix_state_unlock(other);
2329 other->sk_data_ready(other);
2330 sent += size;
2331 }
2332
2333 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2334 if (msg->msg_flags & MSG_OOB) {
2335 err = queue_oob(sock, msg, other, &scm, fds_sent);
2336 if (err)
2337 goto out_err;
2338 sent++;
2339 }
2340 #endif
2341
2342 scm_destroy(&scm);
2343
2344 return sent;
2345
2346 out_pipe_unlock:
2347 unix_state_unlock(other);
2348 out_pipe:
2349 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2350 send_sig(SIGPIPE, current, 0);
2351 err = -EPIPE;
2352 out_free:
2353 consume_skb(skb);
2354 out_err:
2355 scm_destroy(&scm);
2356 return sent ? : err;
2357 }
2358
2359 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2360 size_t len)
2361 {
2362 int err;
2363 struct sock *sk = sock->sk;
2364
2365 err = sock_error(sk);
2366 if (err)
2367 return err;
2368
2369 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2370 return -ENOTCONN;
2371
2372 if (msg->msg_namelen)
2373 msg->msg_namelen = 0;
2374
2375 return unix_dgram_sendmsg(sock, msg, len);
2376 }
2377
2378 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2379 size_t size, int flags)
2380 {
2381 struct sock *sk = sock->sk;
2382
2383 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2384 return -ENOTCONN;
2385
2386 return unix_dgram_recvmsg(sock, msg, size, flags);
2387 }
2388
2389 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2390 {
2391 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2392
2393 if (addr) {
2394 msg->msg_namelen = addr->len;
2395 memcpy(msg->msg_name, addr->name, addr->len);
2396 }
2397 }
2398
2399 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2400 int flags)
2401 {
2402 struct scm_cookie scm;
2403 struct socket *sock = sk->sk_socket;
2404 struct unix_sock *u = unix_sk(sk);
2405 struct sk_buff *skb, *last;
2406 long timeo;
2407 int skip;
2408 int err;
2409
2410 err = -EOPNOTSUPP;
2411 if (flags&MSG_OOB)
2412 goto out;
2413
2414 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2415
2416 do {
2417 mutex_lock(&u->iolock);
2418
2419 skip = sk_peek_offset(sk, flags);
2420 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2421 &skip, &err, &last);
2422 if (skb) {
2423 if (!(flags & MSG_PEEK))
2424 scm_stat_del(sk, skb);
2425 break;
2426 }
2427
2428 mutex_unlock(&u->iolock);
2429
2430 if (err != -EAGAIN)
2431 break;
2432 } while (timeo &&
2433 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2434 &err, &timeo, last));
2435
2436 if (!skb) { /* implies iolock unlocked */
2437 unix_state_lock(sk);
2438 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2439 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2440 (sk->sk_shutdown & RCV_SHUTDOWN))
2441 err = 0;
2442 unix_state_unlock(sk);
2443 goto out;
2444 }
2445
2446 if (wq_has_sleeper(&u->peer_wait))
2447 wake_up_interruptible_sync_poll(&u->peer_wait,
2448 EPOLLOUT | EPOLLWRNORM |
2449 EPOLLWRBAND);
2450
2451 if (msg->msg_name) {
2452 unix_copy_addr(msg, skb->sk);
2453
2454 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2455 msg->msg_name,
2456 &msg->msg_namelen);
2457 }
2458
2459 if (size > skb->len - skip)
2460 size = skb->len - skip;
2461 else if (size < skb->len - skip)
2462 msg->msg_flags |= MSG_TRUNC;
2463
2464 err = skb_copy_datagram_msg(skb, skip, msg, size);
2465 if (err)
2466 goto out_free;
2467
2468 if (sock_flag(sk, SOCK_RCVTSTAMP))
2469 __sock_recv_timestamp(msg, sk, skb);
2470
2471 memset(&scm, 0, sizeof(scm));
2472
2473 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2474 unix_set_secdata(&scm, skb);
2475
2476 if (!(flags & MSG_PEEK)) {
2477 if (UNIXCB(skb).fp)
2478 unix_detach_fds(&scm, skb);
2479
2480 sk_peek_offset_bwd(sk, skb->len);
2481 } else {
2482 /* It is questionable: on PEEK we could:
2483 - do not return fds - good, but too simple 8)
2484 - return fds, and do not return them on read (old strategy,
2485 apparently wrong)
2486 - clone fds (I chose it for now, it is the most universal
2487 solution)
2488
2489 POSIX 1003.1g does not actually define this clearly
2490 at all. POSIX 1003.1g doesn't define a lot of things
2491 clearly however!
2492
2493 */
2494
2495 sk_peek_offset_fwd(sk, size);
2496
2497 if (UNIXCB(skb).fp)
2498 unix_peek_fds(&scm, skb);
2499 }
2500 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2501
2502 scm_recv_unix(sock, msg, &scm, flags);
2503
2504 out_free:
2505 skb_free_datagram(sk, skb);
2506 mutex_unlock(&u->iolock);
2507 out:
2508 return err;
2509 }
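/* Usage note (informational): because unix_peek_fds() clones the fd list,
 * a recvmsg(..., MSG_PEEK) that sees SCM_RIGHTS installs fresh descriptors
 * in the caller, and the later non-PEEK read installs another set;
 * userspace must close both sets to avoid leaking descriptors.
 */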
2510
2511 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2512 int flags)
2513 {
2514 struct sock *sk = sock->sk;
2515
2516 #ifdef CONFIG_BPF_SYSCALL
2517 const struct proto *prot = READ_ONCE(sk->sk_prot);
2518
2519 if (prot != &unix_dgram_proto)
2520 return prot->recvmsg(sk, msg, size, flags, NULL);
2521 #endif
2522 return __unix_dgram_recvmsg(sk, msg, size, flags);
2523 }
2524
2525 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2526 {
2527 struct unix_sock *u = unix_sk(sk);
2528 struct sk_buff *skb;
2529 int err;
2530
2531 mutex_lock(&u->iolock);
2532 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2533 mutex_unlock(&u->iolock);
2534 if (!skb)
2535 return err;
2536
2537 return recv_actor(sk, skb);
2538 }
2539
2540 /*
2541 * Sleep until more data has arrived. But check for races..
2542 */
2543 static long unix_stream_data_wait(struct sock *sk, long timeo,
2544 struct sk_buff *last, unsigned int last_len,
2545 bool freezable)
2546 {
2547 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2548 struct sk_buff *tail;
2549 DEFINE_WAIT(wait);
2550
2551 unix_state_lock(sk);
2552
2553 for (;;) {
2554 prepare_to_wait(sk_sleep(sk), &wait, state);
2555
2556 tail = skb_peek_tail(&sk->sk_receive_queue);
2557 if (tail != last ||
2558 (tail && tail->len != last_len) ||
2559 sk->sk_err ||
2560 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2561 signal_pending(current) ||
2562 !timeo)
2563 break;
2564
2565 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2566 unix_state_unlock(sk);
2567 timeo = schedule_timeout(timeo);
2568 unix_state_lock(sk);
2569
2570 if (sock_flag(sk, SOCK_DEAD))
2571 break;
2572
2573 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2574 }
2575
2576 finish_wait(sk_sleep(sk), &wait);
2577 unix_state_unlock(sk);
2578 return timeo;
2579 }
2580
2581 static unsigned int unix_skb_len(const struct sk_buff *skb)
2582 {
2583 return skb->len - UNIXCB(skb).consumed;
2584 }
2585
2586 struct unix_stream_read_state {
2587 int (*recv_actor)(struct sk_buff *, int, int,
2588 struct unix_stream_read_state *);
2589 struct socket *socket;
2590 struct msghdr *msg;
2591 struct pipe_inode_info *pipe;
2592 size_t size;
2593 int flags;
2594 unsigned int splice_flags;
2595 };
2596
2597 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2598 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2599 {
2600 struct socket *sock = state->socket;
2601 struct sock *sk = sock->sk;
2602 struct unix_sock *u = unix_sk(sk);
2603 int chunk = 1;
2604 struct sk_buff *oob_skb;
2605
2606 mutex_lock(&u->iolock);
2607 unix_state_lock(sk);
2608 spin_lock(&sk->sk_receive_queue.lock);
2609
2610 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2611 spin_unlock(&sk->sk_receive_queue.lock);
2612 unix_state_unlock(sk);
2613 mutex_unlock(&u->iolock);
2614 return -EINVAL;
2615 }
2616
2617 oob_skb = u->oob_skb;
2618
2619 if (!(state->flags & MSG_PEEK))
2620 WRITE_ONCE(u->oob_skb, NULL);
2621
2622 spin_unlock(&sk->sk_receive_queue.lock);
2623 unix_state_unlock(sk);
2624
2625 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2626
2627 if (!(state->flags & MSG_PEEK))
2628 UNIXCB(oob_skb).consumed += 1;
2629
2630 mutex_unlock(&u->iolock);
2631
2632 if (chunk < 0)
2633 return -EFAULT;
2634
2635 state->msg->msg_flags |= MSG_OOB;
2636 return 1;
2637 }
2638
2639 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2640 int flags, int copied)
2641 {
2642 struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2643 struct unix_sock *u = unix_sk(sk);
2644
2645 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2646 return skb;
2647
2648 spin_lock(&sk->sk_receive_queue.lock);
2649
2650 if (!unix_skb_len(skb)) {
2651 if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2652 skb = NULL;
2653 } else if (flags & MSG_PEEK) {
2654 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2655 } else {
2656 read_skb = skb;
2657 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2658 __skb_unlink(read_skb, &sk->sk_receive_queue);
2659 }
2660
2661 if (!skb)
2662 goto unlock;
2663 }
2664
2665 if (skb != u->oob_skb)
2666 goto unlock;
2667
2668 if (copied) {
2669 skb = NULL;
2670 } else if (!(flags & MSG_PEEK)) {
2671 WRITE_ONCE(u->oob_skb, NULL);
2672
2673 if (!sock_flag(sk, SOCK_URGINLINE)) {
2674 __skb_unlink(skb, &sk->sk_receive_queue);
2675 unread_skb = skb;
2676 skb = skb_peek(&sk->sk_receive_queue);
2677 }
2678 } else if (!sock_flag(sk, SOCK_URGINLINE)) {
2679 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2680 }
2681
2682 unlock:
2683 spin_unlock(&sk->sk_receive_queue.lock);
2684
2685 consume_skb(read_skb);
2686 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2687
2688 return skb;
2689 }
2690 #endif
2691
2692 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2693 {
2694 struct unix_sock *u = unix_sk(sk);
2695 struct sk_buff *skb;
2696 int err;
2697
2698 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2699 return -ENOTCONN;
2700
2701 mutex_lock(&u->iolock);
2702 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2703 mutex_unlock(&u->iolock);
2704 if (!skb)
2705 return err;
2706
2707 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2708 if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2709 bool drop = false;
2710
2711 unix_state_lock(sk);
2712
2713 if (sock_flag(sk, SOCK_DEAD)) {
2714 unix_state_unlock(sk);
2715 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2716 return -ECONNRESET;
2717 }
2718
2719 spin_lock(&sk->sk_receive_queue.lock);
2720 if (likely(skb == u->oob_skb)) {
2721 WRITE_ONCE(u->oob_skb, NULL);
2722 drop = true;
2723 }
2724 spin_unlock(&sk->sk_receive_queue.lock);
2725
2726 unix_state_unlock(sk);
2727
2728 if (drop) {
2729 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2730 return -EAGAIN;
2731 }
2732 }
2733 #endif
2734
2735 return recv_actor(sk, skb);
2736 }
2737
2738 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2739 bool freezable)
2740 {
2741 struct scm_cookie scm;
2742 struct socket *sock = state->socket;
2743 struct sock *sk = sock->sk;
2744 struct unix_sock *u = unix_sk(sk);
2745 int copied = 0;
2746 int flags = state->flags;
2747 int noblock = flags & MSG_DONTWAIT;
2748 bool check_creds = false;
2749 int target;
2750 int err = 0;
2751 long timeo;
2752 int skip;
2753 size_t size = state->size;
2754 unsigned int last_len;
2755
2756 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2757 err = -EINVAL;
2758 goto out;
2759 }
2760
2761 if (unlikely(flags & MSG_OOB)) {
2762 err = -EOPNOTSUPP;
2763 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2764 err = unix_stream_recv_urg(state);
2765 #endif
2766 goto out;
2767 }
2768
2769 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2770 timeo = sock_rcvtimeo(sk, noblock);
2771
2772 memset(&scm, 0, sizeof(scm));
2773
2774 /* Lock the socket to prevent queue disordering
2775 * while we sleep in memcpy_to_msg().
2776 */
2777 mutex_lock(&u->iolock);
2778
2779 skip = max(sk_peek_offset(sk, flags), 0);
2780
2781 do {
2782 struct sk_buff *skb, *last;
2783 int chunk;
2784
2785 redo:
2786 unix_state_lock(sk);
2787 if (sock_flag(sk, SOCK_DEAD)) {
2788 err = -ECONNRESET;
2789 goto unlock;
2790 }
2791 last = skb = skb_peek(&sk->sk_receive_queue);
2792 last_len = last ? last->len : 0;
2793
2794 again:
2795 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2796 if (skb) {
2797 skb = manage_oob(skb, sk, flags, copied);
2798 if (!skb && copied) {
2799 unix_state_unlock(sk);
2800 break;
2801 }
2802 }
2803 #endif
2804 if (skb == NULL) {
2805 if (copied >= target)
2806 goto unlock;
2807
2808 /*
2809 * POSIX 1003.1g mandates this order.
2810 */
2811
2812 err = sock_error(sk);
2813 if (err)
2814 goto unlock;
2815 if (sk->sk_shutdown & RCV_SHUTDOWN)
2816 goto unlock;
2817
2818 unix_state_unlock(sk);
2819 if (!timeo) {
2820 err = -EAGAIN;
2821 break;
2822 }
2823
2824 mutex_unlock(&u->iolock);
2825
2826 timeo = unix_stream_data_wait(sk, timeo, last,
2827 last_len, freezable);
2828
2829 if (signal_pending(current)) {
2830 err = sock_intr_errno(timeo);
2831 scm_destroy(&scm);
2832 goto out;
2833 }
2834
2835 mutex_lock(&u->iolock);
2836 goto redo;
2837 unlock:
2838 unix_state_unlock(sk);
2839 break;
2840 }
2841
2842 while (skip >= unix_skb_len(skb)) {
2843 skip -= unix_skb_len(skb);
2844 last = skb;
2845 last_len = skb->len;
2846 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2847 if (!skb)
2848 goto again;
2849 }
2850
2851 unix_state_unlock(sk);
2852
2853 if (check_creds) {
2854 /* Never glue messages from different writers */
2855 if (!unix_skb_scm_eq(skb, &scm))
2856 break;
2857 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2858 test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2859 /* Copy credentials */
2860 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2861 unix_set_secdata(&scm, skb);
2862 check_creds = true;
2863 }
2864
2865 /* Copy address just once */
2866 if (state->msg && state->msg->msg_name) {
2867 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2868 state->msg->msg_name);
2869 unix_copy_addr(state->msg, skb->sk);
2870
2871 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2872 state->msg->msg_name,
2873 &state->msg->msg_namelen);
2874
2875 sunaddr = NULL;
2876 }
2877
2878 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2879 chunk = state->recv_actor(skb, skip, chunk, state);
2880 if (chunk < 0) {
2881 if (copied == 0)
2882 copied = -EFAULT;
2883 break;
2884 }
2885 copied += chunk;
2886 size -= chunk;
2887
2888 /* Mark read part of skb as used */
2889 if (!(flags & MSG_PEEK)) {
2890 UNIXCB(skb).consumed += chunk;
2891
2892 sk_peek_offset_bwd(sk, chunk);
2893
2894 if (UNIXCB(skb).fp) {
2895 scm_stat_del(sk, skb);
2896 unix_detach_fds(&scm, skb);
2897 }
2898
2899 if (unix_skb_len(skb))
2900 break;
2901
2902 skb_unlink(skb, &sk->sk_receive_queue);
2903 consume_skb(skb);
2904
2905 if (scm.fp)
2906 break;
2907 } else {
2908 /* It is questionable, see note in unix_dgram_recvmsg.
2909 */
2910 if (UNIXCB(skb).fp)
2911 unix_peek_fds(&scm, skb);
2912
2913 sk_peek_offset_fwd(sk, chunk);
2914
2915 if (UNIXCB(skb).fp)
2916 break;
2917
2918 skip = 0;
2919 last = skb;
2920 last_len = skb->len;
2921 unix_state_lock(sk);
2922 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2923 if (skb)
2924 goto again;
2925 unix_state_unlock(sk);
2926 break;
2927 }
2928 } while (size);
2929
2930 mutex_unlock(&u->iolock);
2931 if (state->msg)
2932 scm_recv_unix(sock, state->msg, &scm, flags);
2933 else
2934 scm_destroy(&scm);
2935 out:
2936 return copied ? : err;
2937 }
2938
2939 static int unix_stream_read_actor(struct sk_buff *skb,
2940 int skip, int chunk,
2941 struct unix_stream_read_state *state)
2942 {
2943 int ret;
2944
2945 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2946 state->msg, chunk);
2947 return ret ?: chunk;
2948 }
2949
2950 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2951 size_t size, int flags)
2952 {
2953 struct unix_stream_read_state state = {
2954 .recv_actor = unix_stream_read_actor,
2955 .socket = sk->sk_socket,
2956 .msg = msg,
2957 .size = size,
2958 .flags = flags
2959 };
2960
2961 return unix_stream_read_generic(&state, true);
2962 }
2963
2964 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2965 size_t size, int flags)
2966 {
2967 struct unix_stream_read_state state = {
2968 .recv_actor = unix_stream_read_actor,
2969 .socket = sock,
2970 .msg = msg,
2971 .size = size,
2972 .flags = flags
2973 };
2974
2975 #ifdef CONFIG_BPF_SYSCALL
2976 struct sock *sk = sock->sk;
2977 const struct proto *prot = READ_ONCE(sk->sk_prot);
2978
2979 if (prot != &unix_stream_proto)
2980 return prot->recvmsg(sk, msg, size, flags, NULL);
2981 #endif
2982 return unix_stream_read_generic(&state, true);
2983 }
2984
2985 static int unix_stream_splice_actor(struct sk_buff *skb,
2986 int skip, int chunk,
2987 struct unix_stream_read_state *state)
2988 {
2989 return skb_splice_bits(skb, state->socket->sk,
2990 UNIXCB(skb).consumed + skip,
2991 state->pipe, chunk, state->splice_flags);
2992 }
2993
2994 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2995 struct pipe_inode_info *pipe,
2996 size_t size, unsigned int flags)
2997 {
2998 struct unix_stream_read_state state = {
2999 .recv_actor = unix_stream_splice_actor,
3000 .socket = sock,
3001 .pipe = pipe,
3002 .size = size,
3003 .splice_flags = flags,
3004 };
3005
3006 if (unlikely(*ppos))
3007 return -ESPIPE;
3008
3009 if (sock->file->f_flags & O_NONBLOCK ||
3010 flags & SPLICE_F_NONBLOCK)
3011 state.flags = MSG_DONTWAIT;
3012
3013 return unix_stream_read_generic(&state, false);
3014 }
3015
3016 static int unix_shutdown(struct socket *sock, int mode)
3017 {
3018 struct sock *sk = sock->sk;
3019 struct sock *other;
3020
3021 if (mode < SHUT_RD || mode > SHUT_RDWR)
3022 return -EINVAL;
3023 /* This maps:
3024 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
3025 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
3026 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3027 */
3028 ++mode;
3029
3030 unix_state_lock(sk);
3031 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3032 other = unix_peer(sk);
3033 if (other)
3034 sock_hold(other);
3035 unix_state_unlock(sk);
3036 sk->sk_state_change(sk);
3037
3038 if (other &&
3039 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3040
3041 int peer_mode = 0;
3042 const struct proto *prot = READ_ONCE(other->sk_prot);
3043
3044 if (prot->unhash)
3045 prot->unhash(other);
3046 if (mode&RCV_SHUTDOWN)
3047 peer_mode |= SEND_SHUTDOWN;
3048 if (mode&SEND_SHUTDOWN)
3049 peer_mode |= RCV_SHUTDOWN;
3050 unix_state_lock(other);
3051 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3052 unix_state_unlock(other);
3053 other->sk_state_change(other);
3054 if (peer_mode == SHUTDOWN_MASK)
3055 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3056 else if (peer_mode & RCV_SHUTDOWN)
3057 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3058 }
3059 if (other)
3060 sock_put(other);
3061
3062 return 0;
3063 }
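/* Illustrative only, not kernel code: the mode mapping above as seen from
 * userspace.  Assumes a connected stream pair sv[0]/sv[1].
 *
 *	char c;
 *
 *	shutdown(sv[0], SHUT_WR);	// SHUT_WR (1) becomes SEND_SHUTDOWN (2)
 *					// on sv[0]; the peer gets RCV_SHUTDOWN
 *	read(sv[1], &c, 1);		// returns 0 (EOF) once the queue drains
 *	write(sv[1], &c, 1);		// still allowed in the other direction
 */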
3064
3065 long unix_inq_len(struct sock *sk)
3066 {
3067 struct sk_buff *skb;
3068 long amount = 0;
3069
3070 if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3071 return -EINVAL;
3072
3073 spin_lock(&sk->sk_receive_queue.lock);
3074 if (sk->sk_type == SOCK_STREAM ||
3075 sk->sk_type == SOCK_SEQPACKET) {
3076 skb_queue_walk(&sk->sk_receive_queue, skb)
3077 amount += unix_skb_len(skb);
3078 } else {
3079 skb = skb_peek(&sk->sk_receive_queue);
3080 if (skb)
3081 amount = skb->len;
3082 }
3083 spin_unlock(&sk->sk_receive_queue.lock);
3084
3085 return amount;
3086 }
3087 EXPORT_SYMBOL_GPL(unix_inq_len);
3088
3089 long unix_outq_len(struct sock *sk)
3090 {
3091 return sk_wmem_alloc_get(sk);
3092 }
3093 EXPORT_SYMBOL_GPL(unix_outq_len);
3094
3095 static int unix_open_file(struct sock *sk)
3096 {
3097 struct path path;
3098 struct file *f;
3099 int fd;
3100
3101 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3102 return -EPERM;
3103
3104 if (!smp_load_acquire(&unix_sk(sk)->addr))
3105 return -ENOENT;
3106
3107 path = unix_sk(sk)->path;
3108 if (!path.dentry)
3109 return -ENOENT;
3110
3111 path_get(&path);
3112
3113 fd = get_unused_fd_flags(O_CLOEXEC);
3114 if (fd < 0)
3115 goto out;
3116
3117 f = dentry_open(&path, O_PATH, current_cred());
3118 if (IS_ERR(f)) {
3119 put_unused_fd(fd);
3120 fd = PTR_ERR(f);
3121 goto out;
3122 }
3123
3124 fd_install(fd, f);
3125 out:
3126 path_put(&path);
3127
3128 return fd;
3129 }
3130
3131 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3132 {
3133 struct sock *sk = sock->sk;
3134 long amount = 0;
3135 int err;
3136
3137 switch (cmd) {
3138 case SIOCOUTQ:
3139 amount = unix_outq_len(sk);
3140 err = put_user(amount, (int __user *)arg);
3141 break;
3142 case SIOCINQ:
3143 amount = unix_inq_len(sk);
3144 if (amount < 0)
3145 err = amount;
3146 else
3147 err = put_user(amount, (int __user *)arg);
3148 break;
3149 case SIOCUNIXFILE:
3150 err = unix_open_file(sk);
3151 break;
3152 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3153 case SIOCATMARK:
3154 {
3155 struct unix_sock *u = unix_sk(sk);
3156 struct sk_buff *skb;
3157 int answ = 0;
3158
3159 mutex_lock(&u->iolock);
3160
3161 skb = skb_peek(&sk->sk_receive_queue);
3162 if (skb) {
3163 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3164 struct sk_buff *next_skb;
3165
3166 next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3167
3168 if (skb == oob_skb ||
3169 (!unix_skb_len(skb) &&
3170 (!oob_skb || next_skb == oob_skb)))
3171 answ = 1;
3172 }
3173
3174 mutex_unlock(&u->iolock);
3175
3176 err = put_user(answ, (int __user *)arg);
3177 }
3178 break;
3179 #endif
3180 default:
3181 err = -ENOIOCTLCMD;
3182 break;
3183 }
3184 return err;
3185 }
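/* Illustrative only, not kernel code: SIOCATMARK pairs with MSG_OOB to
 * find the urgent-data mark on an AF_UNIX stream socket fd.
 *
 *	int at_mark = 0;
 *
 *	ioctl(fd, SIOCATMARK, &at_mark);
 *	if (at_mark) {
 *		char oob;
 *
 *		recv(fd, &oob, 1, MSG_OOB);	// consume the OOB byte
 *	}
 */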
3186
3187 #ifdef CONFIG_COMPAT
3188 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3189 {
3190 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3191 }
3192 #endif
3193
3194 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3195 {
3196 struct sock *sk = sock->sk;
3197 unsigned char state;
3198 __poll_t mask;
3199 u8 shutdown;
3200
3201 sock_poll_wait(file, sock, wait);
3202 mask = 0;
3203 shutdown = READ_ONCE(sk->sk_shutdown);
3204 state = READ_ONCE(sk->sk_state);
3205
3206 /* exceptional events? */
3207 if (READ_ONCE(sk->sk_err))
3208 mask |= EPOLLERR;
3209 if (shutdown == SHUTDOWN_MASK)
3210 mask |= EPOLLHUP;
3211 if (shutdown & RCV_SHUTDOWN)
3212 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3213
3214 /* readable? */
3215 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3216 mask |= EPOLLIN | EPOLLRDNORM;
3217 if (sk_is_readable(sk))
3218 mask |= EPOLLIN | EPOLLRDNORM;
3219 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3220 if (READ_ONCE(unix_sk(sk)->oob_skb))
3221 mask |= EPOLLPRI;
3222 #endif
3223
3224 /* Connection-based need to check for termination and startup */
3225 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3226 state == TCP_CLOSE)
3227 mask |= EPOLLHUP;
3228
3229 /*
3230 * we set writable also when the other side has shut down the
3231 * connection. This prevents stuck sockets.
3232 */
3233 if (unix_writable(sk, state))
3234 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3235
3236 return mask;
3237 }
3238
3239 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3240 poll_table *wait)
3241 {
3242 struct sock *sk = sock->sk, *other;
3243 unsigned int writable;
3244 unsigned char state;
3245 __poll_t mask;
3246 u8 shutdown;
3247
3248 sock_poll_wait(file, sock, wait);
3249 mask = 0;
3250 shutdown = READ_ONCE(sk->sk_shutdown);
3251 state = READ_ONCE(sk->sk_state);
3252
3253 /* exceptional events? */
3254 if (READ_ONCE(sk->sk_err) ||
3255 !skb_queue_empty_lockless(&sk->sk_error_queue))
3256 mask |= EPOLLERR |
3257 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3258
3259 if (shutdown & RCV_SHUTDOWN)
3260 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3261 if (shutdown == SHUTDOWN_MASK)
3262 mask |= EPOLLHUP;
3263
3264 /* readable? */
3265 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3266 mask |= EPOLLIN | EPOLLRDNORM;
3267 if (sk_is_readable(sk))
3268 mask |= EPOLLIN | EPOLLRDNORM;
3269
3270 /* Connection-based need to check for termination and startup */
3271 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3272 mask |= EPOLLHUP;
3273
3274 /* No write status requested, avoid expensive OUT tests. */
3275 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3276 return mask;
3277
3278 writable = unix_writable(sk, state);
3279 if (writable) {
3280 unix_state_lock(sk);
3281
3282 other = unix_peer(sk);
3283 if (other && unix_peer(other) != sk &&
3284 unix_recvq_full_lockless(other) &&
3285 unix_dgram_peer_wake_me(sk, other))
3286 writable = 0;
3287
3288 unix_state_unlock(sk);
3289 }
3290
3291 if (writable)
3292 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3293 else
3294 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3295
3296 return mask;
3297 }
3298
3299 #ifdef CONFIG_PROC_FS
3300
3301 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3302
3303 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3304 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3305 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
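/* Worked example (informational): *pos packs the bucket index into the
 * high bits and a 1-based in-bucket offset into the low BUCKET_SPACE bits,
 * e.g. set_bucket_offset(2, 5) encodes "5th socket of bucket 2", and
 * get_bucket()/get_offset() recover the two halves.
 */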
3306
3307 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3308 {
3309 unsigned long offset = get_offset(*pos);
3310 unsigned long bucket = get_bucket(*pos);
3311 unsigned long count = 0;
3312 struct sock *sk;
3313
3314 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3315 sk; sk = sk_next(sk)) {
3316 if (++count == offset)
3317 break;
3318 }
3319
3320 return sk;
3321 }
3322
3323 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3324 {
3325 unsigned long bucket = get_bucket(*pos);
3326 struct net *net = seq_file_net(seq);
3327 struct sock *sk;
3328
3329 while (bucket < UNIX_HASH_SIZE) {
3330 spin_lock(&net->unx.table.locks[bucket]);
3331
3332 sk = unix_from_bucket(seq, pos);
3333 if (sk)
3334 return sk;
3335
3336 spin_unlock(&net->unx.table.locks[bucket]);
3337
3338 *pos = set_bucket_offset(++bucket, 1);
3339 }
3340
3341 return NULL;
3342 }
3343
3344 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3345 loff_t *pos)
3346 {
3347 unsigned long bucket = get_bucket(*pos);
3348
3349 sk = sk_next(sk);
3350 if (sk)
3351 return sk;
3352
3353
3354 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3355
3356 *pos = set_bucket_offset(++bucket, 1);
3357
3358 return unix_get_first(seq, pos);
3359 }
3360
3361 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3362 {
3363 if (!*pos)
3364 return SEQ_START_TOKEN;
3365
3366 return unix_get_first(seq, pos);
3367 }
3368
3369 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3370 {
3371 ++*pos;
3372
3373 if (v == SEQ_START_TOKEN)
3374 return unix_get_first(seq, pos);
3375
3376 return unix_get_next(seq, v, pos);
3377 }
3378
3379 static void unix_seq_stop(struct seq_file *seq, void *v)
3380 {
3381 struct sock *sk = v;
3382
3383 if (sk)
3384 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3385 }
3386
3387 static int unix_seq_show(struct seq_file *seq, void *v)
3388 {
3389
3390 if (v == SEQ_START_TOKEN)
3391 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3392 "Inode Path\n");
3393 else {
3394 struct sock *s = v;
3395 struct unix_sock *u = unix_sk(s);
3396 unix_state_lock(s);
3397
3398 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3399 s,
3400 refcount_read(&s->sk_refcnt),
3401 0,
3402 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3403 s->sk_type,
3404 s->sk_socket ?
3405 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3406 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3407 sock_i_ino(s));
3408
3409 if (u->addr) {	/* under a hash table lock here */
3410 int i, len;
3411 seq_putc(seq, ' ');
3412
3413 i = 0;
3414 len = u->addr->len -
3415 offsetof(struct sockaddr_un, sun_path);
3416 if (u->addr->name->sun_path[0]) {
3417 len--;
3418 } else {
3419 seq_putc(seq, '@');
3420 i++;
3421 }
3422 for ( ; i < len; i++)
3423 seq_putc(seq, u->addr->name->sun_path[i] ?:
3424 '@');
3425 }
3426 unix_state_unlock(s);
3427 seq_putc(seq, '\n');
3428 }
3429
3430 return 0;
3431 }
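/* Sample /proc/net/unix line produced by the format above (values are
 * illustrative; %pK is typically hashed or zeroed for unprivileged
 * readers):
 *
 *	0000000000000000: 00000002 00000000 00010000 0001 01 27008 /run/systemd/journal/stdout
 *
 * i.e. Num, RefCount, Protocol (always 0), Flags (__SO_ACCEPTCON for
 * listeners), Type, St, Inode, and the bound path ('@' prefix for
 * abstract names).
 */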
3432
3433 static const struct seq_operations unix_seq_ops = {
3434 .start = unix_seq_start,
3435 .next = unix_seq_next,
3436 .stop = unix_seq_stop,
3437 .show = unix_seq_show,
3438 };
3439
3440 #ifdef CONFIG_BPF_SYSCALL
3441 struct bpf_unix_iter_state {
3442 struct seq_net_private p;
3443 unsigned int cur_sk;
3444 unsigned int end_sk;
3445 unsigned int max_sk;
3446 struct sock **batch;
3447 bool st_bucket_done;
3448 };
3449
3450 struct bpf_iter__unix {
3451 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3452 __bpf_md_ptr(struct unix_sock *, unix_sk);
3453 uid_t uid __aligned(8);
3454 };
3455
3456 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3457 struct unix_sock *unix_sk, uid_t uid)
3458 {
3459 struct bpf_iter__unix ctx;
3460
3461 meta->seq_num--; /* skip SEQ_START_TOKEN */
3462 ctx.meta = meta;
3463 ctx.unix_sk = unix_sk;
3464 ctx.uid = uid;
3465 return bpf_iter_run_prog(prog, &ctx);
3466 }
3467
3468 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3469
3470 {
3471 struct bpf_unix_iter_state *iter = seq->private;
3472 unsigned int expected = 1;
3473 struct sock *sk;
3474
3475 sock_hold(start_sk);
3476 iter->batch[iter->end_sk++] = start_sk;
3477
3478 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3479 if (iter->end_sk < iter->max_sk) {
3480 sock_hold(sk);
3481 iter->batch[iter->end_sk++] = sk;
3482 }
3483
3484 expected++;
3485 }
3486
3487 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3488
3489 return expected;
3490 }
3491
3492 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3493 {
3494 while (iter->cur_sk < iter->end_sk)
3495 sock_put(iter->batch[iter->cur_sk++]);
3496 }
3497
3498 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3499 unsigned int new_batch_sz)
3500 {
3501 struct sock **new_batch;
3502
3503 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3504 GFP_USER | __GFP_NOWARN);
3505 if (!new_batch)
3506 return -ENOMEM;
3507
3508 bpf_iter_unix_put_batch(iter);
3509 kvfree(iter->batch);
3510 iter->batch = new_batch;
3511 iter->max_sk = new_batch_sz;
3512
3513 return 0;
3514 }
3515
3516 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3517 loff_t *pos)
3518 {
3519 struct bpf_unix_iter_state *iter = seq->private;
3520 unsigned int expected;
3521 bool resized = false;
3522 struct sock *sk;
3523
3524 if (iter->st_bucket_done)
3525 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3526
3527 again:
3528 /* Get a new batch */
3529 iter->cur_sk = 0;
3530 iter->end_sk = 0;
3531
3532 sk = unix_get_first(seq, pos);
3533 if (!sk)
3534 return NULL; /* Done */
3535
3536 expected = bpf_iter_unix_hold_batch(seq, sk);
3537
3538 if (iter->end_sk == expected) {
3539 iter->st_bucket_done = true;
3540 return sk;
3541 }
3542
3543 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3544 resized = true;
3545 goto again;
3546 }
3547
3548 return sk;
3549 }
3550
3551 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3552 {
3553 if (!*pos)
3554 return SEQ_START_TOKEN;
3555
3556 /* bpf iter does not support lseek, so it always
3557 * continues from where it was stop()-ped.
3558 */
3559 return bpf_iter_unix_batch(seq, pos);
3560 }
3561
3562 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3563 {
3564 struct bpf_unix_iter_state *iter = seq->private;
3565 struct sock *sk;
3566
3567 /* Whenever seq_next() is called, the iter->cur_sk is
3568 * done with seq_show(), so advance to the next sk in
3569 * the batch.
3570 */
3571 if (iter->cur_sk < iter->end_sk)
3572 sock_put(iter->batch[iter->cur_sk++]);
3573
3574 ++*pos;
3575
3576 if (iter->cur_sk < iter->end_sk)
3577 sk = iter->batch[iter->cur_sk];
3578 else
3579 sk = bpf_iter_unix_batch(seq, pos);
3580
3581 return sk;
3582 }
3583
3584 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3585 {
3586 struct bpf_iter_meta meta;
3587 struct bpf_prog *prog;
3588 struct sock *sk = v;
3589 uid_t uid;
3590 bool slow;
3591 int ret;
3592
3593 if (v == SEQ_START_TOKEN)
3594 return 0;
3595
3596 slow = lock_sock_fast(sk);
3597
3598 if (unlikely(sk_unhashed(sk))) {
3599 ret = SEQ_SKIP;
3600 goto unlock;
3601 }
3602
3603 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3604 meta.seq = seq;
3605 prog = bpf_iter_get_info(&meta, false);
3606 ret = unix_prog_seq_show(prog, &meta, v, uid);
3607 unlock:
3608 unlock_sock_fast(sk, slow);
3609 return ret;
3610 }
3611
3612 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3613 {
3614 struct bpf_unix_iter_state *iter = seq->private;
3615 struct bpf_iter_meta meta;
3616 struct bpf_prog *prog;
3617
3618 if (!v) {
3619 meta.seq = seq;
3620 prog = bpf_iter_get_info(&meta, true);
3621 if (prog)
3622 (void)unix_prog_seq_show(prog, &meta, v, 0);
3623 }
3624
3625 if (iter->cur_sk < iter->end_sk)
3626 bpf_iter_unix_put_batch(iter);
3627 }
3628
3629 static const struct seq_operations bpf_iter_unix_seq_ops = {
3630 .start = bpf_iter_unix_seq_start,
3631 .next = bpf_iter_unix_seq_next,
3632 .stop = bpf_iter_unix_seq_stop,
3633 .show = bpf_iter_unix_seq_show,
3634 };
3635 #endif
3636 #endif
3637
3638 static const struct net_proto_family unix_family_ops = {
3639 .family = PF_UNIX,
3640 .create = unix_create,
3641 .owner = THIS_MODULE,
3642 };
3643
3644
3645 static int __net_init unix_net_init(struct net *net)
3646 {
3647 int i;
3648
3649 net->unx.sysctl_max_dgram_qlen = 10;
3650 if (unix_sysctl_register(net))
3651 goto out;
3652
3653 #ifdef CONFIG_PROC_FS
3654 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3655 sizeof(struct seq_net_private)))
3656 goto err_sysctl;
3657 #endif
3658
3659 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3660 sizeof(spinlock_t), GFP_KERNEL);
3661 if (!net->unx.table.locks)
3662 goto err_proc;
3663
3664 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3665 sizeof(struct hlist_head),
3666 GFP_KERNEL);
3667 if (!net->unx.table.buckets)
3668 goto free_locks;
3669
3670 for (i = 0; i < UNIX_HASH_SIZE; i++) {
3671 spin_lock_init(&net->unx.table.locks[i]);
3672 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3673 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3674 }
3675
3676 return 0;
3677
3678 free_locks:
3679 kvfree(net->unx.table.locks);
3680 err_proc:
3681 #ifdef CONFIG_PROC_FS
3682 remove_proc_entry("unix", net->proc_net);
3683 err_sysctl:
3684 #endif
3685 unix_sysctl_unregister(net);
3686 out:
3687 return -ENOMEM;
3688 }
3689
3690 static void __net_exit unix_net_exit(struct net *net)
3691 {
3692 kvfree(net->unx.table.buckets);
3693 kvfree(net->unx.table.locks);
3694 unix_sysctl_unregister(net);
3695 remove_proc_entry("unix", net->proc_net);
3696 }
3697
3698 static struct pernet_operations unix_net_ops = {
3699 .init = unix_net_init,
3700 .exit = unix_net_exit,
3701 };
3702
3703 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3704 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3705 struct unix_sock *unix_sk, uid_t uid)
3706
3707 #define INIT_BATCH_SZ 16
3708
3709 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3710 {
3711 struct bpf_unix_iter_state *iter = priv_data;
3712 int err;
3713
3714 err = bpf_iter_init_seq_net(priv_data, aux);
3715 if (err)
3716 return err;
3717
3718 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3719 if (err) {
3720 bpf_iter_fini_seq_net(priv_data);
3721 return err;
3722 }
3723
3724 return 0;
3725 }
3726
3727 static void bpf_iter_fini_unix(void *priv_data)
3728 {
3729 struct bpf_unix_iter_state *iter = priv_data;
3730
3731 bpf_iter_fini_seq_net(priv_data);
3732 kvfree(iter->batch);
3733 }
3734
3735 static const struct bpf_iter_seq_info unix_seq_info = {
3736 .seq_ops = &bpf_iter_unix_seq_ops,
3737 .init_seq_private = bpf_iter_init_unix,
3738 .fini_seq_private = bpf_iter_fini_unix,
3739 .seq_priv_size = sizeof(struct bpf_unix_iter_state),
3740 };
3741
3742 static const struct bpf_func_proto *
3743 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3744 const struct bpf_prog *prog)
3745 {
3746 switch (func_id) {
3747 case BPF_FUNC_setsockopt:
3748 return &bpf_sk_setsockopt_proto;
3749 case BPF_FUNC_getsockopt:
3750 return &bpf_sk_getsockopt_proto;
3751 default:
3752 return NULL;
3753 }
3754 }
3755
3756 static struct bpf_iter_reg unix_reg_info = {
3757 .target = "unix",
3758 .ctx_arg_info_size = 1,
3759 .ctx_arg_info = {
3760 { offsetof(struct bpf_iter__unix, unix_sk),
3761 PTR_TO_BTF_ID_OR_NULL },
3762 },
3763 .get_func_proto = bpf_iter_unix_get_func_proto,
3764 .seq_info = &unix_seq_info,
3765 };
3766
3767 static void __init bpf_iter_register(void)
3768 {
3769 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3770 if (bpf_iter_reg_target(&unix_reg_info))
3771 pr_warn("Warning: could not register bpf iterator unix\n");
3772 }
3773 #endif
3774
3775 static int __init af_unix_init(void)
3776 {
3777 int i, rc = -1;
3778
3779 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3780
3781 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3782 spin_lock_init(&bsd_socket_locks[i]);
3783 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3784 }
3785
3786 rc = proto_register(&unix_dgram_proto, 1);
3787 if (rc != 0) {
3788 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3789 goto out;
3790 }
3791
3792 rc = proto_register(&unix_stream_proto, 1);
3793 if (rc != 0) {
3794 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3795 proto_unregister(&unix_dgram_proto);
3796 goto out;
3797 }
3798
3799 sock_register(&unix_family_ops);
3800 register_pernet_subsys(&unix_net_ops);
3801 unix_bpf_build_proto();
3802
3803 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3804 bpf_iter_register();
3805 #endif
3806
3807 out:
3808 return rc;
3809 }
3810
3811 /* Later than subsys_initcall() because we depend on stuff initialised there */
3812 fs_initcall(af_unix_init);
3813