1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * NET4: Implementation of BSD Unix domain sockets.
4 *
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6 *
7 * Fixes:
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
17 * Mike Shaver's work.
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
24 * reference counting
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
27 * Lots of bug fixes.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * has been reached. This won't break
33 * old apps and it will avoid a huge amount
34 * of socks hashed (this is for unix_gc()
35 * performance reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
39 * dgram receiver.
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
47 *
48 * Known differences from reference BSD that was tested:
49 *
50 * [TO FIX]
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and gives the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
55 * [NOT TO FIX]
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
63 *
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
68 *
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * started by 0, so that this name space does not intersect
75 * with BSD names.
76 */
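/* Illustration only (not part of the original file): a minimal userspace
 * sketch, assuming glibc headers and a hypothetical path, of the two bind()
 * flavours described above -- a filesystem (BSD-style) name and an
 * "abstract" name whose first sun_path byte is zero:
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	int fs_fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *	int abs_fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	// Filesystem name: NUL-terminated path, visible in the VFS.
 *	strcpy(a.sun_path, "/tmp/example.sock");
 *	bind(fs_fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	// Abstract name: leading zero byte; the name is the byte sequence
 *	// selected by the address length, not a C string.
 *	memset(&a, 0, sizeof(a));
 *	a.sun_family = AF_UNIX;
 *	memcpy(a.sun_path, "\0example", 8);
 *	bind(abs_fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */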
77
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
90 #include <linux/un.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
96 #include <linux/in.h>
97 #include <linux/fs.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
108 #include <net/scm.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/splice.h>
116 #include <linux/freezer.h>
117 #include <linux/file.h>
118 #include <linux/btf_ids.h>
119 #include <linux/bpf-cgroup.h>
120
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
124
125 /* SMP locking strategy:
126 * hash table is protected with spinlock.
127 * each socket state is protected by separate spinlock.
128 */
129 #ifdef CONFIG_PROVE_LOCKING
130 #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r)))
131
132 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
133 const struct lockdep_map *b)
134 {
135 return cmp_ptr(a, b);
136 }
137
138 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
139 const struct lockdep_map *_b)
140 {
141 const struct unix_sock *a, *b;
142
143 a = container_of(_a, struct unix_sock, lock.dep_map);
144 b = container_of(_b, struct unix_sock, lock.dep_map);
145
146 if (a->sk.sk_state == TCP_LISTEN) {
147 /* unix_stream_connect(): Before the 2nd unix_state_lock(),
148 *
149 * 1. a is TCP_LISTEN.
150 * 2. b is not a.
151 * 3. concurrent connect(b -> a) must fail.
152 *
153 * Except for 2. & 3., the b's state can be any possible
154 * value due to concurrent connect() or listen().
155 *
156 * 2. is detected in debug_spin_lock_before(), and 3. cannot
157 * be expressed as lock_cmp_fn.
158 */
159 switch (b->sk.sk_state) {
160 case TCP_CLOSE:
161 case TCP_ESTABLISHED:
162 case TCP_LISTEN:
163 return -1;
164 default:
165 /* Invalid case. */
166 return 0;
167 }
168 }
169
170 /* Should never happen. Just to be symmetric. */
171 if (b->sk.sk_state == TCP_LISTEN) {
172 switch (a->sk.sk_state) {
173 case TCP_CLOSE:
174 case TCP_ESTABLISHED:
175 return 1;
176 default:
177 return 0;
178 }
179 }
180
181 /* unix_state_double_lock(): ascending address order. */
182 return cmp_ptr(a, b);
183 }
184
185 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
186 const struct lockdep_map *_b)
187 {
188 const struct sock *a, *b;
189
190 a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
191 b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
192
193 /* unix_collect_skb(): listener -> embryo order. */
194 if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
195 return -1;
196
197 /* Should never happen. Just to be symmetric. */
198 if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
199 return 1;
200
201 return 0;
202 }
203 #endif
204
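/* Hash helpers below: unbound sockets and pathname (BSD) sockets hash into
 * the lower half of the table ([0, UNIX_HASH_MOD]), while abstract sockets
 * hash into the upper half ([UNIX_HASH_MOD + 1, 2 * UNIX_HASH_MOD + 1]), so
 * the two address spaces never share a bucket. (This assumes UNIX_HASH_MOD
 * == UNIX_HASH_SIZE / 2 - 1, as implied by the bsd_socket_buckets[] sizing
 * above.)
 */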
205 static unsigned int unix_unbound_hash(struct sock *sk)
206 {
207 unsigned long hash = (unsigned long)sk;
208
209 hash ^= hash >> 16;
210 hash ^= hash >> 8;
211 hash ^= sk->sk_type;
212
213 return hash & UNIX_HASH_MOD;
214 }
215
216 static unsigned int unix_bsd_hash(struct inode *i)
217 {
218 return i->i_ino & UNIX_HASH_MOD;
219 }
220
221 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
222 int addr_len, int type)
223 {
224 __wsum csum = csum_partial(sunaddr, addr_len, 0);
225 unsigned int hash;
226
227 hash = (__force unsigned int)csum_fold(csum);
228 hash ^= hash >> 8;
229 hash ^= type;
230
231 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
232 }
233
234 static void unix_table_double_lock(struct net *net,
235 unsigned int hash1, unsigned int hash2)
236 {
237 if (hash1 == hash2) {
238 spin_lock(&net->unx.table.locks[hash1]);
239 return;
240 }
241
242 if (hash1 > hash2)
243 swap(hash1, hash2);
244
245 spin_lock(&net->unx.table.locks[hash1]);
246 spin_lock(&net->unx.table.locks[hash2]);
247 }
248
249 static void unix_table_double_unlock(struct net *net,
250 unsigned int hash1, unsigned int hash2)
251 {
252 if (hash1 == hash2) {
253 spin_unlock(&net->unx.table.locks[hash1]);
254 return;
255 }
256
257 spin_unlock(&net->unx.table.locks[hash1]);
258 spin_unlock(&net->unx.table.locks[hash2]);
259 }
260
261 #ifdef CONFIG_SECURITY_NETWORK
262 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
263 {
264 UNIXCB(skb).secid = scm->secid;
265 }
266
267 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
268 {
269 scm->secid = UNIXCB(skb).secid;
270 }
271
272 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
273 {
274 return (scm->secid == UNIXCB(skb).secid);
275 }
276 #else
277 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
278 { }
279
280 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
281 { }
282
283 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
284 {
285 return true;
286 }
287 #endif /* CONFIG_SECURITY_NETWORK */
288
289 static inline int unix_may_send(struct sock *sk, struct sock *osk)
290 {
291 return !unix_peer(osk) || unix_peer(osk) == sk;
292 }
293
294 static inline int unix_recvq_full_lockless(const struct sock *sk)
295 {
296 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
297 }
298
299 struct sock *unix_peer_get(struct sock *s)
300 {
301 struct sock *peer;
302
303 unix_state_lock(s);
304 peer = unix_peer(s);
305 if (peer)
306 sock_hold(peer);
307 unix_state_unlock(s);
308 return peer;
309 }
310 EXPORT_SYMBOL_GPL(unix_peer_get);
311
312 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
313 int addr_len)
314 {
315 struct unix_address *addr;
316
317 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
318 if (!addr)
319 return NULL;
320
321 refcount_set(&addr->refcnt, 1);
322 addr->len = addr_len;
323 memcpy(addr->name, sunaddr, addr_len);
324
325 return addr;
326 }
327
328 static inline void unix_release_addr(struct unix_address *addr)
329 {
330 if (refcount_dec_and_test(&addr->refcnt))
331 kfree(addr);
332 }
333
334 /*
335 * Check unix socket name:
336 * - it should not be zero length.
337 * - if it does not start with a zero byte, it must be NUL terminated (FS object).
338 * - if it starts with a zero byte, it is an abstract name.
339 */
340
341 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
342 {
343 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
344 addr_len > sizeof(*sunaddr))
345 return -EINVAL;
346
347 if (sunaddr->sun_family != AF_UNIX)
348 return -EINVAL;
349
350 return 0;
351 }
352
353 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
354 {
355 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
356 short offset = offsetof(struct sockaddr_storage, __data);
357
358 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
359
360 /* This may look like an off by one error but it is a bit more
361 * subtle. 108 is the longest valid AF_UNIX path for a binding.
362 * sun_path[108] doesn't as such exist. However in kernel space
363 * we are guaranteed that it is a valid memory location in our
364 * kernel address buffer because syscall functions always pass
365 * a pointer of struct sockaddr_storage which has a bigger buffer
366 * than 108. Also, we must terminate sun_path for strlen() in
367 * getname_kernel().
368 */
369 addr->__data[addr_len - offset] = 0;
370
371 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will
372 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
373 * know the actual buffer.
374 */
375 return strlen(addr->__data) + offset + 1;
376 }
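/* Worked example (illustrative): for sun_path "/tmp/x" the caller may pass
 * an addr_len of offsetof(struct sockaddr_un, sun_path) + 6 (no NUL), + 7
 * (including the NUL), or something longer with extra bytes after the NUL;
 * in all of these cases the function above returns offsetof(...) + 7, i.e.
 * the length of the C string plus its terminator.
 */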
377
378 static void __unix_remove_socket(struct sock *sk)
379 {
380 sk_del_node_init(sk);
381 }
382
383 static void __unix_insert_socket(struct net *net, struct sock *sk)
384 {
385 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
386 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
387 }
388
389 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
390 struct unix_address *addr, unsigned int hash)
391 {
392 __unix_remove_socket(sk);
393 smp_store_release(&unix_sk(sk)->addr, addr);
394
395 sk->sk_hash = hash;
396 __unix_insert_socket(net, sk);
397 }
398
399 static void unix_remove_socket(struct net *net, struct sock *sk)
400 {
401 spin_lock(&net->unx.table.locks[sk->sk_hash]);
402 __unix_remove_socket(sk);
403 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
404 }
405
406 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
407 {
408 spin_lock(&net->unx.table.locks[sk->sk_hash]);
409 __unix_insert_socket(net, sk);
410 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
411 }
412
413 static void unix_insert_bsd_socket(struct sock *sk)
414 {
415 spin_lock(&bsd_socket_locks[sk->sk_hash]);
416 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
417 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
418 }
419
420 static void unix_remove_bsd_socket(struct sock *sk)
421 {
422 if (!hlist_unhashed(&sk->sk_bind_node)) {
423 spin_lock(&bsd_socket_locks[sk->sk_hash]);
424 __sk_del_bind_node(sk);
425 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
426
427 sk_node_init(&sk->sk_bind_node);
428 }
429 }
430
431 static struct sock *__unix_find_socket_byname(struct net *net,
432 struct sockaddr_un *sunname,
433 int len, unsigned int hash)
434 {
435 struct sock *s;
436
437 sk_for_each(s, &net->unx.table.buckets[hash]) {
438 struct unix_sock *u = unix_sk(s);
439
440 if (u->addr->len == len &&
441 !memcmp(u->addr->name, sunname, len))
442 return s;
443 }
444 return NULL;
445 }
446
447 static inline struct sock *unix_find_socket_byname(struct net *net,
448 struct sockaddr_un *sunname,
449 int len, unsigned int hash)
450 {
451 struct sock *s;
452
453 spin_lock(&net->unx.table.locks[hash]);
454 s = __unix_find_socket_byname(net, sunname, len, hash);
455 if (s)
456 sock_hold(s);
457 spin_unlock(&net->unx.table.locks[hash]);
458 return s;
459 }
460
461 static struct sock *unix_find_socket_byinode(struct inode *i)
462 {
463 unsigned int hash = unix_bsd_hash(i);
464 struct sock *s;
465
466 spin_lock(&bsd_socket_locks[hash]);
467 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
468 struct dentry *dentry = unix_sk(s)->path.dentry;
469
470 if (dentry && d_backing_inode(dentry) == i) {
471 sock_hold(s);
472 spin_unlock(&bsd_socket_locks[hash]);
473 return s;
474 }
475 }
476 spin_unlock(&bsd_socket_locks[hash]);
477 return NULL;
478 }
479
480 /* Support code for asymmetrically connected dgram sockets
481 *
482 * If a datagram socket is connected to a socket not itself connected
483 * to the first socket (eg, /dev/log), clients may only enqueue more
484 * messages if the present receive queue of the server socket is not
485 * "too large". This means there's a second writeability condition
486 * poll and sendmsg need to test. The dgram recv code will do a wake
487 * up on the peer_wait wait queue of a socket upon reception of a
488 * datagram which needs to be propagated to sleeping would-be writers
489 * since these might not have sent anything so far. This can't be
490 * accomplished via poll_wait because the lifetime of the server
491 * socket might be less than that of its clients if these break their
492 * association with it or if the server socket is closed while clients
493 * are still connected to it, and there's no way to inform "a polling
494 * implementation" that it should let go of a certain wait queue.
495 *
496 * In order to propagate a wake up, a wait_queue_entry_t of the client
497 * socket is enqueued on the peer_wait queue of the server socket
498 * whose wake function does a wake_up on the ordinary client socket
499 * wait queue. This connection is established whenever a write (or
500 * poll for write) hits the flow control condition and is broken when
501 * the association to the server socket is dissolved or after a wake up
502 * was relayed.
503 */
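/* Illustration only (not part of the original file): the asymmetric case
 * described above, sketched from userspace (the path is hypothetical).
 * Many clients connect() to one datagram server that never connects back;
 * once the server's receive queue is full, a client has to wait for
 * POLLOUT, and that wake-up is relayed through the peer_wait machinery
 * below:
 *
 *	#include <poll.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	struct sockaddr_un srv = { .sun_family = AF_UNIX };
 *	strcpy(srv.sun_path, "/run/example-log");	// like /dev/log
 *	connect(fd, (struct sockaddr *)&srv, sizeof(srv));
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);	// sleeps until the server drains its queue
 *	send(fd, "hello", 5, 0);
 */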
504
505 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
506 void *key)
507 {
508 struct unix_sock *u;
509 wait_queue_head_t *u_sleep;
510
511 u = container_of(q, struct unix_sock, peer_wake);
512
513 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
514 q);
515 u->peer_wake.private = NULL;
516
517 /* relaying can only happen while the wq still exists */
518 u_sleep = sk_sleep(&u->sk);
519 if (u_sleep)
520 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
521
522 return 0;
523 }
524
525 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
526 {
527 struct unix_sock *u, *u_other;
528 int rc;
529
530 u = unix_sk(sk);
531 u_other = unix_sk(other);
532 rc = 0;
533 spin_lock(&u_other->peer_wait.lock);
534
535 if (!u->peer_wake.private) {
536 u->peer_wake.private = other;
537 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
538
539 rc = 1;
540 }
541
542 spin_unlock(&u_other->peer_wait.lock);
543 return rc;
544 }
545
546 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
547 struct sock *other)
548 {
549 struct unix_sock *u, *u_other;
550
551 u = unix_sk(sk);
552 u_other = unix_sk(other);
553 spin_lock(&u_other->peer_wait.lock);
554
555 if (u->peer_wake.private == other) {
556 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
557 u->peer_wake.private = NULL;
558 }
559
560 spin_unlock(&u_other->peer_wait.lock);
561 }
562
563 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
564 struct sock *other)
565 {
566 unix_dgram_peer_wake_disconnect(sk, other);
567 wake_up_interruptible_poll(sk_sleep(sk),
568 EPOLLOUT |
569 EPOLLWRNORM |
570 EPOLLWRBAND);
571 }
572
573 /* preconditions:
574 * - unix_peer(sk) == other
575 * - association is stable
576 */
577 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
578 {
579 int connected;
580
581 connected = unix_dgram_peer_wake_connect(sk, other);
582
583 /* If other is SOCK_DEAD, we want to make sure we signal
584 * POLLOUT, such that a subsequent write() can get a
585 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
586 * to other and it's full, we will hang waiting for POLLOUT.
587 */
588 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
589 return 1;
590
591 if (connected)
592 unix_dgram_peer_wake_disconnect(sk, other);
593
594 return 0;
595 }
596
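/* A socket counts as writable while it is not listening and its in-flight
 * write memory does not exceed a quarter of sk_sndbuf
 * ((wmem_alloc << 2) <= sk_sndbuf).
 */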
597 static int unix_writable(const struct sock *sk, unsigned char state)
598 {
599 return state != TCP_LISTEN &&
600 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
601 }
602
603 static void unix_write_space(struct sock *sk)
604 {
605 struct socket_wq *wq;
606
607 rcu_read_lock();
608 if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
609 wq = rcu_dereference(sk->sk_wq);
610 if (skwq_has_sleeper(wq))
611 wake_up_interruptible_sync_poll(&wq->wait,
612 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
613 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
614 }
615 rcu_read_unlock();
616 }
617
618 /* When a dgram socket disconnects (or changes its peer), we clear its receive
619 * queue of packets that arrived from the previous peer. First, this allows us to do
620 * flow control based only on wmem_alloc; second, an sk connected to a peer
621 * may receive messages only from that peer. */
622 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
623 {
624 if (!skb_queue_empty(&sk->sk_receive_queue)) {
625 skb_queue_purge_reason(&sk->sk_receive_queue,
626 SKB_DROP_REASON_UNIX_DISCONNECT);
627
628 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
629
630 /* If one link of bidirectional dgram pipe is disconnected,
631 * we signal an error. Messages are lost. Do not do this
632 * when the peer was not connected to us.
633 */
634 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
635 WRITE_ONCE(other->sk_err, ECONNRESET);
636 sk_error_report(other);
637 }
638 }
639 }
640
641 static void unix_sock_destructor(struct sock *sk)
642 {
643 struct unix_sock *u = unix_sk(sk);
644
645 skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
646
647 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
648 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
649 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
650 if (!sock_flag(sk, SOCK_DEAD)) {
651 pr_info("Attempt to release alive unix socket: %p\n", sk);
652 return;
653 }
654
655 if (u->addr)
656 unix_release_addr(u->addr);
657
658 atomic_long_dec(&unix_nr_socks);
659 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
660 #ifdef UNIX_REFCNT_DEBUG
661 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
662 atomic_long_read(&unix_nr_socks));
663 #endif
664 }
665
666 static void unix_release_sock(struct sock *sk, int embrion)
667 {
668 struct unix_sock *u = unix_sk(sk);
669 struct sock *skpair;
670 struct sk_buff *skb;
671 struct path path;
672 int state;
673
674 unix_remove_socket(sock_net(sk), sk);
675 unix_remove_bsd_socket(sk);
676
677 /* Clear state */
678 unix_state_lock(sk);
679 sock_orphan(sk);
680 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
681 path = u->path;
682 u->path.dentry = NULL;
683 u->path.mnt = NULL;
684 state = sk->sk_state;
685 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
686
687 skpair = unix_peer(sk);
688 unix_peer(sk) = NULL;
689
690 unix_state_unlock(sk);
691
692 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
693 u->oob_skb = NULL;
694 #endif
695
696 wake_up_interruptible_all(&u->peer_wait);
697
698 if (skpair != NULL) {
699 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
700 unix_state_lock(skpair);
701 /* No more writes */
702 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
703 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
704 WRITE_ONCE(skpair->sk_err, ECONNRESET);
705 unix_state_unlock(skpair);
706 skpair->sk_state_change(skpair);
707 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
708 }
709
710 unix_dgram_peer_wake_disconnect(sk, skpair);
711 sock_put(skpair); /* It may now die */
712 }
713
714 /* Try to flush out this socket. Throw out buffers at least */
715
716 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
717 if (state == TCP_LISTEN)
718 unix_release_sock(skb->sk, 1);
719
720 /* passed fds are erased in the kfree_skb hook */
721 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
722 }
723
724 if (path.dentry)
725 path_put(&path);
726
727 sock_put(sk);
728
729 /* ---- Socket is dead now and most probably destroyed ---- */
730
731 /*
732 * Fixme: BSD difference: In BSD all sockets connected to us get
733 * ECONNRESET and we die on the spot. In Linux we behave
734 * like files and pipes do and wait for the last
735 * dereference.
736 *
737 * Can't we simply set sock->err?
738 *
739 * What the above comment does talk about? --ANK(980817)
740 */
741
742 if (READ_ONCE(unix_tot_inflight))
743 unix_gc(); /* Garbage collect fds */
744 }
745
746 static void init_peercred(struct sock *sk)
747 {
748 sk->sk_peer_pid = get_pid(task_tgid(current));
749 sk->sk_peer_cred = get_current_cred();
750 }
751
752 static void update_peercred(struct sock *sk)
753 {
754 const struct cred *old_cred;
755 struct pid *old_pid;
756
757 spin_lock(&sk->sk_peer_lock);
758 old_pid = sk->sk_peer_pid;
759 old_cred = sk->sk_peer_cred;
760 init_peercred(sk);
761 spin_unlock(&sk->sk_peer_lock);
762
763 put_pid(old_pid);
764 put_cred(old_cred);
765 }
766
767 static void copy_peercred(struct sock *sk, struct sock *peersk)
768 {
769 lockdep_assert_held(&unix_sk(peersk)->lock);
770
771 spin_lock(&sk->sk_peer_lock);
772 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
773 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
774 spin_unlock(&sk->sk_peer_lock);
775 }
776
777 static int unix_listen(struct socket *sock, int backlog)
778 {
779 int err;
780 struct sock *sk = sock->sk;
781 struct unix_sock *u = unix_sk(sk);
782
783 err = -EOPNOTSUPP;
784 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
785 goto out; /* Only stream/seqpacket sockets accept */
786 err = -EINVAL;
787 if (!READ_ONCE(u->addr))
788 goto out; /* No listens on an unbound socket */
789 unix_state_lock(sk);
790 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
791 goto out_unlock;
792 if (backlog > sk->sk_max_ack_backlog)
793 wake_up_interruptible_all(&u->peer_wait);
794 sk->sk_max_ack_backlog = backlog;
795 WRITE_ONCE(sk->sk_state, TCP_LISTEN);
796
797 /* set credentials so connect can copy them */
798 update_peercred(sk);
799 err = 0;
800
801 out_unlock:
802 unix_state_unlock(sk);
803 out:
804 return err;
805 }
806
807 static int unix_release(struct socket *);
808 static int unix_bind(struct socket *, struct sockaddr *, int);
809 static int unix_stream_connect(struct socket *, struct sockaddr *,
810 int addr_len, int flags);
811 static int unix_socketpair(struct socket *, struct socket *);
812 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
813 static int unix_getname(struct socket *, struct sockaddr *, int);
814 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
815 static __poll_t unix_dgram_poll(struct file *, struct socket *,
816 poll_table *);
817 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
818 #ifdef CONFIG_COMPAT
819 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
820 #endif
821 static int unix_shutdown(struct socket *, int);
822 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
823 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
824 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
825 struct pipe_inode_info *, size_t size,
826 unsigned int flags);
827 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
828 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
829 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
830 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
831 static int unix_dgram_connect(struct socket *, struct sockaddr *,
832 int, int);
833 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
834 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
835 int);
836
837 #ifdef CONFIG_PROC_FS
838 static int unix_count_nr_fds(struct sock *sk)
839 {
840 struct sk_buff *skb;
841 struct unix_sock *u;
842 int nr_fds = 0;
843
844 spin_lock(&sk->sk_receive_queue.lock);
845 skb = skb_peek(&sk->sk_receive_queue);
846 while (skb) {
847 u = unix_sk(skb->sk);
848 nr_fds += atomic_read(&u->scm_stat.nr_fds);
849 skb = skb_peek_next(skb, &sk->sk_receive_queue);
850 }
851 spin_unlock(&sk->sk_receive_queue.lock);
852
853 return nr_fds;
854 }
855
856 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
857 {
858 struct sock *sk = sock->sk;
859 unsigned char s_state;
860 struct unix_sock *u;
861 int nr_fds = 0;
862
863 if (sk) {
864 s_state = READ_ONCE(sk->sk_state);
865 u = unix_sk(sk);
866
867 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
868 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
869 * SOCK_DGRAM is ordinary. So, no lock is needed.
870 */
871 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
872 nr_fds = atomic_read(&u->scm_stat.nr_fds);
873 else if (s_state == TCP_LISTEN)
874 nr_fds = unix_count_nr_fds(sk);
875
876 seq_printf(m, "scm_fds: %u\n", nr_fds);
877 }
878 }
879 #else
880 #define unix_show_fdinfo NULL
881 #endif
882
883 static const struct proto_ops unix_stream_ops = {
884 .family = PF_UNIX,
885 .owner = THIS_MODULE,
886 .release = unix_release,
887 .bind = unix_bind,
888 .connect = unix_stream_connect,
889 .socketpair = unix_socketpair,
890 .accept = unix_accept,
891 .getname = unix_getname,
892 .poll = unix_poll,
893 .ioctl = unix_ioctl,
894 #ifdef CONFIG_COMPAT
895 .compat_ioctl = unix_compat_ioctl,
896 #endif
897 .listen = unix_listen,
898 .shutdown = unix_shutdown,
899 .sendmsg = unix_stream_sendmsg,
900 .recvmsg = unix_stream_recvmsg,
901 .read_skb = unix_stream_read_skb,
902 .mmap = sock_no_mmap,
903 .splice_read = unix_stream_splice_read,
904 .set_peek_off = sk_set_peek_off,
905 .show_fdinfo = unix_show_fdinfo,
906 };
907
908 static const struct proto_ops unix_dgram_ops = {
909 .family = PF_UNIX,
910 .owner = THIS_MODULE,
911 .release = unix_release,
912 .bind = unix_bind,
913 .connect = unix_dgram_connect,
914 .socketpair = unix_socketpair,
915 .accept = sock_no_accept,
916 .getname = unix_getname,
917 .poll = unix_dgram_poll,
918 .ioctl = unix_ioctl,
919 #ifdef CONFIG_COMPAT
920 .compat_ioctl = unix_compat_ioctl,
921 #endif
922 .listen = sock_no_listen,
923 .shutdown = unix_shutdown,
924 .sendmsg = unix_dgram_sendmsg,
925 .read_skb = unix_read_skb,
926 .recvmsg = unix_dgram_recvmsg,
927 .mmap = sock_no_mmap,
928 .set_peek_off = sk_set_peek_off,
929 .show_fdinfo = unix_show_fdinfo,
930 };
931
932 static const struct proto_ops unix_seqpacket_ops = {
933 .family = PF_UNIX,
934 .owner = THIS_MODULE,
935 .release = unix_release,
936 .bind = unix_bind,
937 .connect = unix_stream_connect,
938 .socketpair = unix_socketpair,
939 .accept = unix_accept,
940 .getname = unix_getname,
941 .poll = unix_dgram_poll,
942 .ioctl = unix_ioctl,
943 #ifdef CONFIG_COMPAT
944 .compat_ioctl = unix_compat_ioctl,
945 #endif
946 .listen = unix_listen,
947 .shutdown = unix_shutdown,
948 .sendmsg = unix_seqpacket_sendmsg,
949 .recvmsg = unix_seqpacket_recvmsg,
950 .mmap = sock_no_mmap,
951 .set_peek_off = sk_set_peek_off,
952 .show_fdinfo = unix_show_fdinfo,
953 };
954
955 static void unix_close(struct sock *sk, long timeout)
956 {
957 /* Nothing to do here, unix socket does not need a ->close().
958 * This is merely for sockmap.
959 */
960 }
961
962 static void unix_unhash(struct sock *sk)
963 {
964 /* Nothing to do here, unix socket does not need a ->unhash().
965 * This is merely for sockmap.
966 */
967 }
968
969 static bool unix_bpf_bypass_getsockopt(int level, int optname)
970 {
971 if (level == SOL_SOCKET) {
972 switch (optname) {
973 case SO_PEERPIDFD:
974 return true;
975 default:
976 return false;
977 }
978 }
979
980 return false;
981 }
982
983 struct proto unix_dgram_proto = {
984 .name = "UNIX",
985 .owner = THIS_MODULE,
986 .obj_size = sizeof(struct unix_sock),
987 .close = unix_close,
988 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
989 #ifdef CONFIG_BPF_SYSCALL
990 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
991 #endif
992 };
993
994 struct proto unix_stream_proto = {
995 .name = "UNIX-STREAM",
996 .owner = THIS_MODULE,
997 .obj_size = sizeof(struct unix_sock),
998 .close = unix_close,
999 .unhash = unix_unhash,
1000 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
1001 #ifdef CONFIG_BPF_SYSCALL
1002 .psock_update_sk_prot = unix_stream_bpf_update_proto,
1003 #endif
1004 };
1005
1006 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1007 {
1008 struct unix_sock *u;
1009 struct sock *sk;
1010 int err;
1011
1012 atomic_long_inc(&unix_nr_socks);
1013 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1014 err = -ENFILE;
1015 goto err;
1016 }
1017
1018 if (type == SOCK_STREAM)
1019 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1020 else /*dgram and seqpacket */
1021 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1022
1023 if (!sk) {
1024 err = -ENOMEM;
1025 goto err;
1026 }
1027
1028 sock_init_data(sock, sk);
1029
1030 sk->sk_hash = unix_unbound_hash(sk);
1031 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
1032 sk->sk_write_space = unix_write_space;
1033 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1034 sk->sk_destruct = unix_sock_destructor;
1035 lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1036
1037 u = unix_sk(sk);
1038 u->listener = NULL;
1039 u->vertex = NULL;
1040 u->path.dentry = NULL;
1041 u->path.mnt = NULL;
1042 spin_lock_init(&u->lock);
1043 lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1044 mutex_init(&u->iolock); /* single task reading lock */
1045 mutex_init(&u->bindlock); /* single task binding lock */
1046 init_waitqueue_head(&u->peer_wait);
1047 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1048 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1049 unix_insert_unbound_socket(net, sk);
1050
1051 sock_prot_inuse_add(net, sk->sk_prot, 1);
1052
1053 return sk;
1054
1055 err:
1056 atomic_long_dec(&unix_nr_socks);
1057 return ERR_PTR(err);
1058 }
1059
1060 static int unix_create(struct net *net, struct socket *sock, int protocol,
1061 int kern)
1062 {
1063 struct sock *sk;
1064
1065 if (protocol && protocol != PF_UNIX)
1066 return -EPROTONOSUPPORT;
1067
1068 sock->state = SS_UNCONNECTED;
1069
1070 switch (sock->type) {
1071 case SOCK_STREAM:
1072 sock->ops = &unix_stream_ops;
1073 break;
1074 /*
1075 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1076 * nothing uses it.
1077 */
1078 case SOCK_RAW:
1079 sock->type = SOCK_DGRAM;
1080 fallthrough;
1081 case SOCK_DGRAM:
1082 sock->ops = &unix_dgram_ops;
1083 break;
1084 case SOCK_SEQPACKET:
1085 sock->ops = &unix_seqpacket_ops;
1086 break;
1087 default:
1088 return -ESOCKTNOSUPPORT;
1089 }
1090
1091 sk = unix_create1(net, sock, kern, sock->type);
1092 if (IS_ERR(sk))
1093 return PTR_ERR(sk);
1094
1095 return 0;
1096 }
1097
1098 static int unix_release(struct socket *sock)
1099 {
1100 struct sock *sk = sock->sk;
1101
1102 if (!sk)
1103 return 0;
1104
1105 sk->sk_prot->close(sk, 0);
1106 unix_release_sock(sk, 0);
1107 sock->sk = NULL;
1108
1109 return 0;
1110 }
1111
1112 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1113 int type)
1114 {
1115 struct inode *inode;
1116 struct path path;
1117 struct sock *sk;
1118 int err;
1119
1120 unix_mkname_bsd(sunaddr, addr_len);
1121 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1122 if (err)
1123 goto fail;
1124
1125 err = path_permission(&path, MAY_WRITE);
1126 if (err)
1127 goto path_put;
1128
1129 err = -ECONNREFUSED;
1130 inode = d_backing_inode(path.dentry);
1131 if (!S_ISSOCK(inode->i_mode))
1132 goto path_put;
1133
1134 sk = unix_find_socket_byinode(inode);
1135 if (!sk)
1136 goto path_put;
1137
1138 err = -EPROTOTYPE;
1139 if (sk->sk_type == type)
1140 touch_atime(&path);
1141 else
1142 goto sock_put;
1143
1144 path_put(&path);
1145
1146 return sk;
1147
1148 sock_put:
1149 sock_put(sk);
1150 path_put:
1151 path_put(&path);
1152 fail:
1153 return ERR_PTR(err);
1154 }
1155
1156 static struct sock *unix_find_abstract(struct net *net,
1157 struct sockaddr_un *sunaddr,
1158 int addr_len, int type)
1159 {
1160 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1161 struct dentry *dentry;
1162 struct sock *sk;
1163
1164 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1165 if (!sk)
1166 return ERR_PTR(-ECONNREFUSED);
1167
1168 dentry = unix_sk(sk)->path.dentry;
1169 if (dentry)
1170 touch_atime(&unix_sk(sk)->path);
1171
1172 return sk;
1173 }
1174
1175 static struct sock *unix_find_other(struct net *net,
1176 struct sockaddr_un *sunaddr,
1177 int addr_len, int type)
1178 {
1179 struct sock *sk;
1180
1181 if (sunaddr->sun_path[0])
1182 sk = unix_find_bsd(sunaddr, addr_len, type);
1183 else
1184 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1185
1186 return sk;
1187 }
1188
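/* Autobind assigns an abstract address of the form "\0" followed by five
 * lowercase hex digits (addr->len ends up as
 * offsetof(struct sockaddr_un, sun_path) + 6), retrying with the next
 * ordernum until an unused name is found or the whole 20-bit space has been
 * scanned (-ENOSPC).
 */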
1189 static int unix_autobind(struct sock *sk)
1190 {
1191 struct unix_sock *u = unix_sk(sk);
1192 unsigned int new_hash, old_hash;
1193 struct net *net = sock_net(sk);
1194 struct unix_address *addr;
1195 u32 lastnum, ordernum;
1196 int err;
1197
1198 err = mutex_lock_interruptible(&u->bindlock);
1199 if (err)
1200 return err;
1201
1202 if (u->addr)
1203 goto out;
1204
1205 err = -ENOMEM;
1206 addr = kzalloc(sizeof(*addr) +
1207 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1208 if (!addr)
1209 goto out;
1210
1211 addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1212 addr->name->sun_family = AF_UNIX;
1213 refcount_set(&addr->refcnt, 1);
1214
1215 old_hash = sk->sk_hash;
1216 ordernum = get_random_u32();
1217 lastnum = ordernum & 0xFFFFF;
1218 retry:
1219 ordernum = (ordernum + 1) & 0xFFFFF;
1220 sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1221
1222 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1223 unix_table_double_lock(net, old_hash, new_hash);
1224
1225 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1226 unix_table_double_unlock(net, old_hash, new_hash);
1227
1228 /* __unix_find_socket_byname() may take a long time if many names
1229 * are already in use.
1230 */
1231 cond_resched();
1232
1233 if (ordernum == lastnum) {
1234 /* Give up if all names seem to be in use. */
1235 err = -ENOSPC;
1236 unix_release_addr(addr);
1237 goto out;
1238 }
1239
1240 goto retry;
1241 }
1242
1243 __unix_set_addr_hash(net, sk, addr, new_hash);
1244 unix_table_double_unlock(net, old_hash, new_hash);
1245 err = 0;
1246
1247 out: mutex_unlock(&u->bindlock);
1248 return err;
1249 }
1250
1251 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1252 int addr_len)
1253 {
1254 umode_t mode = S_IFSOCK |
1255 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1256 struct unix_sock *u = unix_sk(sk);
1257 unsigned int new_hash, old_hash;
1258 struct net *net = sock_net(sk);
1259 struct mnt_idmap *idmap;
1260 struct unix_address *addr;
1261 struct dentry *dentry;
1262 struct path parent;
1263 int err;
1264
1265 addr_len = unix_mkname_bsd(sunaddr, addr_len);
1266 addr = unix_create_addr(sunaddr, addr_len);
1267 if (!addr)
1268 return -ENOMEM;
1269
1270 /*
1271 * Get the parent directory, calculate the hash for last
1272 * component.
1273 */
1274 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1275 if (IS_ERR(dentry)) {
1276 err = PTR_ERR(dentry);
1277 goto out;
1278 }
1279
1280 /*
1281 * All right, let's create it.
1282 */
1283 idmap = mnt_idmap(parent.mnt);
1284 err = security_path_mknod(&parent, dentry, mode, 0);
1285 if (!err)
1286 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1287 if (err)
1288 goto out_path;
1289 err = mutex_lock_interruptible(&u->bindlock);
1290 if (err)
1291 goto out_unlink;
1292 if (u->addr)
1293 goto out_unlock;
1294
1295 old_hash = sk->sk_hash;
1296 new_hash = unix_bsd_hash(d_backing_inode(dentry));
1297 unix_table_double_lock(net, old_hash, new_hash);
1298 u->path.mnt = mntget(parent.mnt);
1299 u->path.dentry = dget(dentry);
1300 __unix_set_addr_hash(net, sk, addr, new_hash);
1301 unix_table_double_unlock(net, old_hash, new_hash);
1302 unix_insert_bsd_socket(sk);
1303 mutex_unlock(&u->bindlock);
1304 done_path_create(&parent, dentry);
1305 return 0;
1306
1307 out_unlock:
1308 mutex_unlock(&u->bindlock);
1309 err = -EINVAL;
1310 out_unlink:
1311 /* failed after successful mknod? unlink what we'd created... */
1312 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1313 out_path:
1314 done_path_create(&parent, dentry);
1315 out:
1316 unix_release_addr(addr);
1317 return err == -EEXIST ? -EADDRINUSE : err;
1318 }
1319
1320 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1321 int addr_len)
1322 {
1323 struct unix_sock *u = unix_sk(sk);
1324 unsigned int new_hash, old_hash;
1325 struct net *net = sock_net(sk);
1326 struct unix_address *addr;
1327 int err;
1328
1329 addr = unix_create_addr(sunaddr, addr_len);
1330 if (!addr)
1331 return -ENOMEM;
1332
1333 err = mutex_lock_interruptible(&u->bindlock);
1334 if (err)
1335 goto out;
1336
1337 if (u->addr) {
1338 err = -EINVAL;
1339 goto out_mutex;
1340 }
1341
1342 old_hash = sk->sk_hash;
1343 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1344 unix_table_double_lock(net, old_hash, new_hash);
1345
1346 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1347 goto out_spin;
1348
1349 __unix_set_addr_hash(net, sk, addr, new_hash);
1350 unix_table_double_unlock(net, old_hash, new_hash);
1351 mutex_unlock(&u->bindlock);
1352 return 0;
1353
1354 out_spin:
1355 unix_table_double_unlock(net, old_hash, new_hash);
1356 err = -EADDRINUSE;
1357 out_mutex:
1358 mutex_unlock(&u->bindlock);
1359 out:
1360 unix_release_addr(addr);
1361 return err;
1362 }
1363
1364 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1365 {
1366 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1367 struct sock *sk = sock->sk;
1368 int err;
1369
1370 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1371 sunaddr->sun_family == AF_UNIX)
1372 return unix_autobind(sk);
1373
1374 err = unix_validate_addr(sunaddr, addr_len);
1375 if (err)
1376 return err;
1377
1378 if (sunaddr->sun_path[0])
1379 err = unix_bind_bsd(sk, sunaddr, addr_len);
1380 else
1381 err = unix_bind_abstract(sk, sunaddr, addr_len);
1382
1383 return err;
1384 }
1385
1386 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1387 {
1388 if (unlikely(sk1 == sk2) || !sk2) {
1389 unix_state_lock(sk1);
1390 return;
1391 }
1392
1393 if (sk1 > sk2)
1394 swap(sk1, sk2);
1395
1396 unix_state_lock(sk1);
1397 unix_state_lock(sk2);
1398 }
1399
1400 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1401 {
1402 if (unlikely(sk1 == sk2) || !sk2) {
1403 unix_state_unlock(sk1);
1404 return;
1405 }
1406 unix_state_unlock(sk1);
1407 unix_state_unlock(sk2);
1408 }
1409
1410 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1411 int alen, int flags)
1412 {
1413 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1414 struct sock *sk = sock->sk;
1415 struct sock *other;
1416 int err;
1417
1418 err = -EINVAL;
1419 if (alen < offsetofend(struct sockaddr, sa_family))
1420 goto out;
1421
1422 if (addr->sa_family != AF_UNSPEC) {
1423 err = unix_validate_addr(sunaddr, alen);
1424 if (err)
1425 goto out;
1426
1427 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1428 if (err)
1429 goto out;
1430
1431 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1432 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1433 !READ_ONCE(unix_sk(sk)->addr)) {
1434 err = unix_autobind(sk);
1435 if (err)
1436 goto out;
1437 }
1438
1439 restart:
1440 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1441 if (IS_ERR(other)) {
1442 err = PTR_ERR(other);
1443 goto out;
1444 }
1445
1446 unix_state_double_lock(sk, other);
1447
1448 /* Apparently VFS overslept socket death. Retry. */
1449 if (sock_flag(other, SOCK_DEAD)) {
1450 unix_state_double_unlock(sk, other);
1451 sock_put(other);
1452 goto restart;
1453 }
1454
1455 err = -EPERM;
1456 if (!unix_may_send(sk, other))
1457 goto out_unlock;
1458
1459 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1460 if (err)
1461 goto out_unlock;
1462
1463 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1464 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1465 } else {
1466 /*
1467 * 1003.1g breaking connected state with AF_UNSPEC
1468 */
1469 other = NULL;
1470 unix_state_double_lock(sk, other);
1471 }
1472
1473 /*
1474 * If it was connected, reconnect.
1475 */
1476 if (unix_peer(sk)) {
1477 struct sock *old_peer = unix_peer(sk);
1478
1479 unix_peer(sk) = other;
1480 if (!other)
1481 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1482 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1483
1484 unix_state_double_unlock(sk, other);
1485
1486 if (other != old_peer) {
1487 unix_dgram_disconnected(sk, old_peer);
1488
1489 unix_state_lock(old_peer);
1490 if (!unix_peer(old_peer))
1491 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1492 unix_state_unlock(old_peer);
1493 }
1494
1495 sock_put(old_peer);
1496 } else {
1497 unix_peer(sk) = other;
1498 unix_state_double_unlock(sk, other);
1499 }
1500
1501 return 0;
1502
1503 out_unlock:
1504 unix_state_double_unlock(sk, other);
1505 sock_put(other);
1506 out:
1507 return err;
1508 }
1509
1510 static long unix_wait_for_peer(struct sock *other, long timeo)
1511 __releases(&unix_sk(other)->lock)
1512 {
1513 struct unix_sock *u = unix_sk(other);
1514 int sched;
1515 DEFINE_WAIT(wait);
1516
1517 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1518
1519 sched = !sock_flag(other, SOCK_DEAD) &&
1520 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1521 unix_recvq_full_lockless(other);
1522
1523 unix_state_unlock(other);
1524
1525 if (sched)
1526 timeo = schedule_timeout(timeo);
1527
1528 finish_wait(&u->peer_wait, &wait);
1529 return timeo;
1530 }
1531
1532 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1533 int addr_len, int flags)
1534 {
1535 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1536 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1537 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1538 struct net *net = sock_net(sk);
1539 struct sk_buff *skb = NULL;
1540 unsigned char state;
1541 long timeo;
1542 int err;
1543
1544 err = unix_validate_addr(sunaddr, addr_len);
1545 if (err)
1546 goto out;
1547
1548 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1549 if (err)
1550 goto out;
1551
1552 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1553 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1554 !READ_ONCE(u->addr)) {
1555 err = unix_autobind(sk);
1556 if (err)
1557 goto out;
1558 }
1559
1560 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1561
1562 /* First of all, allocate resources.
1563 * If we do this after the state is locked,
1564 * we will have to recheck everything again in any case.
1565 */
1566
1567 /* create new sock for complete connection */
1568 newsk = unix_create1(net, NULL, 0, sock->type);
1569 if (IS_ERR(newsk)) {
1570 err = PTR_ERR(newsk);
1571 goto out;
1572 }
1573
1574 /* Allocate skb for sending to listening sock */
1575 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1576 if (!skb) {
1577 err = -ENOMEM;
1578 goto out_free_sk;
1579 }
1580
1581 restart:
1582 /* Find listening sock. */
1583 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1584 if (IS_ERR(other)) {
1585 err = PTR_ERR(other);
1586 goto out_free_skb;
1587 }
1588
1589 unix_state_lock(other);
1590
1591 /* Apparently VFS overslept socket death. Retry. */
1592 if (sock_flag(other, SOCK_DEAD)) {
1593 unix_state_unlock(other);
1594 sock_put(other);
1595 goto restart;
1596 }
1597
1598 if (other->sk_state != TCP_LISTEN ||
1599 other->sk_shutdown & RCV_SHUTDOWN) {
1600 err = -ECONNREFUSED;
1601 goto out_unlock;
1602 }
1603
1604 if (unix_recvq_full_lockless(other)) {
1605 if (!timeo) {
1606 err = -EAGAIN;
1607 goto out_unlock;
1608 }
1609
1610 timeo = unix_wait_for_peer(other, timeo);
1611 sock_put(other);
1612
1613 err = sock_intr_errno(timeo);
1614 if (signal_pending(current))
1615 goto out_free_skb;
1616
1617 goto restart;
1618 }
1619
1620 /* self connect and simultaneous connect are eliminated
1621 * by rejecting TCP_LISTEN socket to avoid deadlock.
1622 */
1623 state = READ_ONCE(sk->sk_state);
1624 if (unlikely(state != TCP_CLOSE)) {
1625 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1626 goto out_unlock;
1627 }
1628
1629 unix_state_lock(sk);
1630
1631 if (unlikely(sk->sk_state != TCP_CLOSE)) {
1632 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1633 unix_state_unlock(sk);
1634 goto out_unlock;
1635 }
1636
1637 err = security_unix_stream_connect(sk, other, newsk);
1638 if (err) {
1639 unix_state_unlock(sk);
1640 goto out_unlock;
1641 }
1642
1643 /* The way is open! Fastly set all the necessary fields... */
1644
1645 sock_hold(sk);
1646 unix_peer(newsk) = sk;
1647 newsk->sk_state = TCP_ESTABLISHED;
1648 newsk->sk_type = sk->sk_type;
1649 init_peercred(newsk);
1650 newu = unix_sk(newsk);
1651 newu->listener = other;
1652 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1653 otheru = unix_sk(other);
1654
1655 /* copy address information from listening to new sock
1656 *
1657 * The contents of *(otheru->addr) and otheru->path
1658 * are seen fully set up here, since we have found
1659 * otheru in hash under its lock. Insertion into the
1660 * hash chain we'd found it in had been done in an
1661 * earlier critical area protected by the chain's lock,
1662 * the same one where we'd set *(otheru->addr) contents,
1663 * as well as otheru->path and otheru->addr itself.
1664 *
1665 * Using smp_store_release() here to set newu->addr
1666 * is enough to make those stores, as well as stores
1667 * to newu->path visible to anyone who gets newu->addr
1668 * by smp_load_acquire(). IOW, the same warranties
1669 * as for unix_sock instances bound in unix_bind() or
1670 * in unix_autobind().
1671 */
1672 if (otheru->path.dentry) {
1673 path_get(&otheru->path);
1674 newu->path = otheru->path;
1675 }
1676 refcount_inc(&otheru->addr->refcnt);
1677 smp_store_release(&newu->addr, otheru->addr);
1678
1679 /* Set credentials */
1680 copy_peercred(sk, other);
1681
1682 sock->state = SS_CONNECTED;
1683 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1684 sock_hold(newsk);
1685
1686 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1687 unix_peer(sk) = newsk;
1688
1689 unix_state_unlock(sk);
1690
1691 /* take ten and send info to listening sock */
1692 spin_lock(&other->sk_receive_queue.lock);
1693 __skb_queue_tail(&other->sk_receive_queue, skb);
1694 spin_unlock(&other->sk_receive_queue.lock);
1695 unix_state_unlock(other);
1696 other->sk_data_ready(other);
1697 sock_put(other);
1698 return 0;
1699
1700 out_unlock:
1701 unix_state_unlock(other);
1702 sock_put(other);
1703 out_free_skb:
1704 consume_skb(skb);
1705 out_free_sk:
1706 unix_release_sock(newsk, 0);
1707 out:
1708 return err;
1709 }
1710
1711 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1712 {
1713 struct sock *ska = socka->sk, *skb = sockb->sk;
1714
1715 /* Join our sockets back to back */
1716 sock_hold(ska);
1717 sock_hold(skb);
1718 unix_peer(ska) = skb;
1719 unix_peer(skb) = ska;
1720 init_peercred(ska);
1721 init_peercred(skb);
1722
1723 ska->sk_state = TCP_ESTABLISHED;
1724 skb->sk_state = TCP_ESTABLISHED;
1725 socka->state = SS_CONNECTED;
1726 sockb->state = SS_CONNECTED;
1727 return 0;
1728 }
1729
1730 static void unix_sock_inherit_flags(const struct socket *old,
1731 struct socket *new)
1732 {
1733 if (test_bit(SOCK_PASSCRED, &old->flags))
1734 set_bit(SOCK_PASSCRED, &new->flags);
1735 if (test_bit(SOCK_PASSPIDFD, &old->flags))
1736 set_bit(SOCK_PASSPIDFD, &new->flags);
1737 if (test_bit(SOCK_PASSSEC, &old->flags))
1738 set_bit(SOCK_PASSSEC, &new->flags);
1739 }
1740
1741 static int unix_accept(struct socket *sock, struct socket *newsock,
1742 struct proto_accept_arg *arg)
1743 {
1744 struct sock *sk = sock->sk;
1745 struct sk_buff *skb;
1746 struct sock *tsk;
1747
1748 arg->err = -EOPNOTSUPP;
1749 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1750 goto out;
1751
1752 arg->err = -EINVAL;
1753 if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1754 goto out;
1755
1756 /* If socket state is TCP_LISTEN it cannot change (for now...),
1757 * so that no locks are necessary.
1758 */
1759
1760 skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1761 &arg->err);
1762 if (!skb) {
1763 /* This means receive shutdown. */
1764 if (arg->err == 0)
1765 arg->err = -EINVAL;
1766 goto out;
1767 }
1768
1769 tsk = skb->sk;
1770 skb_free_datagram(sk, skb);
1771 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1772
1773 /* attach accepted sock to socket */
1774 unix_state_lock(tsk);
1775 unix_update_edges(unix_sk(tsk));
1776 newsock->state = SS_CONNECTED;
1777 unix_sock_inherit_flags(sock, newsock);
1778 sock_graft(tsk, newsock);
1779 unix_state_unlock(tsk);
1780 return 0;
1781
1782 out:
1783 return arg->err;
1784 }
1785
1786
1787 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1788 {
1789 struct sock *sk = sock->sk;
1790 struct unix_address *addr;
1791 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1792 int err = 0;
1793
1794 if (peer) {
1795 sk = unix_peer_get(sk);
1796
1797 err = -ENOTCONN;
1798 if (!sk)
1799 goto out;
1800 err = 0;
1801 } else {
1802 sock_hold(sk);
1803 }
1804
1805 addr = smp_load_acquire(&unix_sk(sk)->addr);
1806 if (!addr) {
1807 sunaddr->sun_family = AF_UNIX;
1808 sunaddr->sun_path[0] = 0;
1809 err = offsetof(struct sockaddr_un, sun_path);
1810 } else {
1811 err = addr->len;
1812 memcpy(sunaddr, addr->name, addr->len);
1813
1814 if (peer)
1815 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1816 CGROUP_UNIX_GETPEERNAME);
1817 else
1818 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1819 CGROUP_UNIX_GETSOCKNAME);
1820 }
1821 sock_put(sk);
1822 out:
1823 return err;
1824 }
1825
1826 /* The "user->unix_inflight" variable is protected by the garbage
1827 * collection lock, and we just read it locklessly here. If you go
1828 * over the limit, there might be a tiny race in actually noticing
1829 * it across threads. Tough.
1830 */
1831 static inline bool too_many_unix_fds(struct task_struct *p)
1832 {
1833 struct user_struct *user = current_user();
1834
1835 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1836 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1837 return false;
1838 }
1839
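/* SCM_RIGHTS plumbing: unix_attach_fds() moves the passed file list from
 * the scm cookie into the skb's control block and prepares it for
 * garbage-collection tracking, unix_detach_fds() moves it back on receive,
 * and unix_peek_fds() hands MSG_PEEK callers a duplicate so peeking does
 * not consume the descriptors.
 */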
1840 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1841 {
1842 if (too_many_unix_fds(current))
1843 return -ETOOMANYREFS;
1844
1845 UNIXCB(skb).fp = scm->fp;
1846 scm->fp = NULL;
1847
1848 if (unix_prepare_fpl(UNIXCB(skb).fp))
1849 return -ENOMEM;
1850
1851 return 0;
1852 }
1853
1854 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1855 {
1856 scm->fp = UNIXCB(skb).fp;
1857 UNIXCB(skb).fp = NULL;
1858
1859 unix_destroy_fpl(scm->fp);
1860 }
1861
1862 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1863 {
1864 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1865 }
1866
1867 static void unix_destruct_scm(struct sk_buff *skb)
1868 {
1869 struct scm_cookie scm;
1870
1871 memset(&scm, 0, sizeof(scm));
1872 scm.pid = UNIXCB(skb).pid;
1873 if (UNIXCB(skb).fp)
1874 unix_detach_fds(&scm, skb);
1875
1876 /* Alas, it calls VFS */
1877 	/* So fscking what? fput() has been SMP-safe since last summer */
1878 scm_destroy(&scm);
1879 sock_wfree(skb);
1880 }
1881
1882 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1883 {
1884 int err = 0;
1885
1886 UNIXCB(skb).pid = get_pid(scm->pid);
1887 UNIXCB(skb).uid = scm->creds.uid;
1888 UNIXCB(skb).gid = scm->creds.gid;
1889 UNIXCB(skb).fp = NULL;
1890 unix_get_secdata(scm, skb);
1891 if (scm->fp && send_fds)
1892 err = unix_attach_fds(scm, skb);
1893
1894 skb->destructor = unix_destruct_scm;
1895 return err;
1896 }
1897
1898 static bool unix_passcred_enabled(const struct socket *sock,
1899 const struct sock *other)
1900 {
1901 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1902 test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1903 !other->sk_socket ||
1904 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1905 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1906 }
1907
1908 /*
1909  * Some apps rely on write() giving SCM_CREDENTIALS.
1910  * We include credentials if the source or destination socket
1911  * asserted SOCK_PASSCRED.
1912 */
1913 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1914 const struct sock *other)
1915 {
1916 if (UNIXCB(skb).pid)
1917 return;
1918 if (unix_passcred_enabled(sock, other)) {
1919 UNIXCB(skb).pid = get_pid(task_tgid(current));
1920 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1921 }
1922 }
1923
1924 static bool unix_skb_scm_eq(struct sk_buff *skb,
1925 struct scm_cookie *scm)
1926 {
1927 return UNIXCB(skb).pid == scm->pid &&
1928 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1929 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1930 unix_secdata_eq(scm, skb);
1931 }
1932
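/* scm_stat_add()/scm_stat_del() keep a per-receiver count of in-flight fds
 * and, via unix_add_edges()/unix_del_edges(), maintain the graph that the
 * unix garbage collector walks to detect unreachable sockets.
 */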
1933 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1934 {
1935 struct scm_fp_list *fp = UNIXCB(skb).fp;
1936 struct unix_sock *u = unix_sk(sk);
1937
1938 if (unlikely(fp && fp->count)) {
1939 atomic_add(fp->count, &u->scm_stat.nr_fds);
1940 unix_add_edges(fp, u);
1941 }
1942 }
1943
1944 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1945 {
1946 struct scm_fp_list *fp = UNIXCB(skb).fp;
1947 struct unix_sock *u = unix_sk(sk);
1948
1949 if (unlikely(fp && fp->count)) {
1950 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1951 unix_del_edges(fp);
1952 }
1953 }
1954
1955 /*
1956 * Send AF_UNIX data.
1957 */
1958
1959 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1960 size_t len)
1961 {
1962 struct sock *sk = sock->sk, *other = NULL;
1963 struct unix_sock *u = unix_sk(sk);
1964 struct scm_cookie scm;
1965 struct sk_buff *skb;
1966 int data_len = 0;
1967 int sk_locked;
1968 long timeo;
1969 int err;
1970
1971 err = scm_send(sock, msg, &scm, false);
1972 if (err < 0)
1973 return err;
1974
1975 wait_for_unix_gc(scm.fp);
1976
1977 if (msg->msg_flags & MSG_OOB) {
1978 err = -EOPNOTSUPP;
1979 goto out;
1980 }
1981
1982 if (msg->msg_namelen) {
1983 err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
1984 if (err)
1985 goto out;
1986
1987 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1988 msg->msg_name,
1989 &msg->msg_namelen,
1990 NULL);
1991 if (err)
1992 goto out;
1993 }
1994
1995 if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1996 test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1997 !READ_ONCE(u->addr)) {
1998 err = unix_autobind(sk);
1999 if (err)
2000 goto out;
2001 }
2002
2003 if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
2004 err = -EMSGSIZE;
2005 goto out;
2006 }
2007
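	/* Large datagrams are split: roughly the first SKB_MAX_ALLOC bytes go
	 * into the linear head, and the remainder (page-aligned, capped at
	 * MAX_SKB_FRAGS pages) is carried in page fragments so no huge
	 * contiguous allocation is needed.
	 */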
2008 if (len > SKB_MAX_ALLOC) {
2009 data_len = min_t(size_t,
2010 len - SKB_MAX_ALLOC,
2011 MAX_SKB_FRAGS * PAGE_SIZE);
2012 data_len = PAGE_ALIGN(data_len);
2013
2014 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2015 }
2016
2017 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2018 msg->msg_flags & MSG_DONTWAIT, &err,
2019 PAGE_ALLOC_COSTLY_ORDER);
2020 if (!skb)
2021 goto out;
2022
2023 err = unix_scm_to_skb(&scm, skb, true);
2024 if (err < 0)
2025 goto out_free;
2026
2027 skb_put(skb, len - data_len);
2028 skb->data_len = data_len;
2029 skb->len = len;
2030 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2031 if (err)
2032 goto out_free;
2033
2034 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2035
2036 if (msg->msg_namelen) {
2037 lookup:
2038 other = unix_find_other(sock_net(sk), msg->msg_name,
2039 msg->msg_namelen, sk->sk_type);
2040 if (IS_ERR(other)) {
2041 err = PTR_ERR(other);
2042 goto out_free;
2043 }
2044 } else {
2045 other = unix_peer_get(sk);
2046 if (!other) {
2047 err = -ENOTCONN;
2048 goto out_free;
2049 }
2050 }
2051
2052 if (sk_filter(other, skb) < 0) {
2053 /* Toss the packet but do not return any error to the sender */
2054 err = len;
2055 goto out_sock_put;
2056 }
2057
2058 restart:
2059 sk_locked = 0;
2060 unix_state_lock(other);
2061 restart_locked:
2062
2063 if (!unix_may_send(sk, other)) {
2064 err = -EPERM;
2065 goto out_unlock;
2066 }
2067
2068 if (unlikely(sock_flag(other, SOCK_DEAD))) {
2069 		/* Check with 1003.1g - what should a datagram error return? */
2070
2071 unix_state_unlock(other);
2072
2073 if (sk->sk_type == SOCK_SEQPACKET) {
2074 			/* We get here only when racing with unix_release_sock(),
2075 			 * which is clearing @other. Never change the state to
2076 			 * TCP_CLOSE, unlike what SOCK_DGRAM wants.
2077 			 */
2078 err = -EPIPE;
2079 goto out_sock_put;
2080 }
2081
2082 if (!sk_locked)
2083 unix_state_lock(sk);
2084
2085 if (unix_peer(sk) == other) {
2086 unix_peer(sk) = NULL;
2087 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2088
2089 WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2090 unix_state_unlock(sk);
2091
2092 unix_dgram_disconnected(sk, other);
2093 sock_put(other);
2094 err = -ECONNREFUSED;
2095 goto out_sock_put;
2096 }
2097
2098 unix_state_unlock(sk);
2099
2100 if (!msg->msg_namelen) {
2101 err = -ECONNRESET;
2102 goto out_sock_put;
2103 }
2104
2105 sock_put(other);
2106 goto lookup;
2107 }
2108
2109 if (other->sk_shutdown & RCV_SHUTDOWN) {
2110 err = -EPIPE;
2111 goto out_unlock;
2112 }
2113
2114 if (sk->sk_type != SOCK_SEQPACKET) {
2115 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2116 if (err)
2117 goto out_unlock;
2118 }
2119
2120 /* other == sk && unix_peer(other) != sk if
2121 * - unix_peer(sk) == NULL, destination address bound to sk
2122 * - unix_peer(sk) == sk by time of get but disconnected before lock
2123 */
2124 if (other != sk &&
2125 unlikely(unix_peer(other) != sk &&
2126 unix_recvq_full_lockless(other))) {
2127 if (timeo) {
2128 timeo = unix_wait_for_peer(other, timeo);
2129
2130 err = sock_intr_errno(timeo);
2131 if (signal_pending(current))
2132 goto out_sock_put;
2133
2134 goto restart;
2135 }
2136
2137 if (!sk_locked) {
2138 unix_state_unlock(other);
2139 unix_state_double_lock(sk, other);
2140 }
2141
2142 if (unix_peer(sk) != other ||
2143 unix_dgram_peer_wake_me(sk, other)) {
2144 err = -EAGAIN;
2145 sk_locked = 1;
2146 goto out_unlock;
2147 }
2148
2149 if (!sk_locked) {
2150 sk_locked = 1;
2151 goto restart_locked;
2152 }
2153 }
2154
2155 if (unlikely(sk_locked))
2156 unix_state_unlock(sk);
2157
2158 if (sock_flag(other, SOCK_RCVTSTAMP))
2159 __net_timestamp(skb);
2160 maybe_add_creds(skb, sock, other);
2161 scm_stat_add(other, skb);
2162 skb_queue_tail(&other->sk_receive_queue, skb);
2163 unix_state_unlock(other);
2164 other->sk_data_ready(other);
2165 sock_put(other);
2166 scm_destroy(&scm);
2167 return len;
2168
2169 out_unlock:
2170 if (sk_locked)
2171 unix_state_unlock(sk);
2172 unix_state_unlock(other);
2173 out_sock_put:
2174 sock_put(other);
2175 out_free:
2176 consume_skb(skb);
2177 out:
2178 scm_destroy(&scm);
2179 return err;
2180 }
2181
2182 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2183  * bytes, with a minimum of a full page.
2184 */
2185 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
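/* For example, with 4 KiB pages get_order(32768) is 3, so UNIX_SKB_FRAGS_SZ
 * evaluates to 32 KiB; on configurations whose page size is 32 KiB or more
 * it collapses to a single page.
 */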
2186
2187 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2188 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2189 struct scm_cookie *scm, bool fds_sent)
2190 {
2191 struct unix_sock *ousk = unix_sk(other);
2192 struct sk_buff *skb;
2193 int err;
2194
2195 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2196
2197 if (!skb)
2198 return err;
2199
2200 err = unix_scm_to_skb(scm, skb, !fds_sent);
2201 if (err < 0)
2202 goto out;
2203
2204 skb_put(skb, 1);
2205 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2206
2207 if (err)
2208 goto out;
2209
2210 unix_state_lock(other);
2211
2212 if (sock_flag(other, SOCK_DEAD) ||
2213 (other->sk_shutdown & RCV_SHUTDOWN)) {
2214 unix_state_unlock(other);
2215 err = -EPIPE;
2216 goto out;
2217 }
2218
2219 maybe_add_creds(skb, sock, other);
2220 scm_stat_add(other, skb);
2221
2222 spin_lock(&other->sk_receive_queue.lock);
2223 WRITE_ONCE(ousk->oob_skb, skb);
2224 __skb_queue_tail(&other->sk_receive_queue, skb);
2225 spin_unlock(&other->sk_receive_queue.lock);
2226
2227 sk_send_sigurg(other);
2228 unix_state_unlock(other);
2229 other->sk_data_ready(other);
2230
2231 return 0;
2232 out:
2233 consume_skb(skb);
2234 return err;
2235 }
2236 #endif
2237
2238 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2239 size_t len)
2240 {
2241 struct sock *sk = sock->sk;
2242 struct sk_buff *skb = NULL;
2243 struct sock *other = NULL;
2244 struct scm_cookie scm;
2245 bool fds_sent = false;
2246 int err, sent = 0;
2247
2248 err = scm_send(sock, msg, &scm, false);
2249 if (err < 0)
2250 return err;
2251
2252 wait_for_unix_gc(scm.fp);
2253
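	/* MSG_OOB on a stream socket carries exactly one "urgent" byte: the
	 * last byte of the buffer is held back from the normal copy loop and,
	 * if CONFIG_AF_UNIX_OOB is enabled, queued separately via queue_oob()
	 * after the in-band data; otherwise the flag is rejected with
	 * -EOPNOTSUPP.
	 */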
2254 if (msg->msg_flags & MSG_OOB) {
2255 err = -EOPNOTSUPP;
2256 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2257 if (len)
2258 len--;
2259 else
2260 #endif
2261 goto out_err;
2262 }
2263
2264 if (msg->msg_namelen) {
2265 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2266 goto out_err;
2267 } else {
2268 other = unix_peer(sk);
2269 if (!other) {
2270 err = -ENOTCONN;
2271 goto out_err;
2272 }
2273 }
2274
2275 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2276 goto out_pipe;
2277
2278 while (sent < len) {
2279 int size = len - sent;
2280 int data_len;
2281
2282 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2283 skb = sock_alloc_send_pskb(sk, 0, 0,
2284 msg->msg_flags & MSG_DONTWAIT,
2285 &err, 0);
2286 } else {
2287 /* Keep two messages in the pipe so it schedules better */
2288 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2289
2290 /* allow fallback to order-0 allocations */
2291 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2292
2293 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2294
2295 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2296
2297 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2298 msg->msg_flags & MSG_DONTWAIT, &err,
2299 get_order(UNIX_SKB_FRAGS_SZ));
2300 }
2301 if (!skb)
2302 goto out_err;
2303
2304 /* Only send the fds in the first buffer */
2305 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2306 if (err < 0)
2307 goto out_free;
2308
2309 fds_sent = true;
2310
2311 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2312 skb->ip_summed = CHECKSUM_UNNECESSARY;
2313 err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2314 sk->sk_allocation);
2315 if (err < 0)
2316 goto out_free;
2317
2318 size = err;
2319 refcount_add(size, &sk->sk_wmem_alloc);
2320 } else {
2321 skb_put(skb, size - data_len);
2322 skb->data_len = data_len;
2323 skb->len = size;
2324 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2325 if (err)
2326 goto out_free;
2327 }
2328
2329 unix_state_lock(other);
2330
2331 if (sock_flag(other, SOCK_DEAD) ||
2332 (other->sk_shutdown & RCV_SHUTDOWN))
2333 goto out_pipe_unlock;
2334
2335 maybe_add_creds(skb, sock, other);
2336 scm_stat_add(other, skb);
2337 skb_queue_tail(&other->sk_receive_queue, skb);
2338 unix_state_unlock(other);
2339 other->sk_data_ready(other);
2340 sent += size;
2341 }
2342
2343 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2344 if (msg->msg_flags & MSG_OOB) {
2345 err = queue_oob(sock, msg, other, &scm, fds_sent);
2346 if (err)
2347 goto out_err;
2348 sent++;
2349 }
2350 #endif
2351
2352 scm_destroy(&scm);
2353
2354 return sent;
2355
2356 out_pipe_unlock:
2357 unix_state_unlock(other);
2358 out_pipe:
2359 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2360 send_sig(SIGPIPE, current, 0);
2361 err = -EPIPE;
2362 out_free:
2363 consume_skb(skb);
2364 out_err:
2365 scm_destroy(&scm);
2366 return sent ? : err;
2367 }
2368
2369 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2370 size_t len)
2371 {
2372 int err;
2373 struct sock *sk = sock->sk;
2374
2375 err = sock_error(sk);
2376 if (err)
2377 return err;
2378
2379 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2380 return -ENOTCONN;
2381
2382 if (msg->msg_namelen)
2383 msg->msg_namelen = 0;
2384
2385 return unix_dgram_sendmsg(sock, msg, len);
2386 }
2387
2388 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2389 size_t size, int flags)
2390 {
2391 struct sock *sk = sock->sk;
2392
2393 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2394 return -ENOTCONN;
2395
2396 return unix_dgram_recvmsg(sock, msg, size, flags);
2397 }
2398
2399 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2400 {
2401 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2402
2403 if (addr) {
2404 msg->msg_namelen = addr->len;
2405 memcpy(msg->msg_name, addr->name, addr->len);
2406 }
2407 }
2408
2409 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2410 int flags)
2411 {
2412 struct scm_cookie scm;
2413 struct socket *sock = sk->sk_socket;
2414 struct unix_sock *u = unix_sk(sk);
2415 struct sk_buff *skb, *last;
2416 long timeo;
2417 int skip;
2418 int err;
2419
2420 err = -EOPNOTSUPP;
2421 if (flags&MSG_OOB)
2422 goto out;
2423
2424 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2425
2426 do {
2427 mutex_lock(&u->iolock);
2428
2429 skip = sk_peek_offset(sk, flags);
2430 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2431 &skip, &err, &last);
2432 if (skb) {
2433 if (!(flags & MSG_PEEK))
2434 scm_stat_del(sk, skb);
2435 break;
2436 }
2437
2438 mutex_unlock(&u->iolock);
2439
2440 if (err != -EAGAIN)
2441 break;
2442 } while (timeo &&
2443 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2444 &err, &timeo, last));
2445
2446 if (!skb) { /* implies iolock unlocked */
2447 unix_state_lock(sk);
2448 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2449 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2450 (sk->sk_shutdown & RCV_SHUTDOWN))
2451 err = 0;
2452 unix_state_unlock(sk);
2453 goto out;
2454 }
2455
2456 if (wq_has_sleeper(&u->peer_wait))
2457 wake_up_interruptible_sync_poll(&u->peer_wait,
2458 EPOLLOUT | EPOLLWRNORM |
2459 EPOLLWRBAND);
2460
2461 if (msg->msg_name) {
2462 unix_copy_addr(msg, skb->sk);
2463
2464 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2465 msg->msg_name,
2466 &msg->msg_namelen);
2467 }
2468
2469 if (size > skb->len - skip)
2470 size = skb->len - skip;
2471 else if (size < skb->len - skip)
2472 msg->msg_flags |= MSG_TRUNC;
2473
2474 err = skb_copy_datagram_msg(skb, skip, msg, size);
2475 if (err)
2476 goto out_free;
2477
2478 if (sock_flag(sk, SOCK_RCVTSTAMP))
2479 __sock_recv_timestamp(msg, sk, skb);
2480
2481 memset(&scm, 0, sizeof(scm));
2482
2483 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2484 unix_set_secdata(&scm, skb);
2485
2486 if (!(flags & MSG_PEEK)) {
2487 if (UNIXCB(skb).fp)
2488 unix_detach_fds(&scm, skb);
2489
2490 sk_peek_offset_bwd(sk, skb->len);
2491 } else {
2492 		/* It is questionable: on PEEK we could:
2493 		 *  - do not return fds - good, but too simple 8)
2494 		 *  - return fds, and do not return them on read (old strategy,
2495 		 *    apparently wrong)
2496 		 *  - clone fds (I chose it for now, it is the most universal
2497 		 *    solution)
2498 		 *
2499 		 * POSIX 1003.1g does not actually define this clearly
2500 		 * at all. POSIX 1003.1g doesn't define a lot of things
2501 		 * clearly however!
2502 		 */
2504
2505 sk_peek_offset_fwd(sk, size);
2506
2507 if (UNIXCB(skb).fp)
2508 unix_peek_fds(&scm, skb);
2509 }
2510 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2511
2512 scm_recv_unix(sock, msg, &scm, flags);
2513
2514 out_free:
2515 skb_free_datagram(sk, skb);
2516 mutex_unlock(&u->iolock);
2517 out:
2518 return err;
2519 }
2520
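/* If a BPF/sockmap program has swapped in its own proto for this socket
 * (see unix_bpf_build_proto()), defer to that proto's recvmsg; otherwise
 * use the plain datagram path above.
 */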
2521 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2522 int flags)
2523 {
2524 struct sock *sk = sock->sk;
2525
2526 #ifdef CONFIG_BPF_SYSCALL
2527 const struct proto *prot = READ_ONCE(sk->sk_prot);
2528
2529 if (prot != &unix_dgram_proto)
2530 return prot->recvmsg(sk, msg, size, flags, NULL);
2531 #endif
2532 return __unix_dgram_recvmsg(sk, msg, size, flags);
2533 }
2534
2535 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2536 {
2537 struct unix_sock *u = unix_sk(sk);
2538 struct sk_buff *skb;
2539 int err;
2540
2541 mutex_lock(&u->iolock);
2542 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2543 mutex_unlock(&u->iolock);
2544 if (!skb)
2545 return err;
2546
2547 return recv_actor(sk, skb);
2548 }
2549
2550 /*
2551 * Sleep until more data has arrived. But check for races..
2552 */
2553 static long unix_stream_data_wait(struct sock *sk, long timeo,
2554 struct sk_buff *last, unsigned int last_len,
2555 bool freezable)
2556 {
2557 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2558 struct sk_buff *tail;
2559 DEFINE_WAIT(wait);
2560
2561 unix_state_lock(sk);
2562
2563 for (;;) {
2564 prepare_to_wait(sk_sleep(sk), &wait, state);
2565
2566 tail = skb_peek_tail(&sk->sk_receive_queue);
2567 if (tail != last ||
2568 (tail && tail->len != last_len) ||
2569 sk->sk_err ||
2570 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2571 signal_pending(current) ||
2572 !timeo)
2573 break;
2574
2575 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2576 unix_state_unlock(sk);
2577 timeo = schedule_timeout(timeo);
2578 unix_state_lock(sk);
2579
2580 if (sock_flag(sk, SOCK_DEAD))
2581 break;
2582
2583 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2584 }
2585
2586 finish_wait(sk_sleep(sk), &wait);
2587 unix_state_unlock(sk);
2588 return timeo;
2589 }
2590
2591 static unsigned int unix_skb_len(const struct sk_buff *skb)
2592 {
2593 return skb->len - UNIXCB(skb).consumed;
2594 }
2595
2596 struct unix_stream_read_state {
2597 int (*recv_actor)(struct sk_buff *, int, int,
2598 struct unix_stream_read_state *);
2599 struct socket *socket;
2600 struct msghdr *msg;
2601 struct pipe_inode_info *pipe;
2602 size_t size;
2603 int flags;
2604 unsigned int splice_flags;
2605 };
2606
2607 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2608 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2609 {
2610 struct socket *sock = state->socket;
2611 struct sock *sk = sock->sk;
2612 struct unix_sock *u = unix_sk(sk);
2613 int chunk = 1;
2614 struct sk_buff *oob_skb;
2615
2616 mutex_lock(&u->iolock);
2617 unix_state_lock(sk);
2618 spin_lock(&sk->sk_receive_queue.lock);
2619
2620 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2621 spin_unlock(&sk->sk_receive_queue.lock);
2622 unix_state_unlock(sk);
2623 mutex_unlock(&u->iolock);
2624 return -EINVAL;
2625 }
2626
2627 oob_skb = u->oob_skb;
2628
2629 if (!(state->flags & MSG_PEEK))
2630 WRITE_ONCE(u->oob_skb, NULL);
2631
2632 spin_unlock(&sk->sk_receive_queue.lock);
2633 unix_state_unlock(sk);
2634
2635 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2636
2637 if (!(state->flags & MSG_PEEK))
2638 UNIXCB(oob_skb).consumed += 1;
2639
2640 mutex_unlock(&u->iolock);
2641
2642 if (chunk < 0)
2643 return -EFAULT;
2644
2645 state->msg->msg_flags |= MSG_OOB;
2646 return 1;
2647 }
2648
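/* manage_oob() is handed the skb at the head of a stream read and decides,
 * roughly, how the pending OOB byte affects it: fully-consumed skbs in front
 * of the OOB byte are skipped (and freed unless peeking), the read stops
 * just before the OOB byte once some data has been copied, and unless
 * SOCK_URGINLINE is set the OOB skb itself is unlinked and dropped rather
 * than handed to the caller.
 */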
2649 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2650 int flags, int copied)
2651 {
2652 struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2653 struct unix_sock *u = unix_sk(sk);
2654
2655 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2656 return skb;
2657
2658 spin_lock(&sk->sk_receive_queue.lock);
2659
2660 if (!unix_skb_len(skb)) {
2661 if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2662 skb = NULL;
2663 } else if (flags & MSG_PEEK) {
2664 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2665 } else {
2666 read_skb = skb;
2667 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2668 __skb_unlink(read_skb, &sk->sk_receive_queue);
2669 }
2670
2671 if (!skb)
2672 goto unlock;
2673 }
2674
2675 if (skb != u->oob_skb)
2676 goto unlock;
2677
2678 if (copied) {
2679 skb = NULL;
2680 } else if (!(flags & MSG_PEEK)) {
2681 WRITE_ONCE(u->oob_skb, NULL);
2682
2683 if (!sock_flag(sk, SOCK_URGINLINE)) {
2684 __skb_unlink(skb, &sk->sk_receive_queue);
2685 unread_skb = skb;
2686 skb = skb_peek(&sk->sk_receive_queue);
2687 }
2688 } else if (!sock_flag(sk, SOCK_URGINLINE)) {
2689 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2690 }
2691
2692 unlock:
2693 spin_unlock(&sk->sk_receive_queue.lock);
2694
2695 consume_skb(read_skb);
2696 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2697
2698 return skb;
2699 }
2700 #endif
2701
2702 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2703 {
2704 struct unix_sock *u = unix_sk(sk);
2705 struct sk_buff *skb;
2706 int err;
2707
2708 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2709 return -ENOTCONN;
2710
2711 mutex_lock(&u->iolock);
2712 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2713 mutex_unlock(&u->iolock);
2714 if (!skb)
2715 return err;
2716
2717 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2718 if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2719 bool drop = false;
2720
2721 unix_state_lock(sk);
2722
2723 if (sock_flag(sk, SOCK_DEAD)) {
2724 unix_state_unlock(sk);
2725 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2726 return -ECONNRESET;
2727 }
2728
2729 spin_lock(&sk->sk_receive_queue.lock);
2730 if (likely(skb == u->oob_skb)) {
2731 WRITE_ONCE(u->oob_skb, NULL);
2732 drop = true;
2733 }
2734 spin_unlock(&sk->sk_receive_queue.lock);
2735
2736 unix_state_unlock(sk);
2737
2738 if (drop) {
2739 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2740 return -EAGAIN;
2741 }
2742 }
2743 #endif
2744
2745 return recv_actor(sk, skb);
2746 }
2747
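/* Common engine behind recvmsg() and splice_read() for stream/seqpacket
 * sockets: it walks the receive queue under u->iolock, feeds each chunk to
 * state->recv_actor(), stops gluing data together once the sending
 * credentials change, and returns fds/credentials through the scm cookie.
 * MSG_OOB is diverted to unix_stream_recv_urg() above.
 */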
2748 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2749 bool freezable)
2750 {
2751 struct scm_cookie scm;
2752 struct socket *sock = state->socket;
2753 struct sock *sk = sock->sk;
2754 struct unix_sock *u = unix_sk(sk);
2755 int copied = 0;
2756 int flags = state->flags;
2757 int noblock = flags & MSG_DONTWAIT;
2758 bool check_creds = false;
2759 int target;
2760 int err = 0;
2761 long timeo;
2762 int skip;
2763 size_t size = state->size;
2764 unsigned int last_len;
2765
2766 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2767 err = -EINVAL;
2768 goto out;
2769 }
2770
2771 if (unlikely(flags & MSG_OOB)) {
2772 err = -EOPNOTSUPP;
2773 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2774 err = unix_stream_recv_urg(state);
2775 #endif
2776 goto out;
2777 }
2778
2779 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2780 timeo = sock_rcvtimeo(sk, noblock);
2781
2782 memset(&scm, 0, sizeof(scm));
2783
2784 /* Lock the socket to prevent queue disordering
2785 	 * while we sleep in memcpy_tomsg
2786 */
2787 mutex_lock(&u->iolock);
2788
2789 skip = max(sk_peek_offset(sk, flags), 0);
2790
2791 do {
2792 struct sk_buff *skb, *last;
2793 int chunk;
2794
2795 redo:
2796 unix_state_lock(sk);
2797 if (sock_flag(sk, SOCK_DEAD)) {
2798 err = -ECONNRESET;
2799 goto unlock;
2800 }
2801 last = skb = skb_peek(&sk->sk_receive_queue);
2802 last_len = last ? last->len : 0;
2803
2804 again:
2805 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2806 if (skb) {
2807 skb = manage_oob(skb, sk, flags, copied);
2808 if (!skb && copied) {
2809 unix_state_unlock(sk);
2810 break;
2811 }
2812 }
2813 #endif
2814 if (skb == NULL) {
2815 if (copied >= target)
2816 goto unlock;
2817
2818 /*
2819 * POSIX 1003.1g mandates this order.
2820 */
2821
2822 err = sock_error(sk);
2823 if (err)
2824 goto unlock;
2825 if (sk->sk_shutdown & RCV_SHUTDOWN)
2826 goto unlock;
2827
2828 unix_state_unlock(sk);
2829 if (!timeo) {
2830 err = -EAGAIN;
2831 break;
2832 }
2833
2834 mutex_unlock(&u->iolock);
2835
2836 timeo = unix_stream_data_wait(sk, timeo, last,
2837 last_len, freezable);
2838
2839 if (signal_pending(current)) {
2840 err = sock_intr_errno(timeo);
2841 scm_destroy(&scm);
2842 goto out;
2843 }
2844
2845 mutex_lock(&u->iolock);
2846 goto redo;
2847 unlock:
2848 unix_state_unlock(sk);
2849 break;
2850 }
2851
2852 while (skip >= unix_skb_len(skb)) {
2853 skip -= unix_skb_len(skb);
2854 last = skb;
2855 last_len = skb->len;
2856 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2857 if (!skb)
2858 goto again;
2859 }
2860
2861 unix_state_unlock(sk);
2862
2863 if (check_creds) {
2864 /* Never glue messages from different writers */
2865 if (!unix_skb_scm_eq(skb, &scm))
2866 break;
2867 } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2868 test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2869 /* Copy credentials */
2870 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2871 unix_set_secdata(&scm, skb);
2872 check_creds = true;
2873 }
2874
2875 /* Copy address just once */
2876 if (state->msg && state->msg->msg_name) {
2877 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2878 state->msg->msg_name);
2879 unix_copy_addr(state->msg, skb->sk);
2880
2881 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2882 state->msg->msg_name,
2883 &state->msg->msg_namelen);
2884
2885 sunaddr = NULL;
2886 }
2887
2888 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2889 chunk = state->recv_actor(skb, skip, chunk, state);
2890 if (chunk < 0) {
2891 if (copied == 0)
2892 copied = -EFAULT;
2893 break;
2894 }
2895 copied += chunk;
2896 size -= chunk;
2897
2898 /* Mark read part of skb as used */
2899 if (!(flags & MSG_PEEK)) {
2900 UNIXCB(skb).consumed += chunk;
2901
2902 sk_peek_offset_bwd(sk, chunk);
2903
2904 if (UNIXCB(skb).fp) {
2905 scm_stat_del(sk, skb);
2906 unix_detach_fds(&scm, skb);
2907 }
2908
2909 if (unix_skb_len(skb))
2910 break;
2911
2912 skb_unlink(skb, &sk->sk_receive_queue);
2913 consume_skb(skb);
2914
2915 if (scm.fp)
2916 break;
2917 } else {
2918 /* It is questionable, see note in unix_dgram_recvmsg.
2919 */
2920 if (UNIXCB(skb).fp)
2921 unix_peek_fds(&scm, skb);
2922
2923 sk_peek_offset_fwd(sk, chunk);
2924
2925 if (UNIXCB(skb).fp)
2926 break;
2927
2928 skip = 0;
2929 last = skb;
2930 last_len = skb->len;
2931 unix_state_lock(sk);
2932 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2933 if (skb)
2934 goto again;
2935 unix_state_unlock(sk);
2936 break;
2937 }
2938 } while (size);
2939
2940 mutex_unlock(&u->iolock);
2941 if (state->msg)
2942 scm_recv_unix(sock, state->msg, &scm, flags);
2943 else
2944 scm_destroy(&scm);
2945 out:
2946 return copied ? : err;
2947 }
2948
2949 static int unix_stream_read_actor(struct sk_buff *skb,
2950 int skip, int chunk,
2951 struct unix_stream_read_state *state)
2952 {
2953 int ret;
2954
2955 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2956 state->msg, chunk);
2957 return ret ?: chunk;
2958 }
2959
2960 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2961 size_t size, int flags)
2962 {
2963 struct unix_stream_read_state state = {
2964 .recv_actor = unix_stream_read_actor,
2965 .socket = sk->sk_socket,
2966 .msg = msg,
2967 .size = size,
2968 .flags = flags
2969 };
2970
2971 return unix_stream_read_generic(&state, true);
2972 }
2973
2974 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2975 size_t size, int flags)
2976 {
2977 struct unix_stream_read_state state = {
2978 .recv_actor = unix_stream_read_actor,
2979 .socket = sock,
2980 .msg = msg,
2981 .size = size,
2982 .flags = flags
2983 };
2984
2985 #ifdef CONFIG_BPF_SYSCALL
2986 struct sock *sk = sock->sk;
2987 const struct proto *prot = READ_ONCE(sk->sk_prot);
2988
2989 if (prot != &unix_stream_proto)
2990 return prot->recvmsg(sk, msg, size, flags, NULL);
2991 #endif
2992 return unix_stream_read_generic(&state, true);
2993 }
2994
2995 static int unix_stream_splice_actor(struct sk_buff *skb,
2996 int skip, int chunk,
2997 struct unix_stream_read_state *state)
2998 {
2999 return skb_splice_bits(skb, state->socket->sk,
3000 UNIXCB(skb).consumed + skip,
3001 state->pipe, chunk, state->splice_flags);
3002 }
3003
3004 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
3005 struct pipe_inode_info *pipe,
3006 size_t size, unsigned int flags)
3007 {
3008 struct unix_stream_read_state state = {
3009 .recv_actor = unix_stream_splice_actor,
3010 .socket = sock,
3011 .pipe = pipe,
3012 .size = size,
3013 .splice_flags = flags,
3014 };
3015
3016 if (unlikely(*ppos))
3017 return -ESPIPE;
3018
3019 if (sock->file->f_flags & O_NONBLOCK ||
3020 flags & SPLICE_F_NONBLOCK)
3021 state.flags = MSG_DONTWAIT;
3022
3023 return unix_stream_read_generic(&state, false);
3024 }
3025
3026 static int unix_shutdown(struct socket *sock, int mode)
3027 {
3028 struct sock *sk = sock->sk;
3029 struct sock *other;
3030
3031 if (mode < SHUT_RD || mode > SHUT_RDWR)
3032 return -EINVAL;
3033 /* This maps:
3034 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
3035 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
3036 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3037 */
3038 ++mode;
3039
3040 unix_state_lock(sk);
3041 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3042 other = unix_peer(sk);
3043 if (other)
3044 sock_hold(other);
3045 unix_state_unlock(sk);
3046 sk->sk_state_change(sk);
3047
3048 if (other &&
3049 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3050
3051 int peer_mode = 0;
3052 const struct proto *prot = READ_ONCE(other->sk_prot);
3053
3054 if (prot->unhash)
3055 prot->unhash(other);
3056 if (mode&RCV_SHUTDOWN)
3057 peer_mode |= SEND_SHUTDOWN;
3058 if (mode&SEND_SHUTDOWN)
3059 peer_mode |= RCV_SHUTDOWN;
3060 unix_state_lock(other);
3061 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3062 unix_state_unlock(other);
3063 other->sk_state_change(other);
3064 if (peer_mode == SHUTDOWN_MASK)
3065 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3066 else if (peer_mode & RCV_SHUTDOWN)
3067 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3068 }
3069 if (other)
3070 sock_put(other);
3071
3072 return 0;
3073 }
3074
3075 long unix_inq_len(struct sock *sk)
3076 {
3077 struct sk_buff *skb;
3078 long amount = 0;
3079
3080 if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3081 return -EINVAL;
3082
3083 spin_lock(&sk->sk_receive_queue.lock);
3084 if (sk->sk_type == SOCK_STREAM ||
3085 sk->sk_type == SOCK_SEQPACKET) {
3086 skb_queue_walk(&sk->sk_receive_queue, skb)
3087 amount += unix_skb_len(skb);
3088 } else {
3089 skb = skb_peek(&sk->sk_receive_queue);
3090 if (skb)
3091 amount = skb->len;
3092 }
3093 spin_unlock(&sk->sk_receive_queue.lock);
3094
3095 return amount;
3096 }
3097 EXPORT_SYMBOL_GPL(unix_inq_len);
3098
3099 long unix_outq_len(struct sock *sk)
3100 {
3101 return sk_wmem_alloc_get(sk);
3102 }
3103 EXPORT_SYMBOL_GPL(unix_outq_len);
3104
3105 static int unix_open_file(struct sock *sk)
3106 {
3107 struct path path;
3108 struct file *f;
3109 int fd;
3110
3111 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3112 return -EPERM;
3113
3114 if (!smp_load_acquire(&unix_sk(sk)->addr))
3115 return -ENOENT;
3116
3117 path = unix_sk(sk)->path;
3118 if (!path.dentry)
3119 return -ENOENT;
3120
3121 path_get(&path);
3122
3123 fd = get_unused_fd_flags(O_CLOEXEC);
3124 if (fd < 0)
3125 goto out;
3126
3127 f = dentry_open(&path, O_PATH, current_cred());
3128 if (IS_ERR(f)) {
3129 put_unused_fd(fd);
3130 fd = PTR_ERR(f);
3131 goto out;
3132 }
3133
3134 fd_install(fd, f);
3135 out:
3136 path_put(&path);
3137
3138 return fd;
3139 }
3140
3141 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3142 {
3143 struct sock *sk = sock->sk;
3144 long amount = 0;
3145 int err;
3146
3147 switch (cmd) {
3148 case SIOCOUTQ:
3149 amount = unix_outq_len(sk);
3150 err = put_user(amount, (int __user *)arg);
3151 break;
3152 case SIOCINQ:
3153 amount = unix_inq_len(sk);
3154 if (amount < 0)
3155 err = amount;
3156 else
3157 err = put_user(amount, (int __user *)arg);
3158 break;
3159 case SIOCUNIXFILE:
3160 err = unix_open_file(sk);
3161 break;
3162 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3163 case SIOCATMARK:
3164 {
3165 struct unix_sock *u = unix_sk(sk);
3166 struct sk_buff *skb;
3167 int answ = 0;
3168
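		/* Roughly: answer 1 when the skb at the head of the queue is
		 * the pending OOB skb, or when the head is fully consumed and
		 * the OOB skb (if any) is the next one in line.
		 */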
3169 mutex_lock(&u->iolock);
3170
3171 skb = skb_peek(&sk->sk_receive_queue);
3172 if (skb) {
3173 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3174 struct sk_buff *next_skb;
3175
3176 next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3177
3178 if (skb == oob_skb ||
3179 (!unix_skb_len(skb) &&
3180 (!oob_skb || next_skb == oob_skb)))
3181 answ = 1;
3182 }
3183
3184 mutex_unlock(&u->iolock);
3185
3186 err = put_user(answ, (int __user *)arg);
3187 }
3188 break;
3189 #endif
3190 default:
3191 err = -ENOIOCTLCMD;
3192 break;
3193 }
3194 return err;
3195 }
3196
3197 #ifdef CONFIG_COMPAT
3198 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3199 {
3200 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3201 }
3202 #endif
3203
3204 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3205 {
3206 struct sock *sk = sock->sk;
3207 unsigned char state;
3208 __poll_t mask;
3209 u8 shutdown;
3210
3211 sock_poll_wait(file, sock, wait);
3212 mask = 0;
3213 shutdown = READ_ONCE(sk->sk_shutdown);
3214 state = READ_ONCE(sk->sk_state);
3215
3216 /* exceptional events? */
3217 if (READ_ONCE(sk->sk_err))
3218 mask |= EPOLLERR;
3219 if (shutdown == SHUTDOWN_MASK)
3220 mask |= EPOLLHUP;
3221 if (shutdown & RCV_SHUTDOWN)
3222 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3223
3224 /* readable? */
3225 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3226 mask |= EPOLLIN | EPOLLRDNORM;
3227 if (sk_is_readable(sk))
3228 mask |= EPOLLIN | EPOLLRDNORM;
3229 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3230 if (READ_ONCE(unix_sk(sk)->oob_skb))
3231 mask |= EPOLLPRI;
3232 #endif
3233
3234 /* Connection-based need to check for termination and startup */
3235 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3236 state == TCP_CLOSE)
3237 mask |= EPOLLHUP;
3238
3239 /*
3240 	 * We also set writable when the other side has shut down the
3241 	 * connection. This prevents stuck sockets.
3242 */
3243 if (unix_writable(sk, state))
3244 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3245
3246 return mask;
3247 }
3248
3249 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3250 poll_table *wait)
3251 {
3252 struct sock *sk = sock->sk, *other;
3253 unsigned int writable;
3254 unsigned char state;
3255 __poll_t mask;
3256 u8 shutdown;
3257
3258 sock_poll_wait(file, sock, wait);
3259 mask = 0;
3260 shutdown = READ_ONCE(sk->sk_shutdown);
3261 state = READ_ONCE(sk->sk_state);
3262
3263 /* exceptional events? */
3264 if (READ_ONCE(sk->sk_err) ||
3265 !skb_queue_empty_lockless(&sk->sk_error_queue))
3266 mask |= EPOLLERR |
3267 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3268
3269 if (shutdown & RCV_SHUTDOWN)
3270 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3271 if (shutdown == SHUTDOWN_MASK)
3272 mask |= EPOLLHUP;
3273
3274 /* readable? */
3275 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3276 mask |= EPOLLIN | EPOLLRDNORM;
3277 if (sk_is_readable(sk))
3278 mask |= EPOLLIN | EPOLLRDNORM;
3279
3280 /* Connection-based need to check for termination and startup */
3281 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3282 mask |= EPOLLHUP;
3283
3284 /* No write status requested, avoid expensive OUT tests. */
3285 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3286 return mask;
3287
3288 writable = unix_writable(sk, state);
3289 if (writable) {
3290 unix_state_lock(sk);
3291
3292 other = unix_peer(sk);
3293 if (other && unix_peer(other) != sk &&
3294 unix_recvq_full_lockless(other) &&
3295 unix_dgram_peer_wake_me(sk, other))
3296 writable = 0;
3297
3298 unix_state_unlock(sk);
3299 }
3300
3301 if (writable)
3302 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3303 else
3304 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3305
3306 return mask;
3307 }
3308
3309 #ifdef CONFIG_PROC_FS
3310
3311 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3312
3313 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3314 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3315 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
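/* The seq_file position packs a hash-bucket index into the high bits and a
 * 1-based offset within that bucket into the low bits; for example,
 * set_bucket_offset(2, 1) addresses the first socket of bucket 2, and
 * get_bucket()/get_offset() split the value back apart.
 */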
3316
3317 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3318 {
3319 unsigned long offset = get_offset(*pos);
3320 unsigned long bucket = get_bucket(*pos);
3321 unsigned long count = 0;
3322 struct sock *sk;
3323
3324 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3325 sk; sk = sk_next(sk)) {
3326 if (++count == offset)
3327 break;
3328 }
3329
3330 return sk;
3331 }
3332
3333 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3334 {
3335 unsigned long bucket = get_bucket(*pos);
3336 struct net *net = seq_file_net(seq);
3337 struct sock *sk;
3338
3339 while (bucket < UNIX_HASH_SIZE) {
3340 spin_lock(&net->unx.table.locks[bucket]);
3341
3342 sk = unix_from_bucket(seq, pos);
3343 if (sk)
3344 return sk;
3345
3346 spin_unlock(&net->unx.table.locks[bucket]);
3347
3348 *pos = set_bucket_offset(++bucket, 1);
3349 }
3350
3351 return NULL;
3352 }
3353
3354 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3355 loff_t *pos)
3356 {
3357 unsigned long bucket = get_bucket(*pos);
3358
3359 sk = sk_next(sk);
3360 if (sk)
3361 return sk;
3362
3363
3364 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3365
3366 *pos = set_bucket_offset(++bucket, 1);
3367
3368 return unix_get_first(seq, pos);
3369 }
3370
3371 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3372 {
3373 if (!*pos)
3374 return SEQ_START_TOKEN;
3375
3376 return unix_get_first(seq, pos);
3377 }
3378
3379 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3380 {
3381 ++*pos;
3382
3383 if (v == SEQ_START_TOKEN)
3384 return unix_get_first(seq, pos);
3385
3386 return unix_get_next(seq, v, pos);
3387 }
3388
3389 static void unix_seq_stop(struct seq_file *seq, void *v)
3390 {
3391 struct sock *sk = v;
3392
3393 if (sk)
3394 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3395 }
3396
3397 static int unix_seq_show(struct seq_file *seq, void *v)
3398 {
3399
3400 if (v == SEQ_START_TOKEN)
3401 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3402 "Inode Path\n");
3403 else {
3404 struct sock *s = v;
3405 struct unix_sock *u = unix_sk(s);
3406 unix_state_lock(s);
3407
3408 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3409 s,
3410 refcount_read(&s->sk_refcnt),
3411 0,
3412 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3413 s->sk_type,
3414 s->sk_socket ?
3415 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3416 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3417 sock_i_ino(s));
3418
3419 		if (u->addr) {	/* under a hash table lock here */
3420 int i, len;
3421 seq_putc(seq, ' ');
3422
3423 i = 0;
3424 len = u->addr->len -
3425 offsetof(struct sockaddr_un, sun_path);
3426 if (u->addr->name->sun_path[0]) {
3427 len--;
3428 } else {
3429 seq_putc(seq, '@');
3430 i++;
3431 }
3432 for ( ; i < len; i++)
3433 seq_putc(seq, u->addr->name->sun_path[i] ?:
3434 '@');
3435 }
3436 unix_state_unlock(s);
3437 seq_putc(seq, '\n');
3438 }
3439
3440 return 0;
3441 }
3442
3443 static const struct seq_operations unix_seq_ops = {
3444 .start = unix_seq_start,
3445 .next = unix_seq_next,
3446 .stop = unix_seq_stop,
3447 .show = unix_seq_show,
3448 };
3449
3450 #ifdef CONFIG_BPF_SYSCALL
3451 struct bpf_unix_iter_state {
3452 struct seq_net_private p;
3453 unsigned int cur_sk;
3454 unsigned int end_sk;
3455 unsigned int max_sk;
3456 struct sock **batch;
3457 bool st_bucket_done;
3458 };
3459
3460 struct bpf_iter__unix {
3461 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3462 __bpf_md_ptr(struct unix_sock *, unix_sk);
3463 uid_t uid __aligned(8);
3464 };
3465
3466 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3467 struct unix_sock *unix_sk, uid_t uid)
3468 {
3469 struct bpf_iter__unix ctx;
3470
3471 meta->seq_num--; /* skip SEQ_START_TOKEN */
3472 ctx.meta = meta;
3473 ctx.unix_sk = unix_sk;
3474 ctx.uid = uid;
3475 return bpf_iter_run_prog(prog, &ctx);
3476 }
3477
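/* Pin a batch of sockets from one hash bucket: every socket from @start_sk
 * to the end of the bucket is counted, up to iter->max_sk of them are
 * ref-held into iter->batch, and the bucket lock is dropped before the
 * expected count is returned so the caller can grow the batch and retry if
 * the bucket did not fit.
 */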
3478 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3479
3480 {
3481 struct bpf_unix_iter_state *iter = seq->private;
3482 unsigned int expected = 1;
3483 struct sock *sk;
3484
3485 sock_hold(start_sk);
3486 iter->batch[iter->end_sk++] = start_sk;
3487
3488 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3489 if (iter->end_sk < iter->max_sk) {
3490 sock_hold(sk);
3491 iter->batch[iter->end_sk++] = sk;
3492 }
3493
3494 expected++;
3495 }
3496
3497 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3498
3499 return expected;
3500 }
3501
3502 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3503 {
3504 while (iter->cur_sk < iter->end_sk)
3505 sock_put(iter->batch[iter->cur_sk++]);
3506 }
3507
3508 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3509 unsigned int new_batch_sz)
3510 {
3511 struct sock **new_batch;
3512
3513 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3514 GFP_USER | __GFP_NOWARN);
3515 if (!new_batch)
3516 return -ENOMEM;
3517
3518 bpf_iter_unix_put_batch(iter);
3519 kvfree(iter->batch);
3520 iter->batch = new_batch;
3521 iter->max_sk = new_batch_sz;
3522
3523 return 0;
3524 }
3525
3526 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3527 loff_t *pos)
3528 {
3529 struct bpf_unix_iter_state *iter = seq->private;
3530 unsigned int expected;
3531 bool resized = false;
3532 struct sock *sk;
3533
3534 if (iter->st_bucket_done)
3535 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3536
3537 again:
3538 /* Get a new batch */
3539 iter->cur_sk = 0;
3540 iter->end_sk = 0;
3541
3542 sk = unix_get_first(seq, pos);
3543 if (!sk)
3544 return NULL; /* Done */
3545
3546 expected = bpf_iter_unix_hold_batch(seq, sk);
3547
3548 if (iter->end_sk == expected) {
3549 iter->st_bucket_done = true;
3550 return sk;
3551 }
3552
3553 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3554 resized = true;
3555 goto again;
3556 }
3557
3558 return sk;
3559 }
3560
3561 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3562 {
3563 if (!*pos)
3564 return SEQ_START_TOKEN;
3565
3566 /* bpf iter does not support lseek, so it always
3567 	 * continues from where it was stop()-ped.
3568 */
3569 return bpf_iter_unix_batch(seq, pos);
3570 }
3571
3572 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3573 {
3574 struct bpf_unix_iter_state *iter = seq->private;
3575 struct sock *sk;
3576
3577 /* Whenever seq_next() is called, the iter->cur_sk is
3578 * done with seq_show(), so advance to the next sk in
3579 * the batch.
3580 */
3581 if (iter->cur_sk < iter->end_sk)
3582 sock_put(iter->batch[iter->cur_sk++]);
3583
3584 ++*pos;
3585
3586 if (iter->cur_sk < iter->end_sk)
3587 sk = iter->batch[iter->cur_sk];
3588 else
3589 sk = bpf_iter_unix_batch(seq, pos);
3590
3591 return sk;
3592 }
3593
3594 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3595 {
3596 struct bpf_iter_meta meta;
3597 struct bpf_prog *prog;
3598 struct sock *sk = v;
3599 uid_t uid;
3600 bool slow;
3601 int ret;
3602
3603 if (v == SEQ_START_TOKEN)
3604 return 0;
3605
3606 slow = lock_sock_fast(sk);
3607
3608 if (unlikely(sk_unhashed(sk))) {
3609 ret = SEQ_SKIP;
3610 goto unlock;
3611 }
3612
3613 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3614 meta.seq = seq;
3615 prog = bpf_iter_get_info(&meta, false);
3616 ret = unix_prog_seq_show(prog, &meta, v, uid);
3617 unlock:
3618 unlock_sock_fast(sk, slow);
3619 return ret;
3620 }
3621
3622 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3623 {
3624 struct bpf_unix_iter_state *iter = seq->private;
3625 struct bpf_iter_meta meta;
3626 struct bpf_prog *prog;
3627
3628 if (!v) {
3629 meta.seq = seq;
3630 prog = bpf_iter_get_info(&meta, true);
3631 if (prog)
3632 (void)unix_prog_seq_show(prog, &meta, v, 0);
3633 }
3634
3635 if (iter->cur_sk < iter->end_sk)
3636 bpf_iter_unix_put_batch(iter);
3637 }
3638
3639 static const struct seq_operations bpf_iter_unix_seq_ops = {
3640 .start = bpf_iter_unix_seq_start,
3641 .next = bpf_iter_unix_seq_next,
3642 .stop = bpf_iter_unix_seq_stop,
3643 .show = bpf_iter_unix_seq_show,
3644 };
3645 #endif
3646 #endif
3647
3648 static const struct net_proto_family unix_family_ops = {
3649 .family = PF_UNIX,
3650 .create = unix_create,
3651 .owner = THIS_MODULE,
3652 };
3653
3654
3655 static int __net_init unix_net_init(struct net *net)
3656 {
3657 int i;
3658
3659 net->unx.sysctl_max_dgram_qlen = 10;
3660 if (unix_sysctl_register(net))
3661 goto out;
3662
3663 #ifdef CONFIG_PROC_FS
3664 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3665 sizeof(struct seq_net_private)))
3666 goto err_sysctl;
3667 #endif
3668
3669 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3670 sizeof(spinlock_t), GFP_KERNEL);
3671 if (!net->unx.table.locks)
3672 goto err_proc;
3673
3674 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3675 sizeof(struct hlist_head),
3676 GFP_KERNEL);
3677 if (!net->unx.table.buckets)
3678 goto free_locks;
3679
3680 for (i = 0; i < UNIX_HASH_SIZE; i++) {
3681 spin_lock_init(&net->unx.table.locks[i]);
3682 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3683 INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3684 }
3685
3686 return 0;
3687
3688 free_locks:
3689 kvfree(net->unx.table.locks);
3690 err_proc:
3691 #ifdef CONFIG_PROC_FS
3692 remove_proc_entry("unix", net->proc_net);
3693 err_sysctl:
3694 #endif
3695 unix_sysctl_unregister(net);
3696 out:
3697 return -ENOMEM;
3698 }
3699
3700 static void __net_exit unix_net_exit(struct net *net)
3701 {
3702 kvfree(net->unx.table.buckets);
3703 kvfree(net->unx.table.locks);
3704 unix_sysctl_unregister(net);
3705 remove_proc_entry("unix", net->proc_net);
3706 }
3707
3708 static struct pernet_operations unix_net_ops = {
3709 .init = unix_net_init,
3710 .exit = unix_net_exit,
3711 };
3712
3713 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3714 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3715 struct unix_sock *unix_sk, uid_t uid)
3716
3717 #define INIT_BATCH_SZ 16
3718
3719 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3720 {
3721 struct bpf_unix_iter_state *iter = priv_data;
3722 int err;
3723
3724 err = bpf_iter_init_seq_net(priv_data, aux);
3725 if (err)
3726 return err;
3727
3728 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3729 if (err) {
3730 bpf_iter_fini_seq_net(priv_data);
3731 return err;
3732 }
3733
3734 return 0;
3735 }
3736
3737 static void bpf_iter_fini_unix(void *priv_data)
3738 {
3739 struct bpf_unix_iter_state *iter = priv_data;
3740
3741 bpf_iter_fini_seq_net(priv_data);
3742 kvfree(iter->batch);
3743 }
3744
3745 static const struct bpf_iter_seq_info unix_seq_info = {
3746 .seq_ops = &bpf_iter_unix_seq_ops,
3747 .init_seq_private = bpf_iter_init_unix,
3748 .fini_seq_private = bpf_iter_fini_unix,
3749 .seq_priv_size = sizeof(struct bpf_unix_iter_state),
3750 };
3751
3752 static const struct bpf_func_proto *
3753 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3754 const struct bpf_prog *prog)
3755 {
3756 switch (func_id) {
3757 case BPF_FUNC_setsockopt:
3758 return &bpf_sk_setsockopt_proto;
3759 case BPF_FUNC_getsockopt:
3760 return &bpf_sk_getsockopt_proto;
3761 default:
3762 return NULL;
3763 }
3764 }
3765
3766 static struct bpf_iter_reg unix_reg_info = {
3767 .target = "unix",
3768 .ctx_arg_info_size = 1,
3769 .ctx_arg_info = {
3770 { offsetof(struct bpf_iter__unix, unix_sk),
3771 PTR_TO_BTF_ID_OR_NULL },
3772 },
3773 .get_func_proto = bpf_iter_unix_get_func_proto,
3774 .seq_info = &unix_seq_info,
3775 };
3776
3777 static void __init bpf_iter_register(void)
3778 {
3779 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3780 if (bpf_iter_reg_target(&unix_reg_info))
3781 pr_warn("Warning: could not register bpf iterator unix\n");
3782 }
3783 #endif
3784
3785 static int __init af_unix_init(void)
3786 {
3787 int i, rc = -1;
3788
3789 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3790
3791 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3792 spin_lock_init(&bsd_socket_locks[i]);
3793 INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3794 }
3795
3796 rc = proto_register(&unix_dgram_proto, 1);
3797 if (rc != 0) {
3798 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3799 goto out;
3800 }
3801
3802 rc = proto_register(&unix_stream_proto, 1);
3803 if (rc != 0) {
3804 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3805 proto_unregister(&unix_dgram_proto);
3806 goto out;
3807 }
3808
3809 sock_register(&unix_family_ops);
3810 register_pernet_subsys(&unix_net_ops);
3811 unix_bpf_build_proto();
3812
3813 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3814 bpf_iter_register();
3815 #endif
3816
3817 out:
3818 return rc;
3819 }
3820
3821 /* Later than subsys_initcall() because we depend on stuff initialised there */
3822 fs_initcall(af_unix_init);
3823